]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/tcp_input.c
xnu-2782.10.72.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_input.c
CommitLineData
1c79356b 1/*
8a3053a0 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
9bccf70c 61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
1c79356b 62 */
2d21ac55
A
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
1c79356b
A
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/kernel.h>
73#include <sys/sysctl.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/proc.h> /* for proc0 declaration */
77#include <sys/protosw.h>
78#include <sys/socket.h>
79#include <sys/socketvar.h>
80#include <sys/syslog.h>
316670eb 81#include <sys/mcache.h>
39236c6e 82#include <sys/kasl.h>
1c79356b
A
83#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
84
b0d623f7
A
85#include <machine/endian.h>
86
1c79356b 87#include <net/if.h>
d12e1678 88#include <net/if_types.h>
1c79356b 89#include <net/route.h>
6d2010ae 90#include <net/ntstat.h>
39236c6e 91#include <net/dlil.h>
1c79356b
A
92
93#include <netinet/in.h>
94#include <netinet/in_systm.h>
95#include <netinet/ip.h>
9bccf70c 96#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
1c79356b 97#include <netinet/in_var.h>
9bccf70c 98#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
1c79356b 99#include <netinet/in_pcb.h>
9bccf70c 100#include <netinet/ip_var.h>
6d2010ae 101#include <mach/sdt.h>
1c79356b
A
102#if INET6
103#include <netinet/ip6.h>
104#include <netinet/icmp6.h>
105#include <netinet6/nd6.h>
106#include <netinet6/ip6_var.h>
107#include <netinet6/in6_pcb.h>
108#endif
109#include <netinet/tcp.h>
110#include <netinet/tcp_fsm.h>
111#include <netinet/tcp_seq.h>
112#include <netinet/tcp_timer.h>
113#include <netinet/tcp_var.h>
6d2010ae 114#include <netinet/tcp_cc.h>
39236c6e 115#include <dev/random/randomdev.h>
6d2010ae 116#include <kern/zalloc.h>
9bccf70c
A
117#if INET6
118#include <netinet6/tcp6_var.h>
119#endif
1c79356b
A
120#include <netinet/tcpip.h>
121#if TCPDEBUG
122#include <netinet/tcp_debug.h>
9bccf70c 123u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
1c79356b
A
124struct tcphdr tcp_savetcp;
125#endif /* TCPDEBUG */
126
127#if IPSEC
128#include <netinet6/ipsec.h>
9bccf70c
A
129#if INET6
130#include <netinet6/ipsec6.h>
131#endif
1c79356b
A
132#include <netkey/key.h>
133#endif /*IPSEC*/
134
2d21ac55
A
135#if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
136#include <security/mac_framework.h>
137#endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
138
1c79356b 139#include <sys/kdebug.h>
316670eb 140#include <netinet/lro_ext.h>
39236c6e
A
141#if MPTCP
142#include <netinet/mptcp_var.h>
143#include <netinet/mptcp.h>
144#include <netinet/mptcp_opt.h>
145#endif /* MPTCP */
1c79356b
A
146
147#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
148#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
149#define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
150#define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
151
1c79356b
A
152tcp_cc tcp_ccgen;
153
154struct tcpstat tcpstat;
1c79356b 155
9bccf70c 156static int log_in_vain = 0;
6d2010ae 157SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED,
9bccf70c
A
158 &log_in_vain, 0, "Log all incoming TCP connections");
159
160static int blackhole = 0;
6d2010ae 161SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED,
9bccf70c 162 &blackhole, 0, "Do not send RST when dropping refused connections");
1c79356b 163
743b1565 164int tcp_delack_enabled = 3;
6d2010ae 165SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED,
9bccf70c
A
166 &tcp_delack_enabled, 0,
167 "Delay ACK to try and piggyback it onto a data packet");
168
169int tcp_lq_overflow = 1;
6d2010ae 170SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED,
9bccf70c
A
171 &tcp_lq_overflow, 0,
172 "Listen Queue Overflow");
173
6d2010ae
A
174int tcp_recv_bg = 0;
175SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
176 &tcp_recv_bg, 0,
177 "Receive background");
178
9bccf70c 179#if TCP_DROP_SYNFIN
55e303ae 180static int drop_synfin = 1;
6d2010ae 181SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED,
9bccf70c
A
182 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
183#endif
1c79356b 184
2d21ac55 185SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
e5568f75
A
186 "TCP Segment Reassembly Queue");
187
e5568f75 188static int tcp_reass_overflows = 0;
6d2010ae 189SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED,
e5568f75
A
190 &tcp_reass_overflows, 0,
191 "Global number of TCP Segment Reassembly Queue Overflows");
192
193
d12e1678 194__private_extern__ int slowlink_wsize = 8192;
6d2010ae 195SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
d12e1678
A
196 &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
197
6d2010ae
A
198int maxseg_unacked = 8;
199SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED,
2d21ac55
A
200 &maxseg_unacked, 0, "Maximum number of outstanding segments left unacked");
201
6d2010ae
A
202int tcp_do_rfc3465 = 1;
203SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
2d21ac55 204 &tcp_do_rfc3465, 0, "");
b0d623f7 205
6d2010ae
A
206int tcp_do_rfc3465_lim2 = 1;
207SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED,
b0d623f7
A
208 &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS");
209
6d2010ae
A
210int rtt_samples_per_slot = 20;
211SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED,
212 &rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history");
213
214int tcp_allowed_iaj = ALLOWED_IAJ;
215SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED,
216 &tcp_allowed_iaj, 0, "Allowed inter-packet arrival jiter");
217
218int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
219SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
220 &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ");
221
316670eb
A
222u_int32_t tcp_do_autorcvbuf = 1;
223SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf, CTLFLAG_RW | CTLFLAG_LOCKED,
224 &tcp_do_autorcvbuf, 0, "Enable automatic socket buffer tuning");
225
226u_int32_t tcp_autorcvbuf_inc_shift = 3;
227SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift, CTLFLAG_RW | CTLFLAG_LOCKED,
228 &tcp_autorcvbuf_inc_shift, 0, "Shift for increment in receive socket buffer size");
229
230u_int32_t tcp_autorcvbuf_max = 512 * 1024;
231SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED,
232 &tcp_autorcvbuf_max, 0, "Maximum receive socket buffer size");
233
39236c6e 234int sw_lro = 0;
316670eb
A
235SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
236 &sw_lro, 0, "Used to coalesce TCP packets");
237
238int lrodebug = 0;
239SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg, CTLFLAG_RW | CTLFLAG_LOCKED,
240 &lrodebug, 0, "Used to debug SW LRO");
241
39236c6e 242int lro_start = 4;
316670eb
A
243SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
244 &lro_start, 0, "Segments for starting LRO computed as power of 2");
245
246extern int tcp_do_autosendbuf;
247
39236c6e
A
248int limited_txmt = 1;
249SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit, CTLFLAG_RW | CTLFLAG_LOCKED,
250 &limited_txmt, 0, "Enable limited transmit");
251
252int early_rexmt = 1;
253SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt, CTLFLAG_RW | CTLFLAG_LOCKED,
254 &early_rexmt, 0, "Enable Early Retransmit");
255
256int sack_ackadv = 1;
257SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv, CTLFLAG_RW | CTLFLAG_LOCKED,
258 &sack_ackadv, 0, "Use SACK with cumulative ack advancement as a dupack");
259
b0d623f7
A
260#if CONFIG_IFEF_NOWINDOWSCALE
261int tcp_obey_ifef_nowindowscale = 0;
6d2010ae 262SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED,
b0d623f7
A
263 &tcp_obey_ifef_nowindowscale, 0, "");
264#endif
39236c6e 265
2d21ac55 266extern int tcp_TCPTV_MIN;
6d2010ae
A
267extern int tcp_acc_iaj_high;
268extern int tcp_acc_iaj_react_limit;
269extern struct zone *tcp_reass_zone;
270
39236c6e 271int tcprexmtthresh = 3;
d12e1678 272
b0d623f7 273u_int32_t tcp_now;
6d2010ae
A
274struct timeval tcp_uptime; /* uptime when tcp_now was last updated */
275lck_spin_t *tcp_uptime_lock; /* Used to sychronize updates to tcp_now */
2d21ac55 276
1c79356b
A
277struct inpcbhead tcb;
278#define tcb6 tcb /* for KAME src sync over BSD*'s */
279struct inpcbinfo tcbinfo;
280
6d2010ae
A
281static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
282 struct tcpopt *, unsigned int);
91447636
A
283static void tcp_pulloutofband(struct socket *,
284 struct tcphdr *, struct mbuf *, int);
39236c6e
A
285static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
286 struct ifnet *);
287static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
2d21ac55 288static inline unsigned int tcp_maxmtu(struct rtentry *);
6d2010ae 289static inline int tcp_stretch_ack_enable(struct tcpcb *tp);
39236c6e 290static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
6d2010ae
A
291
292#if TRAFFIC_MGT
fe8ab488
A
293static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
294 int reset_size);
39236c6e
A
295void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
296static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
6d2010ae
A
297#endif /* TRAFFIC_MGT */
298
2d21ac55
A
299#if INET6
300static inline unsigned int tcp_maxmtu6(struct rtentry *);
301#endif
1c79356b 302
316670eb
A
303static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
304 struct tcpopt *to, u_int32_t tlen);
305
306void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
307static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
308static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
309static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
310 u_int32_t newsize, u_int32_t idealsize);
39236c6e 311static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
39236c6e
A
312static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
313 struct tcphdr *th);
fe8ab488
A
314static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
315static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
316 struct tcpopt *to);
39236c6e
A
317/*
318 * Constants used for resizing receive socket buffer
319 * when timestamps are not supported
320 */
316670eb
A
321#define TCPTV_RCVNOTS_QUANTUM 100
322#define TCP_RCVNOTS_BYTELEVEL 204800
39236c6e
A
323
324/*
325 * Constants used for limiting early retransmits
326 * to 10 per minute.
327 */
328#define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
329#define TCP_EARLY_REXMT_LIMIT 10
9bccf70c 330
2d21ac55 331extern void ipfwsyslog( int level, const char *format,...);
91447636
A
332extern int fw_verbose;
333
2d21ac55 334#if IPFIREWALL
39236c6e
A
335extern void ipfw_stealth_stats_incr_tcp(void);
336
91447636
A
337#define log_in_vain_log( a ) { \
338 if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \
339 ipfwsyslog a ; \
39236c6e
A
340 } else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) { \
341 ipfw_stealth_stats_incr_tcp(); \
91447636
A
342 } \
343 else log a ; \
344}
2d21ac55
A
345#else
346#define log_in_vain_log( a ) { log a; }
347#endif
348
6d2010ae
A
349int tcp_rcvunackwin = TCPTV_UNACKWIN;
350int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
351int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
316670eb
A
352SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
353 &tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");
91447636 354
39236c6e
A
355#define DELAY_ACK(tp, th) \
356 (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
91447636 357
2d21ac55 358static int tcp_dropdropablreq(struct socket *head);
8ad349bb 359static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
9bccf70c 360
6d2010ae 361static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
6d2010ae
A
362void tcp_set_background_cc(struct socket *so);
363void tcp_set_foreground_cc(struct socket *so);
364static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
316670eb 365static void tcp_bwmeas_check(struct tcpcb *tp);
6d2010ae
A
366
367#if TRAFFIC_MGT
/*
 * Reset the accumulated inter-arrival jitter (IAJ) state on a connection.
 * Called when jitter-based receive throttling should start over (e.g. when
 * the reassembly queue indicates loss -- see tcp_reass()).
 */
void
reset_acc_iaj(struct tcpcb *tp)
{
	/* Drop the jitter accumulated so far. */
	tp->acc_iaj = 0;
	/* Clear the jitter-derived receive window bookkeeping.
	 * NOTE(review): iaj_rwintop appears to be a window top derived from
	 * IAJ -- confirm against tcp_var.h. */
	tp->iaj_rwintop = 0;
	/* CLEAR_IAJ_STATE() is defined elsewhere (tcp_var.h); it resets the
	 * remaining per-connection IAJ tracking fields. */
	CLEAR_IAJ_STATE(tp);
}
375
376static inline void
377update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
378{
379 if (rst_size > 0)
380 tp->iaj_size = 0;
381 if (tp->iaj_size == 0 || size >= tp->iaj_size) {
382 tp->iaj_size = size;
383 tp->iaj_rcv_ts = tcp_now;
384 tp->iaj_small_pkt = 0;
385 }
386}
387
6d2010ae
A
/*
 * Integer square root: for any 32-bit unsigned integer val, return the
 * largest integer n such that n*n <= val.  The slow path is a classic
 * bit-by-bit (MSB-first) binary square root that takes at most 16
 * iterations irrespective of val and involves no multiplications; values
 * up to 100 are answered from a small table of perfect squares.
 *
 * Fix: the lookup table is now 'static const' -- previously it was an
 * automatic array whose 11 elements were re-initialized on every call.
 */
static inline int
isqrt(unsigned int val)
{
	static const unsigned int sqrt_cache[11] =
	    {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
	unsigned int temp, g = 0, b = 0x8000, bshft = 15;

	if (val <= 100) {
		/*
		 * Linear scan: stop at the exact square, or step back one
		 * slot when we overshoot (g is then floor(sqrt(val))).
		 */
		for (g = 0; g <= 10; ++g) {
			if (sqrt_cache[g] > val) {
				g--;
				break;
			} else if (sqrt_cache[g] == val) {
				break;
			}
		}
	} else {
		/*
		 * Digit-by-digit method: try each result bit b from the top;
		 * temp = (2*g + b) << bshft is the amount the square grows by
		 * if bit b is set in the root.
		 */
		do {
			temp = (((g << 1) + b) << (bshft--));
			if (val >= temp) {
				g += b;
				val -= temp;
			}
			b >>= 1;
		} while (b > 0 && val > 0);
	}
	return (g);
}
417
39236c6e
A
418/*
419* With LRO, roughly estimate the inter arrival time between
420* each sub coalesced packet as an average. Count the delay
421* cur_iaj to be the delay between the last packet received
422* and the first packet of the LRO stream. Due to round off errors
423* cur_iaj may be the same as lro_delay_factor. Averaging has
424* round off errors too. lro_delay_factor may be close to 0
425* in steady state leading to lower values fed to compute_iaj_meat.
426*/
6d2010ae 427void
39236c6e 428compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor)
6d2010ae 429{
39236c6e
A
430 uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
431 uint32_t timediff = 0;
432
433 if (cur_iaj >= lro_delay_factor) {
434 cur_iaj = cur_iaj - lro_delay_factor;
435 }
436
437 compute_iaj_meat(tp, cur_iaj);
438
439 if (nlropkts <= 1)
440 return;
441
442 nlropkts--;
443
444 timediff = lro_delay_factor/nlropkts;
445
446 while (nlropkts > 0)
447 {
448 compute_iaj_meat(tp, timediff);
449 nlropkts--;
450 }
451}
452
/*
 * Core inter-arrival jitter (IAJ) accounting: fold one arrival-delay sample
 * (cur_iaj, in the same time units as tcp_now ticks) into the connection's
 * running average, standard deviation, and accumulated-excess-jitter value.
 * The caller uses tp->acc_iaj elsewhere to throttle the receive window.
 */
static
void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
{
	/* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
	 * throttle the receive window to a minimum of MIN_IAJ_WIN packets
	 */
#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
	/* Weighting: history gets (2^IAJ_DIV_SHIFT - 1)/2^IAJ_DIV_SHIFT,
	 * the new sample gets 1/2^IAJ_DIV_SHIFT. */
#define IAJ_DIV_SHIFT 4
#define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))

	uint32_t allowed_iaj, acc_iaj = 0;

	uint32_t mean, temp;
	int32_t cur_iaj_dev;

	/* Signed deviation of this sample from the running average. */
	cur_iaj_dev = (cur_iaj - tp->avg_iaj);

	/* Allow a jitter of "allowed_iaj" milliseconds. Some connections
	 * may have a constant jitter more than that. We detect this by
	 * using standard deviation.
	 */
	allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
	if (allowed_iaj < tcp_allowed_iaj)
		allowed_iaj = tcp_allowed_iaj;

	/* Initially when the connection starts, the senders congestion
	 * window is small. During this period we avoid throttling a
	 * connection because we do not have a good starting point for
	 * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
	 * the first few packets.
	 */
	if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
		if ( cur_iaj <= allowed_iaj ) {
			/* Within tolerance: decay the accumulator by 2,
			 * saturating at 0. */
			if (tp->acc_iaj >= 2)
				acc_iaj = tp->acc_iaj - 2;
			else
				acc_iaj = 0;

		} else {
			/* Over tolerance: accumulate the excess. */
			acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
		}

		if (acc_iaj > MAX_ACC_IAJ)
			acc_iaj = MAX_ACC_IAJ;
		tp->acc_iaj = acc_iaj;
	}

	/* Compute weighted average where the history has a weight of
	 * 15 out of 16 and the current value has a weight of 1 out of 16.
	 * This will make the short-term measurements have more weight.
	 *
	 * The addition of 8 will help to round-up the value
	 * instead of round-down
	 */
	tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
	    + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;

	/* Compute Root-mean-square of deviation where mean is a weighted
	 * average as described above.
	 */
	temp = tp->std_dev_iaj * tp->std_dev_iaj;
	mean = (((temp << IAJ_DIV_SHIFT) - temp)
	    + (cur_iaj_dev * cur_iaj_dev)
	    + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;

	tp->std_dev_iaj = isqrt(mean);

	DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
	    uint32_t, allowed_iaj);

	return;
}
525#endif /* TRAFFIC_MGT */
9bccf70c 526
316670eb
A
527/* Check if enough amount of data has been acknowledged since
528 * bw measurement was started
529 */
530static void
531tcp_bwmeas_check(struct tcpcb *tp)
532{
533 int32_t bw_meas_bytes;
534 uint32_t bw, bytes, elapsed_time;
535 bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
536 if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 &&
537 bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
538 bytes = bw_meas_bytes;
539 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
540 if (elapsed_time > 0) {
541 bw = bytes / elapsed_time;
542 if ( bw > 0) {
543 if (tp->t_bwmeas->bw_sndbw > 0) {
544 tp->t_bwmeas->bw_sndbw =
545 (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3;
546 } else {
547 tp->t_bwmeas->bw_sndbw = bw;
548 }
549 }
550 }
551 tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
552 }
553}
554
9bccf70c 555static int
39236c6e
A
556tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
557 struct ifnet *ifp)
1c79356b 558{
9bccf70c
A
559 struct tseg_qent *q;
560 struct tseg_qent *p = NULL;
561 struct tseg_qent *nq;
8ad349bb 562 struct tseg_qent *te = NULL;
39236c6e
A
563 struct inpcb *inp = tp->t_inpcb;
564 struct socket *so = inp->inp_socket;
565 int flags = 0;
91447636 566 int dowakeup = 0;
39236c6e
A
567 struct mbuf *oodata = NULL;
568 int copy_oodata = 0;
fe8ab488 569 u_int16_t qlimit;
39236c6e
A
570 boolean_t cell = IFNET_IS_CELLULAR(ifp);
571 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
fe8ab488 572 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1c79356b
A
573
574 /*
575 * Call with th==0 after become established to
576 * force pre-ESTABLISHED data up to user socket.
577 */
8ad349bb 578 if (th == NULL)
1c79356b 579 goto present;
6d2010ae 580
fe8ab488
A
581 /*
582 * If the reassembly queue already has entries or if we are going
583 * to add a new one, then the connection has reached a loss state.
6d2010ae
A
584 * Reset the stretch-ack algorithm at this point.
585 */
fe8ab488 586 tcp_reset_stretch_ack(tp);
6d2010ae
A
587
588#if TRAFFIC_MGT
589 if (tp->acc_iaj > 0)
590 reset_acc_iaj(tp);
591#endif /* TRAFFIC_MGT */
1c79356b 592
e5568f75
A
593 /*
594 * Limit the number of segments in the reassembly queue to prevent
595 * holding on to too many segments (and thus running out of mbufs).
596 * Make sure to let the missing segment through which caused this
597 * queue. Always keep one global queue entry spare to be able to
598 * process the missing segment.
599 */
fe8ab488
A
600 qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
601 tcp_autorcvbuf_max >> 10);
e5568f75 602 if (th->th_seq != tp->rcv_nxt &&
fe8ab488 603 (tp->t_reassqlen + 1) >= qlimit) {
e5568f75
A
604 tcp_reass_overflows++;
605 tcpstat.tcps_rcvmemdrop++;
606 m_freem(m);
2d21ac55 607 *tlenp = 0;
e5568f75
A
608 return (0);
609 }
610
9bccf70c 611 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
fe8ab488 612 te = (struct tseg_qent *) zalloc(tcp_reass_zone);
9bccf70c 613 if (te == NULL) {
1c79356b
A
614 tcpstat.tcps_rcvmemdrop++;
615 m_freem(m);
616 return (0);
617 }
fe8ab488 618 tp->t_reassqlen++;
1c79356b
A
619
620 /*
621 * Find a segment which begins after this one does.
622 */
9bccf70c
A
623 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
624 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
1c79356b 625 break;
9bccf70c
A
626 p = q;
627 }
1c79356b
A
628
629 /*
630 * If there is a preceding segment, it may provide some of
631 * our data already. If so, drop the data from the incoming
632 * segment. If it provides all of our data, drop us.
633 */
634 if (p != NULL) {
1c79356b 635 register int i;
1c79356b 636 /* conversion to int (in i) handles seq wraparound */
9bccf70c 637 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
1c79356b 638 if (i > 0) {
9bccf70c 639 if (i >= *tlenp) {
1c79356b 640 tcpstat.tcps_rcvduppack++;
9bccf70c 641 tcpstat.tcps_rcvdupbyte += *tlenp;
6d2010ae 642 if (nstat_collect) {
fe8ab488
A
643 nstat_route_rx(inp->inp_route.ro_rt,
644 1, *tlenp,
645 NSTAT_RX_FLAG_DUPLICATE);
646 INP_ADD_STAT(inp, cell, wifi, wired,
647 rxpackets, 1);
648 INP_ADD_STAT(inp, cell, wifi, wired,
649 rxbytes, *tlenp);
6d2010ae
A
650 tp->t_stat.rxduplicatebytes += *tlenp;
651 }
1c79356b 652 m_freem(m);
6d2010ae 653 zfree(tcp_reass_zone, te);
39236c6e 654 te = NULL;
fe8ab488 655 tp->t_reassqlen--;
1c79356b
A
656 /*
657 * Try to present any queued data
658 * at the left window edge to the user.
659 * This is needed after the 3-WHS
660 * completes.
661 */
39236c6e 662 goto present;
1c79356b
A
663 }
664 m_adj(m, i);
9bccf70c 665 *tlenp -= i;
1c79356b
A
666 th->th_seq += i;
667 }
668 }
669 tcpstat.tcps_rcvoopack++;
9bccf70c 670 tcpstat.tcps_rcvoobyte += *tlenp;
6d2010ae 671 if (nstat_collect) {
fe8ab488
A
672 nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
673 NSTAT_RX_FLAG_OUT_OF_ORDER);
674 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
675 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
6d2010ae
A
676 tp->t_stat.rxoutoforderbytes += *tlenp;
677 }
1c79356b
A
678
679 /*
680 * While we overlap succeeding segments trim them or,
681 * if they are completely covered, dequeue them.
682 */
683 while (q) {
9bccf70c 684 register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
1c79356b
A
685 if (i <= 0)
686 break;
9bccf70c
A
687 if (i < q->tqe_len) {
688 q->tqe_th->th_seq += i;
689 q->tqe_len -= i;
690 m_adj(q->tqe_m, i);
1c79356b
A
691 break;
692 }
9bccf70c
A
693
694 nq = LIST_NEXT(q, tqe_q);
695 LIST_REMOVE(q, tqe_q);
696 m_freem(q->tqe_m);
6d2010ae 697 zfree(tcp_reass_zone, q);
fe8ab488 698 tp->t_reassqlen--;
1c79356b
A
699 q = nq;
700 }
701
9bccf70c
A
702 /* Insert the new segment queue entry into place. */
703 te->tqe_m = m;
704 te->tqe_th = th;
705 te->tqe_len = *tlenp;
1c79356b 706
1c79356b 707 if (p == NULL) {
9bccf70c 708 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
1c79356b 709 } else {
9bccf70c 710 LIST_INSERT_AFTER(p, te, tqe_q);
1c79356b
A
711 }
712
39236c6e
A
713 /*
714 * New out-of-order data exists, and is pointed to by
715 * queue entry te. Set copy_oodata to 1 so out-of-order data
716 * can be copied off to sockbuf after in-order data
717 * is copied off.
718 */
719 if (!(so->so_state & SS_CANTRCVMORE))
720 copy_oodata = 1;
721
1c79356b
A
722present:
723 /*
724 * Present data to user, advancing rcv_nxt through
725 * completed sequence space.
726 */
727 if (!TCPS_HAVEESTABLISHED(tp->t_state))
728 return (0);
9bccf70c 729 q = LIST_FIRST(&tp->t_segq);
316670eb
A
730 if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
731 /* Stop using LRO once out of order packets arrive */
732 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
39236c6e 733 tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
316670eb
A
734 th->th_dport, th->th_sport);
735 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
736 }
39236c6e
A
737
738 /*
739 * continue processing if out-of-order data
740 * can be delivered
741 */
742 if (q && (so->so_flags & SOF_ENABLE_MSGS))
743 goto msg_unordered_delivery;
744
1c79356b 745 return (0);
fe8ab488
A
746 }
747
748 /* lost packet was recovered, so ooo data can be returned */
749 tcpstat.tcps_recovered_pkts++;
750
1c79356b 751 do {
9bccf70c
A
752 tp->rcv_nxt += q->tqe_len;
753 flags = q->tqe_th->th_flags & TH_FIN;
754 nq = LIST_NEXT(q, tqe_q);
755 LIST_REMOVE(q, tqe_q);
39236c6e 756 if (so->so_state & SS_CANTRCVMORE) {
9bccf70c 757 m_freem(q->tqe_m);
39236c6e 758 } else {
6d2010ae 759 so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
39236c6e
A
760 if (so->so_flags & SOF_ENABLE_MSGS) {
761 /*
762 * Append the inorder data as a message to the
763 * receive socket buffer. Also check to see if
764 * the data we are about to deliver is the same
765 * data that we wanted to pass up to the user
766 * out of order. If so, reset copy_oodata --
767 * the received data filled a gap, and
768 * is now in order!
769 */
770 if (q == te)
771 copy_oodata = 0;
772 }
773 if (sbappendstream_rcvdemux(so, q->tqe_m,
774 q->tqe_th->th_seq - (tp->irs + 1), 0))
91447636 775 dowakeup = 1;
316670eb
A
776 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
777 tcp_update_lro_seq(tp->rcv_nxt,
39236c6e
A
778 inp->inp_laddr, inp->inp_faddr,
779 th->th_dport, th->th_sport);
316670eb 780 }
91447636 781 }
6d2010ae 782 zfree(tcp_reass_zone, q);
fe8ab488 783 tp->t_reassqlen--;
1c79356b 784 q = nq;
9bccf70c 785 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
9bccf70c 786
1c79356b 787#if INET6
39236c6e 788 if ((inp->inp_vflag & INP_IPV6) != 0) {
9bccf70c
A
789
790 KERNEL_DEBUG(DBG_LAYER_BEG,
39236c6e
A
791 ((inp->inp_fport << 16) | inp->inp_lport),
792 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
793 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
9bccf70c
A
794 0,0,0);
795 }
796 else
1c79356b 797#endif
9bccf70c
A
798 {
799 KERNEL_DEBUG(DBG_LAYER_BEG,
39236c6e
A
800 ((inp->inp_fport << 16) | inp->inp_lport),
801 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
802 (inp->inp_faddr.s_addr & 0xffff)),
9bccf70c 803 0,0,0);
91447636 804 }
39236c6e
A
805
806msg_unordered_delivery:
807 /* Deliver out-of-order data as a message */
808 if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
809 /*
810 * make a copy of the mbuf to be delivered up to
811 * the user, and add it to the sockbuf
812 */
813 oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
814 if (oodata != NULL) {
815 if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
816 te->tqe_th->th_seq - (tp->irs + 1), 1)) {
817 dowakeup = 1;
818 tcpstat.tcps_msg_unopkts++;
819 } else {
820 tcpstat.tcps_msg_unoappendfail++;
821 }
822 }
823 }
824
91447636
A
825 if (dowakeup)
826 sorwakeup(so); /* done with socket lock held */
1c79356b
A
827 return (flags);
828}
829
2d21ac55
A
/*
 * Reduce congestion window -- enter fast recovery.
 * Snapshots snd_max as the recovery point, cancels the retransmit/probe
 * timers and any RTT sample in flight, and deflates cwnd to
 * ssthresh + tcprexmtthresh segments.
 */
static void
tcp_reduce_congestion_window(
	struct tcpcb *tp)
{
	/*
	 * If the current tcp cc module has
	 * defined a hook for tasks to run
	 * before entering FR, call it
	 */
	if (CC_ALGO(tp)->pre_fr != NULL)
		CC_ALGO(tp)->pre_fr(tp);
	ENTER_FASTRECOVERY(tp);
	/* Recovery ends once snd_una passes this point. */
	tp->snd_recover = tp->snd_max;
	tp->t_timer[TCPT_REXMT] = 0;
	tp->t_timer[TCPT_PTO] = 0;
	/* Discard the RTT measurement in progress -- it is now tainted. */
	tp->t_rtttime = 0;
	tp->snd_cwnd = tp->snd_ssthresh +
	    tp->t_maxseg * tcprexmtthresh;
}
852
39236c6e
A
853/*
854 * The application wants to get an event if there
855 * is a stall during read. Set the initial keepalive
856 * timeout to be equal to twice RTO.
857 */
858static inline void
859tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
860{
861 if (tp->t_adaptive_rtimo > 0 && tlen > 0 &&
862 tp->t_state == TCPS_ESTABLISHED) {
863 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
864 (TCP_REXMTVAL(tp) << 1));
865 tp->t_flagsext |= TF_DETECT_READSTALL;
866 tp->t_rtimo_probes = 0;
867 }
868}
869
/*
 * Restore the keepalive timer to the normal connection-idle interval and
 * disarm the read-stall detection that tcp_adaptive_rwtimo_check() may
 * have enabled.
 */
inline void
tcp_keepalive_reset(struct tcpcb *tp)
{
	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	    TCP_CONN_KEEPIDLE(tp));
	tp->t_flagsext &= ~(TF_DETECT_READSTALL);
	tp->t_rtimo_probes = 0;
}
9bccf70c 878
1c79356b
A
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
#if INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
#pragma unused(proto)
	register struct mbuf *m = *mp;
	uint32_t ia6_flags;
	struct ifnet *ifp = m->m_pkthdr.rcvif;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);

	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 *
	 * Reject segments whose destination is an anycast address with
	 * an ICMPv6 unreachable, per the draft.
	 */
	if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0 &&
	    (ia6_flags & IN6_IFF_ANYCAST)) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

		icmp6_error(m, ICMP6_DST_UNREACH,
		    ICMP6_DST_UNREACH_ADDR,
		    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);

		IF_TCP_STATINC(ifp, icmp6unreach);

		return (IPPROTO_DONE);
	}

	tcp_input(m, *offp);
	return (IPPROTO_DONE);
}
#endif
920
316670eb
A
921/* Depending on the usage of mbuf space in the system, this function
922 * will return true or false. This is used to determine if a socket
923 * buffer can take more memory from the system for auto-tuning or not.
924 */
925u_int8_t
926tcp_cansbgrow(struct sockbuf *sb)
927{
928 /* Calculate the host level space limit in terms of MSIZE buffers.
929 * We can use a maximum of half of the available mbuf space for
930 * socket buffers.
931 */
932 u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
933
934 /* Calculate per sb limit in terms of bytes. We optimize this limit
935 * for upto 16 socket buffers.
936 */
937
938 u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
939
940 if ((total_sbmb_cnt < mblim) &&
941 (sb->sb_hiwat < sbspacelim)) {
942 return(1);
fe8ab488
A
943 } else {
944 OSIncrementAtomic64(&sbmb_limreached);
316670eb
A
945 }
946 return(0);
947}
948
39236c6e
A
949static void
950tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
951 u_int32_t newsize, u_int32_t idealsize)
952{
316670eb
A
953
954 /* newsize should not exceed max */
955 newsize = min(newsize, tcp_autorcvbuf_max);
956
957 /* The receive window scale negotiated at the
958 * beginning of the connection will also set a
959 * limit on the socket buffer size
960 */
961 newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
962
963 /* Set new socket buffer size */
964 if (newsize > sbrcv->sb_hiwat &&
965 (sbreserve(sbrcv, newsize) == 1)) {
966 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
967 (idealsize != 0) ? idealsize : newsize),
968 tcp_autorcvbuf_max);
969
970 /* Again check the limit set by the advertised
971 * window scale
972 */
973 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
974 TCP_MAXWIN << tp->rcv_scale);
975 }
976}
977
978/*
979 * This function is used to grow a receive socket buffer. It
980 * will take into account system-level memory usage and the
981 * bandwidth available on the link to make a decision.
982 */
983static void
984tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
985 struct tcpopt *to, u_int32_t pktlen) {
986
39236c6e
A
987 /*
988 * Do not grow the receive socket buffer if
989 * - auto resizing is disabled, globally or on this socket
fe8ab488 990 * - the high water mark already reached the maximum
39236c6e
A
991 * - the stream is in background and receive side is being
992 * throttled
993 * - if there are segments in reassembly queue indicating loss,
994 * do not need to increase recv window during recovery as more
fe8ab488
A
995 * data is not going to be sent. A duplicate ack sent during
996 * recovery should not change the receive window
39236c6e 997 */
316670eb
A
998 if (tcp_do_autorcvbuf == 0 ||
999 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1000 tcp_cansbgrow(sbrcv) == 0 ||
39236c6e
A
1001 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
1002 (tp->t_flagsext & TF_RECV_THROTTLE) ||
1003 !LIST_EMPTY(&tp->t_segq)) {
316670eb
A
1004 /* Can not resize the socket buffer, just return */
1005 goto out;
1006 }
1007
1008 if (TSTMP_GT(tcp_now,
1009 tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
1010 /* If there has been an idle period in the
1011 * connection, just restart the measurement
1012 */
1013 goto out;
1014 }
1015
39236c6e 1016 if (!TSTMP_SUPPORTED(tp)) {
316670eb
A
1017 /*
1018 * Timestamp option is not supported on this connection.
1019 * If the connection reached a state to indicate that
1020 * the receive socket buffer needs to grow, increase
fe8ab488
A
1021 * the high water mark.
1022 */
316670eb
A
1023 if (TSTMP_GEQ(tcp_now,
1024 tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
1025 if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
1026 tcp_sbrcv_reserve(tp, sbrcv,
1027 tcp_autorcvbuf_max, 0);
1028 }
1029 goto out;
1030 } else {
1031 tp->rfbuf_cnt += pktlen;
1032 return;
fe8ab488 1033 }
316670eb 1034 } else if (to->to_tsecr != 0) {
fe8ab488
A
1035 /*
1036 * If the timestamp shows that one RTT has
316670eb
A
1037 * completed, we can stop counting the
1038 * bytes. Here we consider increasing
fe8ab488
A
1039 * the socket buffer if the bandwidth measured in
1040 * last rtt, is more than half of sb_hiwat, this will
1041 * help to scale the buffer according to the bandwidth
1042 * on the link.
316670eb
A
1043 */
1044 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
fe8ab488
A
1045 if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
1046 (sbrcv->sb_hiwat >> 1))) {
1047 int32_t rcvbuf_inc, min_incr;
316670eb 1048 /*
fe8ab488
A
1049 * Increment the receive window by a
1050 * multiple of maximum sized segments.
1051 * This will prevent a connection from
1052 * sending smaller segments on wire if it
1053 * is limited by the receive window.
316670eb 1054 *
fe8ab488
A
1055 * Set the ideal size based on current
1056 * bandwidth measurements. We set the
1057 * ideal size on receive socket buffer to
1058 * be twice the bandwidth delay product.
1059 */
1060 rcvbuf_inc = (tp->rfbuf_cnt << 1)
1061 - sbrcv->sb_hiwat;
1062
1063 /*
1064 * Make the increment equal to 8 segments
1065 * at least
316670eb 1066 */
fe8ab488
A
1067 min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1068 if (rcvbuf_inc < min_incr)
1069 rcvbuf_inc = min_incr;
1070
1071 rcvbuf_inc =
1072 (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
316670eb
A
1073 tcp_sbrcv_reserve(tp, sbrcv,
1074 sbrcv->sb_hiwat + rcvbuf_inc,
1075 (tp->rfbuf_cnt * 2));
1076 }
1077 goto out;
1078 } else {
1079 tp->rfbuf_cnt += pktlen;
1080 return;
1081 }
1082 }
1083out:
1084 /* Restart the measurement */
1085 tp->rfbuf_ts = 0;
1086 tp->rfbuf_cnt = 0;
1087 return;
1088}
1089
1090/* This function will trim the excess space added to the socket buffer
1091 * to help a slow-reading app. The ideal-size of a socket buffer depends
1092 * on the link bandwidth or it is set by an application and we aim to
1093 * reach that size.
1094 */
1095void
1096tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
1097 if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1098 sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1099 int32_t trim;
1100 /* compute the difference between ideal and current sizes */
1101 u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1102
1103 /* Compute the maximum advertised window for
1104 * this connection.
1105 */
1106 u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1107
1108 /* How much can we trim the receive socket buffer?
1109 * 1. it can not be trimmed beyond the max rcv win advertised
1110 * 2. if possible, leave 1/16 of bandwidth*delay to
1111 * avoid closing the win completely
1112 */
1113 u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1114
1115 /* Sometimes leave can be zero, in that case leave at least
1116 * a few segments worth of space.
1117 */
1118 if (leave == 0)
1119 leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1120
1121 trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
1122 trim = imin(trim, (int32_t)diff);
1123
1124 if (trim > 0)
1125 sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1126 }
1127}
1128
1129/* We may need to trim the send socket buffer size for two reasons:
1130 * 1. if the rtt seen on the connection is climbing up, we do not
1131 * want to fill the buffers any more.
1132 * 2. if the congestion win on the socket backed off, there is no need
1133 * to hold more mbufs for that connection than what the cwnd will allow.
1134 */
1135void
1136tcp_sbsnd_trim(struct sockbuf *sbsnd) {
1137 if (tcp_do_autosendbuf == 1 &&
1138 ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1139 (SB_AUTOSIZE | SB_TRIM)) &&
1140 (sbsnd->sb_idealsize > 0) &&
1141 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1142 u_int32_t trim = 0;
1143 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1144 trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1145 } else {
1146 trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1147 }
1148 sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1149 }
1150 if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
1151 sbsnd->sb_flags &= ~(SB_TRIM);
1152}
1153
1154/*
1155 * If timestamp option was not negotiated on this connection
1156 * and this connection is on the receiving side of a stream
1157 * then we can not measure the delay on the link accurately.
1158 * Instead of enabling automatic receive socket buffer
1159 * resizing, just give more space to the receive socket buffer.
1160 */
1161static inline void
1162tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
1163 struct socket *so = tp->t_inpcb->inp_socket;
1164 u_int32_t newsize = 2 * tcp_recvspace;
1165 struct sockbuf *sbrcv = &so->so_rcv;
1166
1167 if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1168 (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1169 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
1170 tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
1171 }
1172}
1173
6d2010ae
A
1174/* A receiver will evaluate the flow of packets on a connection
1175 * to see if it can reduce ack traffic. The receiver will start
1176 * stretching acks if all of the following conditions are met:
1177 * 1. tcp_delack_enabled is set to 3
1178 * 2. If the bytes received in the last 100ms is greater than a threshold
1179 * defined by maxseg_unacked
1180 * 3. If the connection has not been idle for tcp_maxrcvidle period.
1181 * 4. If the connection has seen enough packets to let the slow-start
1182 * finish after connection establishment or after some packet loss.
1183 *
1184 * The receiver will stop stretching acks if there is congestion/reordering
1185 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1186 * timer fires while stretching acks, it means that the packet flow has gone
1187 * below the threshold defined by maxseg_unacked and the receiver will stop
1188 * stretching acks. The receiver gets no indication when slow-start is completed
1189 * or when the connection reaches an idle state. That is why we use
1190 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1191 * state.
1192 */
39236c6e
A
1193static inline int
1194tcp_stretch_ack_enable(struct tcpcb *tp)
1195{
fe8ab488 1196 if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) &&
39236c6e 1197 tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
6d2010ae 1198 TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
39236c6e 1199 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
6d2010ae
A
1200 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1201 return(1);
1202 }
316670eb 1203
6d2010ae
A
1204 return(0);
1205}
1206
fe8ab488
A
1207/*
1208 * Reset the state related to stretch-ack algorithm. This will make
6d2010ae
A
1209 * the receiver generate an ack every other packet. The receiver
1210 * will start re-evaluating the rate at which packets come to decide
1211 * if it can benefit by lowering the ack traffic.
1212 */
1213void
1214tcp_reset_stretch_ack(struct tcpcb *tp)
1215{
1216 tp->t_flags &= ~(TF_STRETCHACK);
1217 tp->rcv_by_unackwin = 0;
1218 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
fe8ab488
A
1219
1220 /*
1221 * When there is packet loss or packet re-ordering or CWR due to
1222 * ECN, the sender's congestion window is reduced. In these states,
1223 * generate an ack for every other packet for some time to allow
1224 * the sender's congestion window to grow.
1225 */
1226 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1227 tp->rcv_waitforss = 0;
6d2010ae
A
1228}
1229
39236c6e
A
1230/*
1231 * The last packet was a retransmission, check if this ack
1232 * indicates that the retransmission was spurious.
1233 *
1234 * If the connection supports timestamps, we could use it to
1235 * detect if the last retransmit was not needed. Otherwise,
1236 * we check if the ACK arrived within RTT/2 window, then it
1237 * was a mistake to do the retransmit in the first place.
1238 *
1239 * This function will return 1 if it is a spurious retransmit,
1240 * 0 otherwise.
1241 */
fe8ab488
A
1242int
1243tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
1244 struct tcpopt *to, u_int32_t rxtime)
39236c6e
A
1245{
1246 int32_t tdiff, bad_rexmt_win;
39236c6e
A
1247 bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
1248
fe8ab488
A
1249 /* If the ack has ECN CE bit, then cwnd has to be adjusted */
1250 if ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)
1251 && (th->th_flags & TH_ECE))
1252 return (0);
1253 if (TSTMP_SUPPORTED(tp)) {
1254 if (rxtime > 0 && (to->to_flags & TOF_TS)
1255 && to->to_tsecr != 0
1256 && TSTMP_LT(to->to_tsecr, rxtime))
1257 return (1);
1258 } else {
1259 if ((tp->t_rxtshift == 1
1260 || (tp->t_flagsext & TF_SENT_TLPROBE))
1261 && rxtime > 0) {
1262 tdiff = (int32_t)(tcp_now - rxtime);
1263 if (tdiff < bad_rexmt_win)
1264 return(1);
1265 }
39236c6e
A
1266 }
1267 return(0);
1268}
1269
1270
1271/*
1272 * Restore congestion window state if a spurious timeout
1273 * was detected.
1274 */
1275static void
1276tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1277{
1278 if (TSTMP_SUPPORTED(tp)) {
1279 u_int32_t fsize, acked;
1280 fsize = tp->snd_max - th->th_ack;
1281 acked = BYTES_ACKED(th, tp);
1282
1283 /*
1284 * Implement bad retransmit recovery as
1285 * described in RFC 4015.
1286 */
1287 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1288
1289 /* Initialize cwnd to the initial window */
1290 if (CC_ALGO(tp)->cwnd_init != NULL)
1291 CC_ALGO(tp)->cwnd_init(tp);
1292
1293 tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1294
1295 } else {
1296 tp->snd_cwnd = tp->snd_cwnd_prev;
1297 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1298 if (tp->t_flags & TF_WASFRECOVERY)
1299 ENTER_FASTRECOVERY(tp);
1300 }
fe8ab488 1301 tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
39236c6e
A
1302 tp->snd_recover = tp->snd_recover_prev;
1303 tp->snd_nxt = tp->snd_max;
1304 tp->t_rxtshift = 0;
1305 tp->t_rxtstart = 0;
1306
1307 /* Fix send socket buffer to reflect the change in cwnd */
1308 tcp_bad_rexmt_fix_sndbuf(tp);
1309
1310 /*
1311 * This RTT might reflect the extra delay induced
1312 * by the network. Skip using this sample for RTO
1313 * calculation and mark the connection so we can
1314 * recompute RTT when the next eligible sample is
1315 * found.
1316 */
1317 tp->t_flagsext |= TF_RECOMPUTE_RTT;
1318 tp->t_badrexmt_time = tcp_now;
1319 tp->t_rtttime = 0;
1320}
1321
fe8ab488
A
1322/*
1323 * If the previous packet was sent in retransmission timer, and it was
1324 * not needed, then restore the congestion window to the state before that
1325 * transmission.
1326 *
1327 * If the last packet was sent in tail loss probe timeout, check if that
1328 * recovered the last packet. If so, that will indicate a real loss and
1329 * the congestion window needs to be lowered.
1330 */
1331static void
1332tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
1333{
1334 if (tp->t_rxtshift > 0 &&
1335 tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1336 ++tcpstat.tcps_sndrexmitbad;
1337 tcp_bad_rexmt_restore_state(tp, th);
1338 tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1339 } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
1340 && tp->t_tlphighrxt > 0
1341 && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
1342 && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1343 /*
1344 * The tail loss probe recovered the last packet and
1345 * we need to adjust the congestion window to take
1346 * this loss into account.
1347 */
1348 ++tcpstat.tcps_tlp_recoverlastpkt;
1349 if (!IN_FASTRECOVERY(tp)) {
1350 tcp_reduce_congestion_window(tp);
1351 EXIT_FASTRECOVERY(tp);
1352 }
1353 tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1354 }
1355
1356 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1357 tp->t_tlphighrxt = 0;
1358 tp->t_tlpstart = 0;
1359
1360 /*
1361 * check if the latest ack was for a segment sent during PMTU
1362 * blackhole detection. If the timestamp on the ack is before
1363 * PMTU blackhole detection, then revert the size of the max
1364 * segment to previous size.
1365 */
1366 if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
1367 tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
1368 if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
1369 && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
1370 tcp_pmtud_revert_segment_size(tp);
1371 }
1372 }
1373 if (tp->t_pmtud_start_ts > 0)
1374 tp->t_pmtud_start_ts = 0;
1375}
1376
1377/*
1378 * Check if early retransmit can be attempted according to RFC 5827.
1379 *
1380 * If packet reordering is detected on a connection, fast recovery will
1381 * be delayed until it is clear that the packet was lost and not reordered.
1382 * But reordering detection is done only when SACK is enabled.
1383 *
1384 * On connections that do not support SACK, there is a limit on the number
1385 * of early retransmits that can be done per minute. This limit is needed
1386 * to make sure that too many packets are not retransmitted when there is
1387 * packet reordering.
1388 */
1389static void
1390tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
1391{
1392 u_int32_t obytes, snd_off;
1393 int32_t snd_len;
1394 struct socket *so = tp->t_inpcb->inp_socket;
1395
1396 if (early_rexmt && (SACK_ENABLED(tp) ||
1397 tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
1398 SEQ_GT(tp->snd_max, tp->snd_una) &&
1399 (tp->t_dupacks == 1 ||
1400 (SACK_ENABLED(tp) &&
1401 !TAILQ_EMPTY(&tp->snd_holes)))) {
1402 /*
1403 * If there are only a few outstanding
1404 * segments on the connection, we might need
1405 * to lower the retransmit threshold. This
1406 * will allow us to do Early Retransmit as
1407 * described in RFC 5827.
1408 */
1409 if (SACK_ENABLED(tp) &&
1410 !TAILQ_EMPTY(&tp->snd_holes)) {
1411 obytes = (tp->snd_max - tp->snd_fack) +
1412 tp->sackhint.sack_bytes_rexmit;
1413 } else {
1414 obytes = (tp->snd_max - tp->snd_una);
1415 }
1416
1417 /*
1418 * In order to lower retransmit threshold the
1419 * following two conditions must be met.
1420 * 1. the amount of outstanding data is less
1421 * than 4*SMSS bytes
1422 * 2. there is no unsent data ready for
1423 * transmission or the advertised window
1424 * will limit sending new segments.
1425 */
1426 snd_off = tp->snd_max - tp->snd_una;
1427 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1428 if (obytes < (tp->t_maxseg << 2) &&
1429 snd_len <= 0) {
1430 u_int32_t osegs;
1431
1432 osegs = obytes / tp->t_maxseg;
1433 if ((osegs * tp->t_maxseg) < obytes)
1434 osegs++;
1435
1436 /*
1437 * Since the connection might have already
1438 * received some dupacks, we add them to
1439 * to the outstanding segments count to get
1440 * the correct retransmit threshold.
1441 *
1442 * By checking for early retransmit after
1443 * receiving some duplicate acks when SACK
1444 * is supported, the connection will
1445 * enter fast recovery even if multiple
1446 * segments are lost in the same window.
1447 */
1448 osegs += tp->t_dupacks;
1449 if (osegs < 4) {
1450 tp->t_rexmtthresh =
1451 ((osegs - 1) > 1) ? (osegs - 1) : 1;
1452 tp->t_rexmtthresh =
1453 min(tp->t_rexmtthresh, tcprexmtthresh);
1454 tp->t_rexmtthresh =
1455 max(tp->t_rexmtthresh, tp->t_dupacks);
1456
1457 if (tp->t_early_rexmt_count == 0)
1458 tp->t_early_rexmt_win = tcp_now;
1459
1460 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1461 tcpstat.tcps_tlp_recovery++;
1462 tcp_ccdbg_trace(tp, th,
1463 TCP_CC_TLP_RECOVERY);
1464 } else {
1465 tcpstat.tcps_early_rexmt++;
1466 tp->t_early_rexmt_count++;
1467 tcp_ccdbg_trace(tp, th,
1468 TCP_CC_EARLY_RETRANSMIT);
1469 }
1470 }
1471 }
1472 }
1473
1474 /*
1475 * If we ever sent a TLP probe, the acknowledgement will trigger
1476 * early retransmit because the value of snd_fack will be close
1477 * to snd_max. This will take care of adjustments to the
1478 * congestion window. So we can reset TF_SENT_PROBE flag.
1479 */
1480 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1481 tp->t_tlphighrxt = 0;
1482 tp->t_tlpstart = 0;
1483}
1484
1c79356b 1485void
9bccf70c 1486tcp_input(m, off0)
1c79356b 1487 struct mbuf *m;
9bccf70c 1488 int off0;
1c79356b
A
1489{
1490 register struct tcphdr *th;
1491 register struct ip *ip = NULL;
1c79356b
A
1492 register struct inpcb *inp;
1493 u_char *optp = NULL;
1494 int optlen = 0;
39236c6e 1495 int tlen, off;
9bccf70c 1496 int drop_hdrlen;
1c79356b
A
1497 register struct tcpcb *tp = 0;
1498 register int thflags;
1499 struct socket *so = 0;
1500 int todrop, acked, ourfinisacked, needoutput = 0;
1501 struct in_addr laddr;
9bccf70c 1502#if INET6
1c79356b
A
1503 struct in6_addr laddr6;
1504#endif
1505 int dropsocket = 0;
316670eb 1506 int iss = 0, nosock = 0;
39236c6e 1507 u_int32_t tiwin, sack_bytes_acked = 0;
1c79356b 1508 struct tcpopt to; /* options in this segment */
91447636 1509 struct sockaddr_in *next_hop = NULL;
1c79356b
A
1510#if TCPDEBUG
1511 short ostate = 0;
1512#endif
91447636 1513 struct m_tag *fwd_tag;
2d21ac55 1514 u_char ip_ecn = IPTOS_ECN_NOTECT;
fe8ab488 1515 unsigned int ifscope;
6d2010ae 1516 uint8_t isconnected, isdisconnected;
39236c6e
A
1517 struct ifnet *ifp = m->m_pkthdr.rcvif;
1518 int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
1519 int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
1520 int turnoff_lro = 0, win;
1521#if MPTCP
1522 struct mptcb *mp_tp = NULL;
1523 uint16_t mptcp_csum = 0;
1524#endif /* MPTCP */
1525 boolean_t cell = IFNET_IS_CELLULAR(ifp);
1526 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
fe8ab488
A
1527 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1528 struct tcp_respond_args tra;
39236c6e 1529
316670eb 1530#define TCP_INC_VAR(stat, npkts) do { \
39236c6e 1531 stat += npkts; \
316670eb 1532} while (0)
c910b4d9 1533
316670eb 1534 TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
39236c6e 1535
91447636 1536 /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
b0d623f7
A
1537 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1538 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1539 KERNEL_TAG_TYPE_IPFORWARD, NULL);
1540 } else {
1541 fwd_tag = NULL;
1542 }
91447636 1543 if (fwd_tag != NULL) {
39236c6e
A
1544 struct ip_fwd_tag *ipfwd_tag =
1545 (struct ip_fwd_tag *)(fwd_tag+1);
1546
91447636
A
1547 next_hop = ipfwd_tag->next_hop;
1548 m_tag_delete(m, fwd_tag);
1549 }
39236c6e 1550
1c79356b
A
1551#if INET6
1552 struct ip6_hdr *ip6 = NULL;
9bccf70c 1553 int isipv6;
1c79356b 1554#endif /* INET6 */
9bccf70c 1555 int rstreason; /* For badport_bandlim accounting purposes */
1c79356b 1556 struct proc *proc0=current_proc();
39236c6e 1557
1c79356b
A
1558 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
1559
9bccf70c
A
1560#if INET6
1561 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1562#endif
1c79356b
A
1563 bzero((char *)&to, sizeof(to));
1564
1c79356b
A
1565#if INET6
1566 if (isipv6) {
39236c6e
A
1567 /*
1568 * Expect 32-bit aligned data pointer on
1569 * strict-align platforms
1570 */
316670eb
A
1571 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1572
9bccf70c 1573 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1c79356b 1574 ip6 = mtod(m, struct ip6_hdr *);
9bccf70c 1575 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
316670eb 1576 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
9bccf70c 1577
39236c6e
A
1578 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
1579 goto dropnosock;
6d2010ae 1580
9bccf70c
A
1581 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1582 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1583 th->th_seq, th->th_ack, th->th_win);
1c79356b 1584 /*
9bccf70c
A
1585 * Be proactive about unspecified IPv6 address in source.
1586 * As we use all-zero to indicate unbounded/unconnected pcb,
1587 * unspecified IPv6 address can be used to confuse us.
1588 *
1589 * Note that packets with unspecified IPv6 destination is
1590 * already dropped in ip6_input.
1c79356b 1591 */
9bccf70c
A
1592 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1593 /* XXX stat */
39236c6e 1594 IF_TCP_STATINC(ifp, unspecv6);
91447636 1595 goto dropnosock;
1c79356b 1596 }
39236c6e 1597 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
6d2010ae
A
1598 struct ip6_hdr *, ip6, struct tcpcb *, NULL,
1599 struct tcphdr *, th);
39236c6e
A
1600
1601 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
9bccf70c
A
1602 } else
1603#endif /* INET6 */
ac5ea4a9 1604 {
9bccf70c
A
1605 /*
1606 * Get IP and TCP header together in first mbuf.
1607 * Note: IP leaves IP header in first mbuf.
1608 */
1609 if (off0 > sizeof (struct ip)) {
1610 ip_stripoptions(m, (struct mbuf *)0);
1611 off0 = sizeof(struct ip);
9bccf70c
A
1612 }
1613 if (m->m_len < sizeof (struct tcpiphdr)) {
1614 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
1615 tcpstat.tcps_rcvshort++;
1616 return;
0b4e3aa0 1617 }
9bccf70c 1618 }
316670eb
A
1619
1620 /* Expect 32-bit aligned data pointer on strict-align platforms */
1621 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1622
9bccf70c 1623 ip = mtod(m, struct ip *);
316670eb 1624 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
9bccf70c 1625 tlen = ip->ip_len;
0b4e3aa0 1626
39236c6e 1627 if (tcp_input_checksum(AF_INET, m, th, off0, tlen))
91447636 1628 goto dropnosock;
39236c6e 1629
9bccf70c
A
1630#if INET6
1631 /* Re-initialization for later version check */
1632 ip->ip_v = IPVERSION;
1633#endif
2d21ac55 1634 ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
316670eb
A
1635
1636 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1637 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
39236c6e 1638
316670eb
A
1639 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1640 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1641 th->th_seq, th->th_ack, th->th_win);
1642
ac5ea4a9 1643 }
1c79356b
A
1644
1645 /*
1646 * Check that TCP offset makes sense,
1647 * pull out TCP options and adjust length. XXX
1648 */
9bccf70c
A
1649 off = th->th_off << 2;
1650 if (off < sizeof (struct tcphdr) || off > tlen) {
1c79356b 1651 tcpstat.tcps_rcvbadoff++;
39236c6e 1652 IF_TCP_STATINC(ifp, badformat);
91447636 1653 goto dropnosock;
1c79356b 1654 }
9bccf70c
A
1655 tlen -= off; /* tlen is used instead of ti->ti_len */
1656 if (off > sizeof (struct tcphdr)) {
1c79356b
A
1657#if INET6
1658 if (isipv6) {
91447636 1659 IP6_EXTHDR_CHECK(m, off0, off, return);
1c79356b 1660 ip6 = mtod(m, struct ip6_hdr *);
316670eb 1661 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1c79356b
A
1662 } else
1663#endif /* INET6 */
2d21ac55
A
1664 {
1665 if (m->m_len < sizeof(struct ip) + off) {
1666 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1667 tcpstat.tcps_rcvshort++;
1668 return;
1669 }
1670 ip = mtod(m, struct ip *);
316670eb 1671 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1c79356b
A
1672 }
1673 }
9bccf70c 1674 optlen = off - sizeof (struct tcphdr);
1c79356b
A
1675 optp = (u_char *)(th + 1);
1676 /*
1677 * Do quick retrieval of timestamp options ("options
1678 * prediction?"). If timestamp is the only option and it's
1679 * formatted as recommended in RFC 1323 appendix A, we
1680 * quickly get the values now and not bother calling
1681 * tcp_dooptions(), etc.
1682 */
1683 if ((optlen == TCPOLEN_TSTAMP_APPA ||
2d21ac55 1684 (optlen > TCPOLEN_TSTAMP_APPA &&
1c79356b 1685 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
316670eb 1686 *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
2d21ac55 1687 (th->th_flags & TH_SYN) == 0) {
8ad349bb 1688 to.to_flags |= TOF_TS;
316670eb
A
1689 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
1690 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
1c79356b
A
1691 optp = NULL; /* we've parsed the options */
1692 }
1693 }
1694 thflags = th->th_flags;
1695
9bccf70c
A
1696#if TCP_DROP_SYNFIN
1697 /*
1698 * If the drop_synfin option is enabled, drop all packets with
1699 * both the SYN and FIN bits set. This prevents e.g. nmap from
1700 * identifying the TCP/IP stack.
1701 *
8ad349bb 1702 * This is a violation of the TCP specification.
9bccf70c 1703 */
316670eb 1704 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) {
39236c6e 1705 IF_TCP_STATINC(ifp, synfin);
91447636 1706 goto dropnosock;
316670eb 1707 }
b0d623f7 1708#endif
1c79356b
A
1709
1710 /*
8ad349bb 1711 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
9bccf70c
A
1712 * until after ip6_savecontrol() is called and before other functions
1713 * which don't want those proto headers.
1714 * Because ip6_savecontrol() is going to parse the mbuf to
1715 * search for data to be passed up to user-land, it wants mbuf
1716 * parameters to be unchanged.
1c79356b 1717 */
9bccf70c 1718 drop_hdrlen = off0 + off;
316670eb
A
1719
1720 /* Since this is an entry point for input processing of tcp packets, we
1721 * can update the tcp clock here.
1722 */
1723 calculate_tcp_clock();
1724
1725 /*
39236c6e
A
1726 * Record the interface where this segment arrived on; this does not
1727 * affect normal data output (for non-detached TCP) as it provides a
1728 * hint about which route and interface to use for sending in the
1729 * absence of a PCB, when scoped routing (and thus source interface
1730 * selection) are enabled.
1731 */
1732 if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL)
1733 ifscope = IFSCOPE_NONE;
1734 else
1735 ifscope = m->m_pkthdr.rcvif->if_index;
316670eb
A
1736
1737 /*
1738 * Convert TCP protocol specific fields to host format.
1739 */
1740
1741#if BYTE_ORDER != BIG_ENDIAN
1742 NTOHL(th->th_seq);
1743 NTOHL(th->th_ack);
1744 NTOHS(th->th_win);
1745 NTOHS(th->th_urp);
1746#endif
1c79356b
A
1747
1748 /*
1749 * Locate pcb for segment.
1750 */
1751findpcb:
6d2010ae
A
1752
1753 isconnected = FALSE;
1754 isdisconnected = FALSE;
1755
1c79356b 1756#if IPFIREWALL_FORWARD
91447636 1757 if (next_hop != NULL
1c79356b 1758#if INET6
2d21ac55 1759 && isipv6 == 0 /* IPv6 support is not yet */
1c79356b
A
1760#endif /* INET6 */
1761 ) {
1762 /*
1763 * Diverted. Pretend to be the destination.
1764 * already got one like this?
1765 */
1766 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1767 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
1768 if (!inp) {
1769 /*
1770 * No, then it's new. Try find the ambushing socket
1771 */
91447636 1772 if (!next_hop->sin_port) {
1c79356b 1773 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
91447636 1774 th->th_sport, next_hop->sin_addr,
1c79356b
A
1775 th->th_dport, 1, m->m_pkthdr.rcvif);
1776 } else {
1777 inp = in_pcblookup_hash(&tcbinfo,
1778 ip->ip_src, th->th_sport,
91447636
A
1779 next_hop->sin_addr,
1780 ntohs(next_hop->sin_port), 1,
1c79356b
A
1781 m->m_pkthdr.rcvif);
1782 }
1783 }
1c79356b
A
1784 } else
1785#endif /* IPFIREWALL_FORWARD */
9bccf70c 1786 {
1c79356b
A
1787#if INET6
1788 if (isipv6)
1789 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
1790 &ip6->ip6_dst, th->th_dport, 1,
1791 m->m_pkthdr.rcvif);
1792 else
1793#endif /* INET6 */
1794 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1795 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
9bccf70c 1796 }
1c79356b 1797
c910b4d9
A
1798 /*
1799 * Use the interface scope information from the PCB for outbound
1800 * segments. If the PCB isn't present and if scoped routing is
1801 * enabled, tcp_respond will use the scope of the interface where
1802 * the segment arrived on.
1803 */
1804 if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
316670eb 1805 ifscope = inp->inp_boundifp->if_index;
fe8ab488
A
1806#if NECP
1807 if (inp != NULL && (
1c79356b 1808#if INET6
fe8ab488
A
1809 isipv6 ? !necp_socket_is_allowed_to_send_recv_v6(inp,
1810 th->th_dport, th->th_sport, &ip6->ip6_dst,
1811 &ip6->ip6_src, ifp, NULL) :
1812#endif
1813 !necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport,
1814 th->th_sport, &ip->ip_dst, &ip->ip_src,
1815 ifp, NULL))) {
1816 if (in_pcb_checkstate(inp, WNT_RELEASE, 0)
1817 == WNT_STOPUSING) {
1818 inp = NULL; /* pretend we didn't find it */
1819 }
1820 IF_TCP_STATINC(ifp, badformatipsec);
1821 goto dropnosock;
1c79356b 1822 }
fe8ab488 1823#endif /* NECP */
1c79356b
A
1824
1825 /*
1826 * If the state is CLOSED (i.e., TCB does not exist) then
1827 * all data in the incoming segment is discarded.
1828 * If the TCB exists but is in CLOSED state, it is embryonic,
1829 * but should either do a listen or a connect soon.
1830 */
1831 if (inp == NULL) {
9bccf70c 1832 if (log_in_vain) {
1c79356b 1833#if INET6
91447636 1834 char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
1c79356b 1835#else /* INET6 */
91447636 1836 char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
1c79356b
A
1837#endif /* INET6 */
1838
1839#if INET6
1840 if (isipv6) {
91447636
A
1841 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
1842 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
9bccf70c 1843 } else
1c79356b 1844#endif
91447636
A
1845 {
1846 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
1847 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
1848 }
9bccf70c
A
1849 switch (log_in_vain) {
1850 case 1:
1851 if(thflags & TH_SYN)
1852 log(LOG_INFO,
91447636
A
1853 "Connection attempt to TCP %s:%d from %s:%d\n",
1854 dbuf, ntohs(th->th_dport),
1855 sbuf,
1856 ntohs(th->th_sport));
9bccf70c
A
1857 break;
1858 case 2:
1859 log(LOG_INFO,
91447636
A
1860 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
1861 dbuf, ntohs(th->th_dport), sbuf,
1862 ntohs(th->th_sport), thflags);
1863 break;
1864 case 3:
39236c6e 1865 case 4:
316670eb 1866 if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
91447636
A
1867 !(m->m_flags & (M_BCAST | M_MCAST)) &&
1868#if INET6
1869 ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
1870 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
1871#else
1872 ip->ip_dst.s_addr != ip->ip_src.s_addr
1873#endif
1874 )
1875 log_in_vain_log((LOG_INFO,
1876 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
1877 dbuf, ntohs(th->th_dport),
1878 sbuf,
1879 ntohs(th->th_sport)));
9bccf70c
A
1880 break;
1881 default:
1882 break;
1c79356b 1883 }
1c79356b 1884 }
9bccf70c 1885 if (blackhole) {
91447636 1886 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
316670eb 1887
91447636
A
1888 switch (blackhole) {
1889 case 1:
1890 if (thflags & TH_SYN)
1891 goto dropnosock;
1892 break;
1893 case 2:
1894 goto dropnosock;
1895 default:
1896 goto dropnosock;
1897 }
9bccf70c
A
1898 }
1899 rstreason = BANDLIM_RST_CLOSEDPORT;
39236c6e 1900 IF_TCP_STATINC(ifp, noconnnolist);
91447636
A
1901 goto dropwithresetnosock;
1902 }
1903 so = inp->inp_socket;
1904 if (so == NULL) {
b0d623f7
A
1905 /* This case shouldn't happen as the socket shouldn't be null
1906 * if inp_state isn't set to INPCB_STATE_DEAD
1907 * But just in case, we pretend we didn't find the socket if we hit this case
1908 * as this isn't cause for a panic (the socket might be leaked however)...
1909 */
1910 inp = NULL;
91447636 1911#if TEMPDEBUG
b0d623f7 1912 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
91447636
A
1913#endif
1914 goto dropnosock;
1c79356b 1915 }
8ad349bb 1916
6d2010ae 1917 tcp_lock(so, 1, 0);
91447636 1918 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
b0d623f7 1919 tcp_unlock(so, 1, (void *)2);
91447636
A
1920 inp = NULL; // pretend we didn't find it
1921 goto dropnosock;
1922 }
1923
1c79356b 1924 tp = intotcpcb(inp);
9bccf70c
A
1925 if (tp == 0) {
1926 rstreason = BANDLIM_RST_CLOSEDPORT;
39236c6e 1927 IF_TCP_STATINC(ifp, noconnlist);
1c79356b 1928 goto dropwithreset;
9bccf70c 1929 }
1c79356b
A
1930 if (tp->t_state == TCPS_CLOSED)
1931 goto drop;
9bccf70c 1932
1c79356b
A
1933 /* Unscale the window into a 32-bit value. */
1934 if ((thflags & TH_SYN) == 0)
1935 tiwin = th->th_win << tp->snd_scale;
1936 else
1937 tiwin = th->th_win;
1938
2d21ac55
A
1939#if CONFIG_MACF_NET
1940 if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
1941 goto drop;
1942#endif
1943
39236c6e
A
1944 /* Avoid processing packets while closing a listen socket */
1945 if (tp->t_state == TCPS_LISTEN &&
1946 (so->so_options & SO_ACCEPTCONN) == 0)
b7266188
A
1947 goto drop;
1948
1c79356b
A
1949 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
1950#if TCPDEBUG
1951 if (so->so_options & SO_DEBUG) {
1952 ostate = tp->t_state;
1953#if INET6
1954 if (isipv6)
9bccf70c
A
1955 bcopy((char *)ip6, (char *)tcp_saveipgen,
1956 sizeof(*ip6));
1c79356b 1957 else
1c79356b 1958#endif /* INET6 */
9bccf70c 1959 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
1c79356b
A
1960 tcp_savetcp = *th;
1961 }
1962#endif
1963 if (so->so_options & SO_ACCEPTCONN) {
91447636 1964 register struct tcpcb *tp0 = tp;
1c79356b 1965 struct socket *so2;
1c79356b 1966 struct socket *oso;
91447636 1967 struct sockaddr_storage from;
1c79356b
A
1968#if INET6
1969 struct inpcb *oinp = sotoinpcb(so);
1970#endif /* INET6 */
316670eb 1971 struct ifnet *head_ifscope;
fe8ab488
A
1972 unsigned int head_nocell, head_recvanyif,
1973 head_noexpensive, head_awdl_unrestricted;
c910b4d9
A
1974
1975 /* Get listener's bound-to-interface, if any */
1976 head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
316670eb 1977 inp->inp_boundifp : NULL;
6d2010ae 1978 /* Get listener's no-cellular information, if any */
fe8ab488 1979 head_nocell = INP_NO_CELLULAR(inp);
316670eb
A
1980 /* Get listener's recv-any-interface, if any */
1981 head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
fe8ab488
A
1982 /* Get listener's no-expensive information, if any */
1983 head_noexpensive = INP_NO_EXPENSIVE(inp);
1984 head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
1c79356b 1985
9bccf70c 1986 /*
7e4a7d39
A
1987 * If the state is LISTEN then ignore segment if it contains an RST.
1988 * If the segment contains an ACK then it is bad and send a RST.
1989 * If it does not contain a SYN then it is not interesting; drop it.
1990 * If it is from this socket, drop it, it must be forged.
9bccf70c 1991 */
1c79356b 1992 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
39236c6e
A
1993 IF_TCP_STATINC(ifp, listbadsyn);
1994
7e4a7d39
A
1995 if (thflags & TH_RST) {
1996 goto drop;
1997 }
1c79356b 1998 if (thflags & TH_ACK) {
7e4a7d39 1999 tp = NULL;
1c79356b 2000 tcpstat.tcps_badsyn++;
9bccf70c 2001 rstreason = BANDLIM_RST_OPENPORT;
1c79356b
A
2002 goto dropwithreset;
2003 }
7e4a7d39
A
2004
2005 /* We come here if there is no SYN set */
2006 tcpstat.tcps_badsyn++;
1c79356b
A
2007 goto drop;
2008 }
1c79356b 2009 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
7e4a7d39
A
2010 if (th->th_dport == th->th_sport) {
2011#if INET6
2012 if (isipv6) {
2013 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
2014 &ip6->ip6_src))
2015 goto drop;
2016 } else
2017#endif /* INET6 */
2018 if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
2019 goto drop;
2020 }
2021 /*
2022 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
2023 * in_broadcast() should never return true on a received
2024 * packet with M_BCAST not set.
2025 *
2026 * Packets with a multicast source address should also
2027 * be discarded.
2028 */
2029 if (m->m_flags & (M_BCAST|M_MCAST))
2030 goto drop;
2031#if INET6
2032 if (isipv6) {
2033 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2034 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2035 goto drop;
2036 } else
2037#endif
2038 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2039 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2040 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2041 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2042 goto drop;
2043
1c79356b 2044
9bccf70c
A
2045#if INET6
2046 /*
2047 * If deprecated address is forbidden,
2048 * we do not accept SYN to deprecated interface
2049 * address to prevent any new inbound connection from
2050 * getting established.
2051 * When we do not accept SYN, we send a TCP RST,
2052 * with deprecated source address (instead of dropping
2053 * it). We compromise it as it is much better for peer
2054 * to send a RST, and RST will be the final packet
2055 * for the exchange.
2056 *
2057 * If we do not forbid deprecated addresses, we accept
39236c6e
A
2058 * the SYN packet. RFC 4862 forbids dropping SYN in
2059 * this case.
9bccf70c
A
2060 */
2061 if (isipv6 && !ip6_use_deprecated) {
39236c6e
A
2062 uint32_t ia6_flags;
2063
2064 if (ip6_getdstifaddr_info(m, NULL,
2065 &ia6_flags) == 0) {
2066 if (ia6_flags & IN6_IFF_DEPRECATED) {
6d2010ae
A
2067 tp = NULL;
2068 rstreason = BANDLIM_RST_OPENPORT;
39236c6e 2069 IF_TCP_STATINC(ifp, deprecate6);
6d2010ae
A
2070 goto dropwithreset;
2071 }
9bccf70c
A
2072 }
2073 }
2074#endif
91447636 2075 if (so->so_filt) {
2d21ac55 2076#if INET6
91447636
A
2077 if (isipv6) {
2078 struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from;
2079
2080 sin6->sin6_len = sizeof(*sin6);
2081 sin6->sin6_family = AF_INET6;
2082 sin6->sin6_port = th->th_sport;
2083 sin6->sin6_flowinfo = 0;
2084 sin6->sin6_addr = ip6->ip6_src;
2085 sin6->sin6_scope_id = 0;
2d21ac55
A
2086 }
2087 else
2088#endif
2089 {
91447636
A
2090 struct sockaddr_in *sin = (struct sockaddr_in*)&from;
2091
2092 sin->sin_len = sizeof(*sin);
2093 sin->sin_family = AF_INET;
2094 sin->sin_port = th->th_sport;
2095 sin->sin_addr = ip->ip_src;
2096 }
2097 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2098 } else {
2099 so2 = sonewconn(so, 0, NULL);
2100 }
1c79356b
A
2101 if (so2 == 0) {
2102 tcpstat.tcps_listendrop++;
2d21ac55 2103 if (tcp_dropdropablreq(so)) {
91447636
A
2104 if (so->so_filt)
2105 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2106 else
2107 so2 = sonewconn(so, 0, NULL);
1c79356b 2108 }
91447636 2109 if (!so2)
1c79356b
A
2110 goto drop;
2111 }
b0d623f7
A
2112
2113 /* Point "inp" and "tp" in tandem to new socket */
2114 inp = (struct inpcb *)so2->so_pcb;
2115 tp = intotcpcb(inp);
91447636 2116
1c79356b 2117 oso = so;
91447636
A
2118 tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
2119
1c79356b 2120 so = so2;
91447636 2121 tcp_lock(so, 1, 0);
1c79356b 2122 /*
1c79356b
A
2123 * Mark socket as temporary until we're
2124 * committed to keeping it. The code at
2125 * ``drop'' and ``dropwithreset'' check the
2126 * flag dropsocket to see if the temporary
2127 * socket created here should be discarded.
2128 * We mark the socket as discardable until
2129 * we're committed to it below in TCPS_LISTEN.
7e4a7d39
A
2130 * There are some error conditions in which we
2131 * have to drop the temporary socket.
1c79356b
A
2132 */
2133 dropsocket++;
c910b4d9
A
2134 /*
2135 * Inherit INP_BOUND_IF from listener; testing if
316670eb 2136 * head_ifscope is non-NULL is sufficient, since it
c910b4d9
A
2137 * can only be set to a non-zero value earlier if
2138 * the listener has such a flag set.
2139 */
316670eb 2140 if (head_ifscope != NULL) {
c910b4d9 2141 inp->inp_flags |= INP_BOUND_IF;
316670eb
A
2142 inp->inp_boundifp = head_ifscope;
2143 } else {
2144 inp->inp_flags &= ~INP_BOUND_IF;
c910b4d9 2145 }
6d2010ae 2146 /*
fe8ab488 2147 * Inherit restrictions from listener.
6d2010ae 2148 */
fe8ab488
A
2149 if (head_nocell)
2150 inp_set_nocellular(inp);
2151 if (head_noexpensive)
2152 inp_set_noexpensive(inp);
2153 if (head_awdl_unrestricted)
2154 inp_set_awdl_unrestricted(inp);
316670eb
A
2155 /*
2156 * Inherit {IN,IN6}_RECV_ANYIF from listener.
2157 */
2158 if (head_recvanyif)
2159 inp->inp_flags |= INP_RECV_ANYIF;
2160 else
2161 inp->inp_flags &= ~INP_RECV_ANYIF;
1c79356b
A
2162#if INET6
2163 if (isipv6)
2164 inp->in6p_laddr = ip6->ip6_dst;
2165 else {
9bccf70c
A
2166 inp->inp_vflag &= ~INP_IPV6;
2167 inp->inp_vflag |= INP_IPV4;
1c79356b 2168#endif /* INET6 */
7e4a7d39 2169 inp->inp_laddr = ip->ip_dst;
1c79356b
A
2170#if INET6
2171 }
2172#endif /* INET6 */
1c79356b 2173 inp->inp_lport = th->th_dport;
91447636 2174 if (in_pcbinshash(inp, 0) != 0) {
1c79356b 2175 /*
9bccf70c
A
2176 * Undo the assignments above if we failed to
2177 * put the PCB on the hash lists.
1c79356b
A
2178 */
2179#if INET6
2180 if (isipv6)
2181 inp->in6p_laddr = in6addr_any;
2182 else
2183#endif /* INET6 */
316670eb 2184 inp->inp_laddr.s_addr = INADDR_ANY;
1c79356b 2185 inp->inp_lport = 0;
91447636
A
2186 tcp_lock(oso, 0, 0); /* release ref on parent */
2187 tcp_unlock(oso, 1, 0);
1c79356b
A
2188 goto drop;
2189 }
1c79356b
A
2190#if INET6
2191 if (isipv6) {
9bccf70c
A
2192 /*
2193 * Inherit socket options from the listening
2194 * socket.
2195 * Note that in6p_inputopts are not (even
2196 * should not be) copied, since it stores
1c79356b 2197 * previously received options and is used to
9bccf70c
A
2198 * detect if each new option is different than
2199 * the previous one and hence should be passed
2200 * to a user.
2201 * If we copied in6p_inputopts, a user would
2202 * not be able to receive options just after
2203 * calling the accept system call.
2204 */
1c79356b
A
2205 inp->inp_flags |=
2206 oinp->inp_flags & INP_CONTROLOPTS;
9bccf70c
A
2207 if (oinp->in6p_outputopts)
2208 inp->in6p_outputopts =
2209 ip6_copypktopts(oinp->in6p_outputopts,
2210 M_NOWAIT);
1c79356b
A
2211 } else
2212#endif /* INET6 */
316670eb 2213 inp->inp_options = ip_srcroute();
91447636 2214 tcp_lock(oso, 0, 0);
1c79356b
A
2215#if IPSEC
2216 /* copy old policy into new socket's */
9bccf70c
A
2217 if (sotoinpcb(oso)->inp_sp)
2218 {
2219 int error = 0;
2220 /* Is it a security hole here to silently fail to copy the policy? */
2221 if (inp->inp_sp != NULL)
2222 error = ipsec_init_policy(so, &inp->inp_sp);
2223 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
2224 printf("tcp_input: could not copy policy\n");
2225 }
1c79356b 2226#endif
b0d623f7 2227 /* inherit states from the listener */
6d2010ae
A
2228 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2229 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
1c79356b 2230 tp->t_state = TCPS_LISTEN;
9bccf70c 2231 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
fe8ab488 2232 tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP|TF_NOTIMEWAIT));
b0d623f7 2233 tp->t_keepinit = tp0->t_keepinit;
39236c6e
A
2234 tp->t_keepcnt = tp0->t_keepcnt;
2235 tp->t_keepintvl = tp0->t_keepintvl;
2236 tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2237 tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
91447636 2238 tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
316670eb
A
2239 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0)
2240 tp->t_notsent_lowat = tp0->t_notsent_lowat;
b0d623f7
A
2241
2242 /* now drop the reference on the listener */
2243 tcp_unlock(oso, 1, 0);
2244
316670eb 2245 tcp_set_max_rwinscale(tp, so);
1c79356b
A
2246
2247 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
2248 }
2249 }
fe8ab488
A
2250 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2251 LCK_MTX_ASSERT_OWNED);
6d2010ae 2252
743b1565 2253 if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
39236c6e
A
2254 /*
2255 * Evaluate the rate of arrival of packets to see if the
6d2010ae
A
2256 * receiver can reduce the ack traffic. The algorithm to
2257 * stretch acks will be enabled if the connection meets
2258 * certain criteria defined in tcp_stretch_ack_enable function.
2259 */
2260 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
316670eb 2261 TCP_INC_VAR(tp->rcv_waitforss, nlropkts);
6d2010ae
A
2262 }
2263 if (tcp_stretch_ack_enable(tp)) {
2264 tp->t_flags |= TF_STRETCHACK;
2265 tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2266 tp->rcv_waitforss = 0;
2267 } else {
2268 tp->t_flags &= ~(TF_STRETCHACK);
2269 }
2270 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
2271 tp->rcv_by_unackwin += (tlen + off);
2272 } else {
2273 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2274 tp->rcv_by_unackwin = tlen + off;
2275 }
91447636 2276 }
316670eb
A
2277
2278 /*
2279 * Keep track of how many bytes were received in the LRO packet
2280 */
39236c6e 2281 if ((pktf_sw_lro_pkt) && (nlropkts > 2)) {
316670eb
A
2282 tp->t_lropktlen += tlen;
2283 }
2d21ac55 2284 /*
39236c6e
A
2285 * Explicit Congestion Notification - Flag that we need to send ECT if
2286 * + The IP Congestion experienced flag was set.
2287 * + Socket is in established state
2288 * + We negotiated ECN in the TCP setup
2289 * + This isn't a pure ack (tlen > 0)
2290 * + The data is in the valid window
2291 *
2292 * TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2d21ac55
A
2293 */
2294 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
316670eb 2295 ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)) && tlen > 0 &&
2d21ac55
A
2296 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2297 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2298 tp->ecn_flags |= TE_SENDECE;
2299 }
2300
2301 /*
39236c6e
A
2302 * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2303 * bother doing extensive checks for state and whatnot.
2d21ac55
A
2304 */
2305 if ((thflags & TH_CWR) == TH_CWR) {
2306 tp->ecn_flags &= ~TE_SENDECE;
2307 }
6d2010ae 2308
39236c6e
A
2309 /*
2310 * If we received an explicit notification of congestion in
6d2010ae 2311 * ip tos ecn bits or by the CWR bit in TCP header flags, reset
fe8ab488
A
2312 * the ack-strteching state. We need to handle ECN notification if
2313 * an ECN setup SYN was sent even once.
6d2010ae 2314 */
fe8ab488
A
2315 if (tp->t_state == TCPS_ESTABLISHED
2316 && (tp->ecn_flags & TE_SETUPSENT)
2317 && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR)))
6d2010ae 2318 tcp_reset_stretch_ack(tp);
316670eb
A
2319
2320 /*
2321 * Try to determine if we are receiving a packet after a long time.
2322 * Use our own approximation of idletime to roughly measure remote
2323 * end's idle time. Since slowstart is used after an idle period
2324 * we want to avoid doing LRO if the remote end is not up to date
2325 * on initial window support and starts with 1 or 2 packets as its IW.
2326 */
2327 if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) &&
2328 ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) {
2329 turnoff_lro = 1;
2330 }
2331
39236c6e
A
2332 /* Update rcvtime as a new segment was received on the connection */
2333 tp->t_rcvtime = tcp_now;
2334
1c79356b
A
2335 /*
2336 * Segment received on connection.
2337 * Reset idle time and keep-alive timer.
2338 */
1c79356b 2339 if (TCPS_HAVEESTABLISHED(tp->t_state))
39236c6e 2340 tcp_keepalive_reset(tp);
1c79356b
A
2341
2342 /*
2343 * Process options if not in LISTEN state,
2344 * else do it below (after getting remote address).
2345 */
39236c6e 2346 if (tp->t_state != TCPS_LISTEN && optp) {
c910b4d9 2347 tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
39236c6e
A
2348#if MPTCP
2349 mptcp_csum = mptcp_input_csum(tp, m, drop_hdrlen);
2350 if (mptcp_csum) {
2351 tp->t_mpflags |= TMPF_SND_MPFAIL;
2352 tp->t_mpflags &= ~TMPF_EMBED_DSN;
2353 mptcp_notify_mpfail(so);
2354 m_freem(m);
2355 tcpstat.tcps_mp_badcsum++;
2356 tcp_check_timer_state(tp);
2357 tcp_unlock(so, 1, 0);
2358 KERNEL_DEBUG(DBG_FNC_TCP_INPUT |
2359 DBG_FUNC_END,0,0,0,0,0);
2360 return;
8ad349bb 2361 }
39236c6e
A
2362 mptcp_insert_rmap(tp, m);
2363#endif /* MPTCP */
2364 }
2365 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
8ad349bb
A
2366 if (to.to_flags & TOF_TS) {
2367 tp->t_flags |= TF_RCVD_TSTMP;
2368 tp->ts_recent = to.to_tsval;
2369 tp->ts_recent_age = tcp_now;
2370 }
2371 if (to.to_flags & TOF_MSS)
c910b4d9 2372 tcp_mss(tp, to.to_mss, ifscope);
39236c6e 2373 if (SACK_ENABLED(tp)) {
8ad349bb 2374 if (!(to.to_flags & TOF_SACK))
39236c6e 2375 tp->t_flagsext &= ~(TF_SACK_ENABLE);
8ad349bb
A
2376 else
2377 tp->t_flags |= TF_SACK_PERMIT;
2378 }
2379 }
2380
6d2010ae
A
2381#if TRAFFIC_MGT
2382 /* Compute inter-packet arrival jitter. According to RFC 3550, inter-packet
2383 * arrival jitter is defined as the difference in packet spacing at the
2384 * receiver compared to the sender for a pair of packets. When two packets
2385 * of maximum segment size come one after the other with consecutive
2386 * sequence numbers, we consider them as packets sent together at the
2387 * sender and use them as a pair to compute inter-packet arrival jitter.
2388 * This metric indicates the delay induced by the network components due
2389 * to queuing in edge/access routers.
2390 */
2391 if (tp->t_state == TCPS_ESTABLISHED &&
2392 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
2393 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2394 ((to.to_flags & TOF_TS) == 0 ||
2395 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2396 th->th_seq == tp->rcv_nxt &&
2397 LIST_EMPTY(&tp->t_segq)) {
316670eb 2398 int seg_size = tlen;
6d2010ae 2399 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
316670eb 2400 TCP_INC_VAR(tp->iaj_pktcnt, nlropkts);
6d2010ae
A
2401 }
2402
39236c6e 2403 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
316670eb
A
2404 seg_size = m->m_pkthdr.lro_pktlen;
2405 }
2406 if ( tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2407 (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
6d2010ae
A
2408 /* State related to inter-arrival jitter is uninitialized
2409 * or we are trying to find a good first packet to start
2410 * computing the metric
2411 */
316670eb 2412 update_iaj_state(tp, seg_size, 0);
6d2010ae 2413 } else {
316670eb 2414 if (seg_size == tp->iaj_size) {
6d2010ae
A
2415 /* Compute inter-arrival jitter taking this packet
2416 * as the second packet
2417 */
39236c6e
A
2418 if (pktf_sw_lro_pkt)
2419 compute_iaj(tp, nlropkts,
2420 m->m_pkthdr.lro_elapsed);
2421 else
2422 compute_iaj(tp, 1, 0);
6d2010ae 2423 }
316670eb 2424 if (seg_size < tp->iaj_size) {
6d2010ae
A
2425 /* There is a smaller packet in the stream.
2426 * Some times the maximum size supported on a path can
2427 * change if there is a new link with smaller MTU.
2428 * The receiver will not know about this change.
2429 * If there are too many packets smaller than iaj_size,
2430 * we try to learn the iaj_size again.
2431 */
39236c6e 2432 TCP_INC_VAR(tp->iaj_small_pkt, nlropkts);
6d2010ae 2433 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
316670eb 2434 update_iaj_state(tp, seg_size, 1);
6d2010ae 2435 } else {
316670eb 2436 CLEAR_IAJ_STATE(tp);
6d2010ae
A
2437 }
2438 } else {
316670eb 2439 update_iaj_state(tp, seg_size, 0);
6d2010ae
A
2440 }
2441 }
2442 } else {
316670eb 2443 CLEAR_IAJ_STATE(tp);
6d2010ae
A
2444 }
2445#endif /* TRAFFIC_MGT */
2446
1c79356b
A
2447 /*
2448 * Header prediction: check for the two common cases
2449 * of a uni-directional data xfer. If the packet has
2450 * no control flags, is in-sequence, the window didn't
2451 * change and we're not retransmitting, it's a
2452 * candidate. If the length is zero and the ack moved
2453 * forward, we're the sender side of the xfer. Just
2454 * free the data acked & wake any higher level process
2455 * that was blocked waiting for space. If the length
2456 * is non-zero and the ack didn't move, we're the
2457 * receiver side. If we're getting packets in-order
2458 * (the reassembly queue is empty), add the data to
2459 * the socket buffer and note that we need a delayed ack.
2460 * Make sure that the hidden state-flags are also off.
2461 * Since we check for TCPS_ESTABLISHED above, it can only
2462 * be TH_NEEDSYN.
2463 */
2464 if (tp->t_state == TCPS_ESTABLISHED &&
2d21ac55 2465 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK &&
1c79356b 2466 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
8ad349bb 2467 ((to.to_flags & TOF_TS) == 0 ||
1c79356b 2468 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1c79356b
A
2469 th->th_seq == tp->rcv_nxt &&
2470 tiwin && tiwin == tp->snd_wnd &&
2471 tp->snd_nxt == tp->snd_max) {
2472
2473 /*
2474 * If last ACK falls within this segment's sequence numbers,
2475 * record the timestamp.
2476 * NOTE that the test is modified according to the latest
2477 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2478 */
8ad349bb 2479 if ((to.to_flags & TOF_TS) != 0 &&
1c79356b
A
2480 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2481 tp->ts_recent_age = tcp_now;
2482 tp->ts_recent = to.to_tsval;
2483 }
2484
c910b4d9
A
2485 /* Force acknowledgment if we received a FIN */
2486
2487 if (thflags & TH_FIN)
2488 tp->t_flags |= TF_ACKNOW;
2489
9bccf70c 2490 if (tlen == 0) {
1c79356b
A
2491 if (SEQ_GT(th->th_ack, tp->snd_una) &&
2492 SEQ_LEQ(th->th_ack, tp->snd_max) &&
2d21ac55 2493 tp->snd_cwnd >= tp->snd_ssthresh &&
6d2010ae 2494 (!IN_FASTRECOVERY(tp) &&
fe8ab488
A
2495 ((!(SACK_ENABLED(tp)) &&
2496 tp->t_dupacks < tp->t_rexmtthresh) ||
2497 (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
2498 TAILQ_EMPTY(&tp->snd_holes))))) {
1c79356b
A
2499 /*
2500 * this is a pure ack for outstanding data.
2501 */
2502 ++tcpstat.tcps_predack;
39236c6e 2503
fe8ab488 2504 tcp_bad_rexmt_check(tp, th, &to),
39236c6e
A
2505
2506 /* Recalculate the RTT */
2507 tcp_compute_rtt(tp, &to, th);
2508
2509 acked = BYTES_ACKED(th, tp);
1c79356b
A
2510 tcpstat.tcps_rcvackpack++;
2511 tcpstat.tcps_rcvackbyte += acked;
6d2010ae
A
2512
2513 /* Handle an ack that is in sequence during congestion
2514 * avoidance phase. The calculations in this function
2515 * assume that snd_una is not updated yet.
2d21ac55 2516 */
fe8ab488
A
2517 if (CC_ALGO(tp)->congestion_avd != NULL)
2518 CC_ALGO(tp)->congestion_avd(tp, th);
2519 tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
1c79356b 2520 sbdrop(&so->so_snd, acked);
39236c6e
A
2521 if (so->so_flags & SOF_ENABLE_MSGS) {
2522 VERIFY(acked <= so->so_msg_state->msg_serial_bytes);
2523 so->so_msg_state->msg_serial_bytes -= acked;
2524 }
316670eb
A
2525 tcp_sbsnd_trim(&so->so_snd);
2526
8ad349bb
A
2527 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2528 SEQ_LEQ(th->th_ack, tp->snd_recover))
2529 tp->snd_recover = th->th_ack - 1;
1c79356b 2530 tp->snd_una = th->th_ack;
316670eb 2531
8ad349bb
A
2532 /*
2533 * pull snd_wl2 up to prevent seq wrap relative
2534 * to th_ack.
2535 */
2536 tp->snd_wl2 = th->th_ack;
39236c6e
A
2537
2538 if (tp->t_dupacks > 0) {
2539 tp->t_dupacks = 0;
2540 tp->t_rexmtthresh = tcprexmtthresh;
2541 }
2542
1c79356b 2543 m_freem(m);
1c79356b
A
2544
2545 /*
2546 * If all outstanding data are acked, stop
2547 * retransmit timer, otherwise restart timer
2548 * using current (possibly backed-off) value.
2549 * If process is waiting for space,
2550 * wakeup/selwakeup/signal. If data
2551 * are ready to send, let tcp_output
2552 * decide between more output or persist.
2553 */
fe8ab488 2554 if (tp->snd_una == tp->snd_max) {
1c79356b 2555 tp->t_timer[TCPT_REXMT] = 0;
fe8ab488
A
2556 tp->t_timer[TCPT_PTO] = 0;
2557 } else if (tp->t_timer[TCPT_PERSIST] == 0) {
2558 tp->t_timer[TCPT_REXMT] =
2559 OFFSET_FROM_START(tp,
2560 tp->t_rxtcur);
2561 }
1c79356b 2562
316670eb
A
2563 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2564 tp->t_bwmeas != NULL)
2565 tcp_bwmeas_check(tp);
91447636 2566 sowwakeup(so); /* has to be done with socket lock held */
2d21ac55 2567 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
1c79356b 2568 (void) tcp_output(tp);
2d21ac55 2569 }
6d2010ae
A
2570
2571 tcp_check_timer_state(tp);
91447636 2572 tcp_unlock(so, 1, 0);
1c79356b
A
2573 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2574 return;
2575 }
2576 } else if (th->th_ack == tp->snd_una &&
9bccf70c 2577 LIST_EMPTY(&tp->t_segq) &&
2d21ac55 2578 tlen <= tcp_sbspace(tp)) {
1c79356b
A
2579 /*
2580 * this is a pure, in-sequence data packet
2581 * with nothing on the reassembly queue and
2582 * we have enough buffer space to take it.
2583 */
316670eb
A
2584
2585 /*
2586 * If this is a connection in steady state, start
2587 * coalescing packets belonging to this flow.
2588 */
2589 if (turnoff_lro) {
2590 tcp_lro_remove_state(tp->t_inpcb->inp_laddr,
39236c6e
A
2591 tp->t_inpcb->inp_faddr,
2592 tp->t_inpcb->inp_lport,
2593 tp->t_inpcb->inp_fport);
316670eb
A
2594 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
2595 tp->t_idleat = tp->rcv_nxt;
39236c6e
A
2596 } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 &&
2597 (so->so_flags & SOF_USELRO) &&
2598 !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) &&
316670eb
A
2599 (m->m_pkthdr.rcvif->if_type != IFT_LOOP) &&
2600 ((th->th_seq - tp->irs) >
39236c6e 2601 (tp->t_maxseg << lro_start)) &&
316670eb
A
2602 ((tp->t_idleat == 0) || ((th->th_seq -
2603 tp->t_idleat) > (tp->t_maxseg << lro_start)))) {
2604 tp->t_flagsext |= TF_LRO_OFFLOADED;
2605 tcp_start_coalescing(ip, th, tlen);
2606 tp->t_idleat = 0;
2607 }
2608
8ad349bb 2609 /* Clean receiver SACK report if present */
39236c6e 2610 if (SACK_ENABLED(tp) && tp->rcv_numsacks)
8ad349bb 2611 tcp_clean_sackreport(tp);
1c79356b 2612 ++tcpstat.tcps_preddat;
9bccf70c 2613 tp->rcv_nxt += tlen;
8ad349bb
A
2614 /*
2615 * Pull snd_wl1 up to prevent seq wrap relative to
2616 * th_seq.
2617 */
2618 tp->snd_wl1 = th->th_seq;
2619 /*
2620 * Pull rcv_up up to prevent seq wrap relative to
2621 * rcv_nxt.
2622 */
2623 tp->rcv_up = tp->rcv_nxt;
316670eb 2624 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
9bccf70c 2625 tcpstat.tcps_rcvbyte += tlen;
6d2010ae 2626 if (nstat_collect) {
39236c6e 2627 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
fe8ab488
A
2628 INP_ADD_STAT(inp, cell, wifi, wired,
2629 rxpackets, m->m_pkthdr.lro_npkts);
39236c6e 2630 } else {
fe8ab488
A
2631 INP_ADD_STAT(inp, cell, wifi, wired,
2632 rxpackets, 1);
316670eb 2633 }
fe8ab488
A
2634 INP_ADD_STAT(inp, cell, wifi, wired,rxbytes,
2635 tlen);
6d2010ae 2636 }
39236c6e
A
2637
2638 /*
2639 * Calculate the RTT on the receiver only if the
2640 * connection is in streaming mode and the last
2641 * packet was not an end-of-write
2642 */
2643 if ((tp->t_flags & TF_STRETCHACK) &&
2644 !(tp->t_flagsext & TF_STREAMEOW))
2645 tcp_compute_rtt(tp, &to, th);
316670eb
A
2646
2647 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
2648
9bccf70c
A
2649 /*
2650 * Add data to socket buffer.
2651 */
6d2010ae 2652 so_recv_data_stat(so, m, 0);
9bccf70c 2653 m_adj(m, drop_hdrlen); /* delayed header drop */
39236c6e
A
2654
2655 /*
2656 * If message delivery (SOF_ENABLE_MSGS) is enabled on
2657 * this socket, deliver the packet received as an
2658 * in-order message with sequence number attached to it.
2659 */
2660 if (sbappendstream_rcvdemux(so, m,
2661 th->th_seq - (tp->irs + 1), 0)) {
91447636 2662 sorwakeup(so);
39236c6e 2663 }
9bccf70c
A
2664#if INET6
2665 if (isipv6) {
2666 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2667 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2668 th->th_seq, th->th_ack, th->th_win);
2669 }
2670 else
2671#endif
2672 {
2673 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2674 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2675 th->th_seq, th->th_ack, th->th_win);
2676 }
316670eb 2677 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
6d2010ae
A
2678 if (DELAY_ACK(tp, th)) {
2679 if ((tp->t_flags & TF_DELACK) == 0) {
2680 tp->t_flags |= TF_DELACK;
2681 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2682 }
1c79356b
A
2683 } else {
2684 tp->t_flags |= TF_ACKNOW;
2685 tcp_output(tp);
2686 }
39236c6e
A
2687
2688 tcp_adaptive_rwtimo_check(tp, tlen);
2689
6d2010ae 2690 tcp_check_timer_state(tp);
91447636 2691 tcp_unlock(so, 1, 0);
1c79356b
A
2692 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2693 return;
2694 }
2695 }
2696
2697 /*
2698 * Calculate amount of space in receive window,
2699 * and then do TCP input processing.
2700 * Receive window is amount of space in rcv queue,
2701 * but not less than advertised window.
2702 */
fe8ab488
A
2703 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2704 LCK_MTX_ASSERT_OWNED);
2d21ac55 2705 win = tcp_sbspace(tp);
1c79356b
A
2706 if (win < 0)
2707 win = 0;
d12e1678
A
2708 else { /* clip rcv window to 4K for modems */
2709 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2710 win = min(win, slowlink_wsize);
2711 }
1c79356b 2712 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
39236c6e
A
2713#if MPTCP
2714 /*
2715 * Ensure that the subflow receive window isn't greater
2716 * than the connection level receive window.
2717 */
2718 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
2719 (mp_tp = tptomptp(tp))) {
2720 MPT_LOCK(mp_tp);
2721 if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
2722 tp->rcv_wnd = mp_tp->mpt_rcvwnd;
2723 tcpstat.tcps_mp_reducedwin++;
2724 }
2725 MPT_UNLOCK(mp_tp);
1c79356b 2726 }
39236c6e 2727#endif /* MPTCP */
1c79356b
A
2728
2729 switch (tp->t_state) {
2730
2731 /*
7e4a7d39 2732 * Initialize tp->rcv_nxt, and tp->irs, select an initial
1c79356b
A
2733 * tp->iss, and send a segment:
2734 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2735 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
2736 * Fill in remote peer address fields if not previously specified.
2737 * Enter SYN_RECEIVED state, and process any other fields of this
2738 * segment in this state.
2739 */
2740 case TCPS_LISTEN: {
2741 register struct sockaddr_in *sin;
9bccf70c 2742#if INET6
1c79356b
A
2743 register struct sockaddr_in6 *sin6;
2744#endif
2745
fe8ab488
A
2746 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2747 LCK_MTX_ASSERT_OWNED);
9bccf70c
A
2748#if INET6
2749 if (isipv6) {
1c79356b
A
2750 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
2751 M_SONAME, M_NOWAIT);
2752 if (sin6 == NULL)
2753 goto drop;
2754 bzero(sin6, sizeof(*sin6));
2755 sin6->sin6_family = AF_INET6;
2756 sin6->sin6_len = sizeof(*sin6);
2757 sin6->sin6_addr = ip6->ip6_src;
2758 sin6->sin6_port = th->th_sport;
2759 laddr6 = inp->in6p_laddr;
2760 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
2761 inp->in6p_laddr = ip6->ip6_dst;
2762 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
9bccf70c 2763 proc0)) {
1c79356b
A
2764 inp->in6p_laddr = laddr6;
2765 FREE(sin6, M_SONAME);
2766 goto drop;
2767 }
2768 FREE(sin6, M_SONAME);
9bccf70c 2769 } else
1c79356b 2770#endif
9bccf70c 2771 {
6d2010ae 2772 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1c79356b 2773 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
9bccf70c 2774 M_NOWAIT);
1c79356b
A
2775 if (sin == NULL)
2776 goto drop;
2777 sin->sin_family = AF_INET;
2778 sin->sin_len = sizeof(*sin);
2779 sin->sin_addr = ip->ip_src;
2780 sin->sin_port = th->th_sport;
2781 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
2782 laddr = inp->inp_laddr;
2783 if (inp->inp_laddr.s_addr == INADDR_ANY)
2784 inp->inp_laddr = ip->ip_dst;
39236c6e
A
2785 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0,
2786 IFSCOPE_NONE, NULL)) {
1c79356b
A
2787 inp->inp_laddr = laddr;
2788 FREE(sin, M_SONAME);
2789 goto drop;
2790 }
2791 FREE(sin, M_SONAME);
1c79356b 2792 }
8ad349bb 2793
c910b4d9 2794 tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
8ad349bb 2795
39236c6e 2796 if (SACK_ENABLED(tp)) {
8ad349bb 2797 if (!(to.to_flags & TOF_SACK))
39236c6e 2798 tp->t_flagsext &= ~(TF_SACK_ENABLE);
8ad349bb
A
2799 else
2800 tp->t_flags |= TF_SACK_PERMIT;
2801 }
2802
1c79356b
A
2803 if (iss)
2804 tp->iss = iss;
0b4e3aa0 2805 else {
9bccf70c
A
2806 tp->iss = tcp_new_isn(tp);
2807 }
1c79356b
A
2808 tp->irs = th->th_seq;
2809 tcp_sendseqinit(tp);
2810 tcp_rcvseqinit(tp);
9bccf70c 2811 tp->snd_recover = tp->snd_una;
1c79356b
A
2812 /*
2813 * Initialization of the tcpcb for transaction;
2814 * set SND.WND = SEG.WND,
2815 * initialize CCsend and CCrecv.
2816 */
2817 tp->snd_wnd = tiwin; /* initial send-window */
1c79356b 2818 tp->t_flags |= TF_ACKNOW;
2d21ac55 2819 tp->t_unacksegs = 0;
6d2010ae
A
2820 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2821 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
1c79356b 2822 tp->t_state = TCPS_SYN_RECEIVED;
6d2010ae 2823 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
39236c6e 2824 TCP_CONN_KEEPINIT(tp));
1c79356b 2825 dropsocket = 0; /* committed to socket */
6d2010ae 2826
316670eb
A
2827 if (inp->inp_flowhash == 0)
2828 inp->inp_flowhash = inp_calc_flowhash(inp);
39236c6e
A
2829#if INET6
2830 /* update flowinfo - RFC 6437 */
2831 if (inp->inp_flow == 0 &&
2832 inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
2833 inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
2834 inp->inp_flow |=
2835 (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
2836 }
2837#endif /* INET6 */
316670eb 2838
6d2010ae
A
2839 /* reset the incomp processing flag */
2840 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
1c79356b 2841 tcpstat.tcps_accepts++;
2d21ac55
A
2842 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
2843 /* ECN-setup SYN */
2844 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
2845 }
316670eb 2846
b0d623f7
A
2847#if CONFIG_IFEF_NOWINDOWSCALE
2848 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
2849 (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
2850 /* Window scaling is not enabled on this interface */
2851 tp->t_flags &= ~TF_REQ_SCALE;
593a1d5f
A
2852 }
2853#endif
1c79356b
A
2854 goto trimthenstep6;
2855 }
2856
2857 /*
2858 * If the state is SYN_RECEIVED:
2859 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
2860 */
2861 case TCPS_SYN_RECEIVED:
2862 if ((thflags & TH_ACK) &&
2863 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
9bccf70c
A
2864 SEQ_GT(th->th_ack, tp->snd_max))) {
2865 rstreason = BANDLIM_RST_OPENPORT;
39236c6e 2866 IF_TCP_STATINC(ifp, ooopacket);
1c79356b 2867 goto dropwithreset;
9bccf70c 2868 }
39236c6e
A
2869
2870 /*
2871 * In SYN_RECEIVED state, if we recv some SYNS with
2872 * window scale and others without, window scaling should
2873 * be disabled. Otherwise the window advertised will be
2874 * lower if we assume scaling and the other end does not.
2875 */
2876 if ((thflags & TH_SYN) &&
2877 !(to.to_flags & TOF_SCALE))
2878 tp->t_flags &= ~TF_RCVD_SCALE;
1c79356b
A
2879 break;
2880
2881 /*
2882 * If the state is SYN_SENT:
2883 * if seg contains an ACK, but not for our SYN, drop the input.
2884 * if seg contains a RST, then drop the connection.
2885 * if seg does not contain SYN, then drop it.
2886 * Otherwise this is an acceptable SYN segment
2887 * initialize tp->rcv_nxt and tp->irs
2888 * if seg contains ack then advance tp->snd_una
2889 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2890 * arrange for segment to be acked (eventually)
2891 * continue processing rest of data/controls, beginning with URG
2892 */
2893 case TCPS_SYN_SENT:
1c79356b
A
2894 if ((thflags & TH_ACK) &&
2895 (SEQ_LEQ(th->th_ack, tp->iss) ||
2896 SEQ_GT(th->th_ack, tp->snd_max))) {
8ad349bb 2897 rstreason = BANDLIM_UNLIMITED;
39236c6e 2898 IF_TCP_STATINC(ifp, ooopacket);
8ad349bb 2899 goto dropwithreset;
1c79356b
A
2900 }
2901 if (thflags & TH_RST) {
2d21ac55 2902 if ((thflags & TH_ACK) != 0) {
fe8ab488
A
2903#if MPTCP
2904 if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
2905 SEQ_GT(th->th_ack, tp->iss+1)) {
2906 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
2907 /* ignore the RST and retransmit SYN */
2908 goto drop;
2909 }
2910#endif /* MPTCP */
316670eb
A
2911 soevent(so,
2912 (SO_FILT_HINT_LOCKED |
2913 SO_FILT_HINT_CONNRESET));
1c79356b
A
2914 tp = tcp_drop(tp, ECONNREFUSED);
2915 postevent(so, 0, EV_RESET);
2d21ac55 2916 }
1c79356b
A
2917 goto drop;
2918 }
2919 if ((thflags & TH_SYN) == 0)
2920 goto drop;
2921 tp->snd_wnd = th->th_win; /* initial send window */
1c79356b
A
2922
2923 tp->irs = th->th_seq;
2924 tcp_rcvseqinit(tp);
2925 if (thflags & TH_ACK) {
1c79356b 2926 tcpstat.tcps_connects++;
2d21ac55
A
2927
2928 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
2929 /* ECN-setup SYN-ACK */
2930 tp->ecn_flags |= TE_SETUPRECEIVED;
fe8ab488 2931 tcpstat.tcps_ecn_setup++;
2d21ac55
A
2932 }
2933 else {
2934 /* non-ECN-setup SYN-ACK */
2935 tp->ecn_flags &= ~TE_SENDIPECT;
2936 }
2937
2d21ac55
A
2938#if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
2939 /* XXXMAC: recursive lock: SOCK_LOCK(so); */
2940 mac_socketpeer_label_associate_mbuf(m, so);
2941 /* XXXMAC: SOCK_UNLOCK(so); */
2942#endif
1c79356b
A
2943 /* Do window scaling on this connection? */
2944 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2945 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2946 tp->snd_scale = tp->requested_s_scale;
2947 tp->rcv_scale = tp->request_r_scale;
2948 }
316670eb 2949 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
1c79356b
A
2950 tp->snd_una++; /* SYN is acked */
2951 /*
2952 * If there's data, delay ACK; if there's also a FIN
2953 * ACKNOW will be turned on later.
2954 */
316670eb
A
2955 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2956 if (DELAY_ACK(tp, th) && tlen != 0 ) {
6d2010ae
A
2957 if ((tp->t_flags & TF_DELACK) == 0) {
2958 tp->t_flags |= TF_DELACK;
2959 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2960 }
1c79356b 2961 }
91447636 2962 else {
1c79356b 2963 tp->t_flags |= TF_ACKNOW;
91447636 2964 }
1c79356b
A
2965 /*
2966 * Received <SYN,ACK> in SYN_SENT[*] state.
2967 * Transitions:
2968 * SYN_SENT --> ESTABLISHED
2969 * SYN_SENT* --> FIN_WAIT_1
2970 */
6d2010ae 2971 tp->t_starttime = tcp_now;
316670eb 2972 tcp_sbrcv_tstmp_check(tp);
1c79356b 2973 if (tp->t_flags & TF_NEEDFIN) {
6d2010ae
A
2974 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2975 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
1c79356b
A
2976 tp->t_state = TCPS_FIN_WAIT_1;
2977 tp->t_flags &= ~TF_NEEDFIN;
2978 thflags &= ~TH_SYN;
2979 } else {
6d2010ae
A
2980 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2981 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
1c79356b 2982 tp->t_state = TCPS_ESTABLISHED;
39236c6e
A
2983 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2984 TCP_CONN_KEEPIDLE(tp));
6d2010ae
A
2985 if (nstat_collect)
2986 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
1c79356b 2987 }
39236c6e
A
2988#if MPTCP
2989 /*
2990 * Do not send the connect notification for additional
2991 * subflows until ACK for 3-way handshake arrives.
2992 */
2993 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
2994 (tp->t_mpflags & TMPF_SENT_JOIN)) {
2995 isconnected = FALSE;
fe8ab488
A
2996 /* Start data xmit if fastjoin */
2997 if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) {
2998 soevent(so, (SO_FILT_HINT_LOCKED |
2999 SO_FILT_HINT_MPFASTJ));
3000 }
39236c6e
A
3001 } else
3002#endif /* MPTCP */
3003 isconnected = TRUE;
1c79356b 3004 } else {
6d2010ae
A
3005 /*
3006 * Received initial SYN in SYN-SENT[*] state => simul-
3007 * taneous open. If segment contains CC option and there is
3008 * a cached CC, apply TAO test; if it succeeds, connection is
3009 * half-synchronized. Otherwise, do 3-way handshake:
3010 * SYN-SENT -> SYN-RECEIVED
3011 * SYN-SENT* -> SYN-RECEIVED*
3012 */
1c79356b
A
3013 tp->t_flags |= TF_ACKNOW;
3014 tp->t_timer[TCPT_REXMT] = 0;
6d2010ae
A
3015 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3016 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
8ad349bb
A
3017 tp->t_state = TCPS_SYN_RECEIVED;
3018
1c79356b
A
3019 }
3020
3021trimthenstep6:
3022 /*
3023 * Advance th->th_seq to correspond to first data byte.
3024 * If data, trim to stay within window,
3025 * dropping FIN if necessary.
3026 */
3027 th->th_seq++;
9bccf70c
A
3028 if (tlen > tp->rcv_wnd) {
3029 todrop = tlen - tp->rcv_wnd;
1c79356b 3030 m_adj(m, -todrop);
9bccf70c 3031 tlen = tp->rcv_wnd;
1c79356b
A
3032 thflags &= ~TH_FIN;
3033 tcpstat.tcps_rcvpackafterwin++;
3034 tcpstat.tcps_rcvbyteafterwin += todrop;
3035 }
3036 tp->snd_wl1 = th->th_seq - 1;
3037 tp->rcv_up = th->th_seq;
3038 /*
3039 * Client side of transaction: already sent SYN and data.
3040 * If the remote host used T/TCP to validate the SYN,
3041 * our data will be ACK'd; if so, enter normal data segment
3042 * processing in the middle of step 5, ack processing.
3043 * Otherwise, goto step 6.
3044 */
3045 if (thflags & TH_ACK)
3046 goto process_ACK;
3047 goto step6;
3048 /*
3049 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
8ad349bb 3050 * do normal processing.
1c79356b 3051 *
8ad349bb 3052 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
1c79356b
A
3053 */
3054 case TCPS_LAST_ACK:
3055 case TCPS_CLOSING:
3056 case TCPS_TIME_WAIT:
1c79356b 3057 break; /* continue normal processing */
55e303ae
A
3058
3059 /* Received a SYN while connection is already established.
3060 * This is a "half open connection and other anomalies" described
 3061 * in RFC793 page 34, send an ACK so the remote resets the connection
 3062 * or recovers by adjusting its sequence numbering
3063 */
3064 case TCPS_ESTABLISHED:
3065 if (thflags & TH_SYN)
3066 goto dropafterack;
3067 break;
1c79356b
A
3068 }
3069
3070 /*
3071 * States other than LISTEN or SYN_SENT.
3072 * First check the RST flag and sequence number since reset segments
3073 * are exempt from the timestamp and connection count tests. This
3074 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
3075 * below which allowed reset segments in half the sequence space
3076 * to fall though and be processed (which gives forged reset
3077 * segments with a random sequence number a 50 percent chance of
3078 * killing a connection).
3079 * Then check timestamp, if present.
3080 * Then check the connection count, if present.
3081 * Then check that at least some bytes of segment are within
3082 * receive window. If segment begins before rcv_nxt,
3083 * drop leading data (and SYN); if nothing left, just ack.
3084 *
3085 *
3086 * If the RST bit is set, check the sequence number to see
3087 * if this is a valid reset segment.
3088 * RFC 793 page 37:
3089 * In all states except SYN-SENT, all reset (RST) segments
3090 * are validated by checking their SEQ-fields. A reset is
3091 * valid if its sequence number is in the window.
3092 * Note: this does not take into account delayed ACKs, so
3093 * we should test against last_ack_sent instead of rcv_nxt.
9bccf70c
A
3094 * The sequence number in the reset segment is normally an
 3095 * echo of our outgoing acknowledgement numbers, but some hosts
3096 * send a reset with the sequence number at the rightmost edge
3097 * of our receive window, and we have to handle this case.
8ad349bb
A
3098 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
3099 * that brute force RST attacks are possible. To combat this,
3100 * we use a much stricter check while in the ESTABLISHED state,
3101 * only accepting RSTs where the sequence number is equal to
3102 * last_ack_sent. In all other states (the states in which a
3103 * RST is more likely), the more permissive check is used.
1c79356b
A
 3104 * If we have multiple segments in flight, the initial reset
3105 * segment sequence numbers will be to the left of last_ack_sent,
3106 * but they will eventually catch up.
3107 * In any case, it never made sense to trim reset segments to
3108 * fit the receive window since RFC 1122 says:
3109 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
3110 *
3111 * A TCP SHOULD allow a received RST segment to include data.
3112 *
3113 * DISCUSSION
3114 * It has been suggested that a RST segment could contain
3115 * ASCII text that encoded and explained the cause of the
3116 * RST. No standard has yet been established for such
3117 * data.
3118 *
3119 * If the reset segment passes the sequence number test examine
3120 * the state:
3121 * SYN_RECEIVED STATE:
3122 * If passive open, return to LISTEN state.
3123 * If active open, inform user that connection was refused.
8ad349bb 3124 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
1c79356b 3125 * Inform user that connection was reset, and close tcb.
9bccf70c 3126 * CLOSING, LAST_ACK STATES:
1c79356b 3127 * Close the tcb.
9bccf70c 3128 * TIME_WAIT STATE:
1c79356b
A
3129 * Drop the segment - see Stevens, vol. 2, p. 964 and
3130 * RFC 1337.
0c530ab8 3131 *
2d21ac55
A
3132 * Radar 4803931: Allows for the case where we ACKed the FIN but
3133 * there is already a RST in flight from the peer.
3134 * In that case, accept the RST for non-established
3135 * state if it's one off from last_ack_sent.
3136
1c79356b
A
3137 */
3138 if (thflags & TH_RST) {
8ad349bb
A
3139 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
3140 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
0c530ab8 3141 (tp->rcv_wnd == 0 &&
39236c6e
A
3142 ((tp->last_ack_sent == th->th_seq) ||
3143 ((tp->last_ack_sent -1) == th->th_seq)))) {
1c79356b
A
3144 switch (tp->t_state) {
3145
3146 case TCPS_SYN_RECEIVED:
39236c6e 3147 IF_TCP_STATINC(ifp, rstinsynrcv);
1c79356b
A
3148 so->so_error = ECONNREFUSED;
3149 goto close;
3150
3151 case TCPS_ESTABLISHED:
8ad349bb 3152 if (tp->last_ack_sent != th->th_seq) {
2d21ac55 3153 tcpstat.tcps_badrst++;
8ad349bb
A
3154 goto drop;
3155 }
1c79356b
A
3156 case TCPS_FIN_WAIT_1:
3157 case TCPS_CLOSE_WAIT:
1c79356b
A
3158 /*
3159 Drop through ...
3160 */
3161 case TCPS_FIN_WAIT_2:
3162 so->so_error = ECONNRESET;
3163 close:
3164 postevent(so, 0, EV_RESET);
316670eb
A
3165 soevent(so,
3166 (SO_FILT_HINT_LOCKED |
3167 SO_FILT_HINT_CONNRESET));
3168
1c79356b
A
3169 tcpstat.tcps_drops++;
3170 tp = tcp_close(tp);
3171 break;
3172
3173 case TCPS_CLOSING:
3174 case TCPS_LAST_ACK:
1c79356b
A
3175 tp = tcp_close(tp);
3176 break;
3177
3178 case TCPS_TIME_WAIT:
3179 break;
3180 }
3181 }
3182 goto drop;
3183 }
3184
3185 /*
3186 * RFC 1323 PAWS: If we have a timestamp reply on this segment
3187 * and it's less than ts_recent, drop it.
3188 */
8ad349bb 3189 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
1c79356b
A
3190 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3191
3192 /* Check to see if ts_recent is over 24 days old. */
3193 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
3194 /*
3195 * Invalidate ts_recent. If this segment updates
3196 * ts_recent, the age will be reset later and ts_recent
3197 * will get a valid value. If it does not, setting
3198 * ts_recent to zero will at least satisfy the
3199 * requirement that zero be placed in the timestamp
3200 * echo reply when ts_recent isn't valid. The
3201 * age isn't reset until we get a valid ts_recent
3202 * because we don't want out-of-order segments to be
3203 * dropped when ts_recent is old.
3204 */
3205 tp->ts_recent = 0;
3206 } else {
3207 tcpstat.tcps_rcvduppack++;
9bccf70c 3208 tcpstat.tcps_rcvdupbyte += tlen;
1c79356b 3209 tcpstat.tcps_pawsdrop++;
6d2010ae 3210 if (nstat_collect) {
39236c6e
A
3211 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
3212 1, tlen, NSTAT_RX_FLAG_DUPLICATE);
fe8ab488
A
3213 INP_ADD_STAT(inp, cell, wifi, wired,
3214 rxpackets, 1);
3215 INP_ADD_STAT(inp, cell, wifi, wired,
3216 rxbytes, tlen);
6d2010ae
A
3217 tp->t_stat.rxduplicatebytes += tlen;
3218 }
8ad349bb
A
3219 if (tlen)
3220 goto dropafterack;
3221 goto drop;
1c79356b
A
3222 }
3223 }
3224
1c79356b
A
3225 /*
3226 * In the SYN-RECEIVED state, validate that the packet belongs to
3227 * this connection before trimming the data to fit the receive
3228 * window. Check the sequence number versus IRS since we know
3229 * the sequence numbers haven't wrapped. This is a partial fix
3230 * for the "LAND" DoS attack.
3231 */
9bccf70c
A
3232 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
3233 rstreason = BANDLIM_RST_OPENPORT;
39236c6e 3234 IF_TCP_STATINC(ifp, dospacket);
1c79356b 3235 goto dropwithreset;
9bccf70c 3236 }
1c79356b
A
3237
3238 todrop = tp->rcv_nxt - th->th_seq;
3239 if (todrop > 0) {
3240 if (thflags & TH_SYN) {
3241 thflags &= ~TH_SYN;
3242 th->th_seq++;
3243 if (th->th_urp > 1)
3244 th->th_urp--;
3245 else
3246 thflags &= ~TH_URG;
3247 todrop--;
3248 }
3249 /*
3250 * Following if statement from Stevens, vol. 2, p. 960.
3251 */
9bccf70c
A
3252 if (todrop > tlen
3253 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1c79356b
A
3254 /*
3255 * Any valid FIN must be to the left of the window.
3256 * At this point the FIN must be a duplicate or out
3257 * of sequence; drop it.
3258 */
3259 thflags &= ~TH_FIN;
3260
3261 /*
3262 * Send an ACK to resynchronize and drop any data.
3263 * But keep on processing for RST or ACK.
3264 */
3265 tp->t_flags |= TF_ACKNOW;
316670eb
A
3266 if (todrop == 1) {
3267 /* This could be a keepalive */
3268 soevent(so, SO_FILT_HINT_LOCKED |
3269 SO_FILT_HINT_KEEPALIVE);
3270 }
9bccf70c 3271 todrop = tlen;
1c79356b 3272 tcpstat.tcps_rcvduppack++;
316670eb 3273 tcpstat.tcps_rcvdupbyte += todrop;
1c79356b
A
3274 } else {
3275 tcpstat.tcps_rcvpartduppack++;
3276 tcpstat.tcps_rcvpartdupbyte += todrop;
3277 }
6d2010ae 3278 if (nstat_collect) {
39236c6e
A
3279 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
3280 todrop, NSTAT_RX_FLAG_DUPLICATE);
fe8ab488
A
3281 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
3282 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
6d2010ae
A
3283 tp->t_stat.rxduplicatebytes += todrop;
3284 }
9bccf70c 3285 drop_hdrlen += todrop; /* drop from the top afterwards */
1c79356b 3286 th->th_seq += todrop;
9bccf70c 3287 tlen -= todrop;
1c79356b
A
3288 if (th->th_urp > todrop)
3289 th->th_urp -= todrop;
3290 else {
3291 thflags &= ~TH_URG;
3292 th->th_urp = 0;
3293 }
3294 }
3295
3296 /*
39236c6e
A
3297 * If new data are received on a connection after the user processes
3298 * are gone, then RST the other end. Note that an MPTCP subflow socket
3299 * would have SS_NOFDREF set by default, so check to make sure that
3300 * we test for SOF_MP_SUBFLOW socket flag (which would be cleared when
3301 * the socket is closed.)
1c79356b 3302 */
39236c6e
A
3303 if (!(so->so_flags & SOF_MP_SUBFLOW) &&
3304 (so->so_state & SS_NOFDREF) &&
9bccf70c 3305 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1c79356b
A
3306 tp = tcp_close(tp);
3307 tcpstat.tcps_rcvafterclose++;
9bccf70c 3308 rstreason = BANDLIM_UNLIMITED;
39236c6e 3309 IF_TCP_STATINC(ifp, cleanup);
1c79356b
A
3310 goto dropwithreset;
3311 }
3312
3313 /*
3314 * If segment ends after window, drop trailing data
3315 * (and PUSH and FIN); if nothing left, just ACK.
3316 */
9bccf70c 3317 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1c79356b
A
3318 if (todrop > 0) {
3319 tcpstat.tcps_rcvpackafterwin++;
9bccf70c
A
3320 if (todrop >= tlen) {
3321 tcpstat.tcps_rcvbyteafterwin += tlen;
1c79356b
A
3322 /*
3323 * If a new connection request is received
3324 * while in TIME_WAIT, drop the old connection
3325 * and start over if the sequence numbers
3326 * are above the previous ones.
3327 */
3328 if (thflags & TH_SYN &&
3329 tp->t_state == TCPS_TIME_WAIT &&
3330 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
9bccf70c 3331 iss = tcp_new_isn(tp);
1c79356b 3332 tp = tcp_close(tp);
91447636 3333 tcp_unlock(so, 1, 0);
1c79356b
A
3334 goto findpcb;
3335 }
3336 /*
3337 * If window is closed can only take segments at
3338 * window edge, and have to drop data and PUSH from
3339 * incoming segments. Continue processing, but
3340 * remember to ack. Otherwise, drop segment
3341 * and ack.
3342 */
3343 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
3344 tp->t_flags |= TF_ACKNOW;
3345 tcpstat.tcps_rcvwinprobe++;
3346 } else
3347 goto dropafterack;
3348 } else
3349 tcpstat.tcps_rcvbyteafterwin += todrop;
3350 m_adj(m, -todrop);
9bccf70c 3351 tlen -= todrop;
1c79356b
A
3352 thflags &= ~(TH_PUSH|TH_FIN);
3353 }
3354
3355 /*
3356 * If last ACK falls within this segment's sequence numbers,
3357 * record its timestamp.
8ad349bb
A
3358 * NOTE:
3359 * 1) That the test incorporates suggestions from the latest
3360 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
3361 * 2) That updating only on newer timestamps interferes with
3362 * our earlier PAWS tests, so this check should be solely
3363 * predicated on the sequence space of this segment.
3364 * 3) That we modify the segment boundary check to be
3365 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
3366 * instead of RFC1323's
3367 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
3368 * This modified check allows us to overcome RFC1323's
3369 * limitations as described in Stevens TCP/IP Illustrated
3370 * Vol. 2 p.869. In such cases, we can still calculate the
3371 * RTT correctly when RCV.NXT == Last.ACK.Sent.
1c79356b 3372 */
8ad349bb
A
3373 if ((to.to_flags & TOF_TS) != 0 &&
3374 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
3375 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
3376 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
1c79356b
A
3377 tp->ts_recent_age = tcp_now;
3378 tp->ts_recent = to.to_tsval;
3379 }
3380
3381 /*
3382 * If a SYN is in the window, then this is an
3383 * error and we send an RST and drop the connection.
3384 */
3385 if (thflags & TH_SYN) {
3386 tp = tcp_drop(tp, ECONNRESET);
9bccf70c 3387 rstreason = BANDLIM_UNLIMITED;
1c79356b 3388 postevent(so, 0, EV_RESET);
39236c6e 3389 IF_TCP_STATINC(ifp, synwindow);
1c79356b
A
3390 goto dropwithreset;
3391 }
3392
3393 /*
3394 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
3395 * flag is on (half-synchronized state), then queue data for
3396 * later processing; else drop segment and return.
3397 */
3398 if ((thflags & TH_ACK) == 0) {
3399 if (tp->t_state == TCPS_SYN_RECEIVED ||
3400 (tp->t_flags & TF_NEEDSYN))
3401 goto step6;
2d21ac55
A
3402 else if (tp->t_flags & TF_ACKNOW)
3403 goto dropafterack;
1c79356b
A
3404 else
3405 goto drop;
3406 }
3407
3408 /*
3409 * Ack processing.
3410 */
39236c6e 3411
1c79356b
A
3412 switch (tp->t_state) {
3413
3414 /*
3415 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
3416 * ESTABLISHED state and continue processing.
3417 * The ACK was checked above.
3418 */
3419 case TCPS_SYN_RECEIVED:
3420
3421 tcpstat.tcps_connects++;
1c79356b
A
3422
3423 /* Do window scaling? */
3424 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3425 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3426 tp->snd_scale = tp->requested_s_scale;
3427 tp->rcv_scale = tp->request_r_scale;
6d2010ae
A
3428 tp->snd_wnd = th->th_win << tp->snd_scale;
3429 tiwin = tp->snd_wnd;
1c79356b 3430 }
1c79356b
A
3431 /*
3432 * Make transitions:
3433 * SYN-RECEIVED -> ESTABLISHED
3434 * SYN-RECEIVED* -> FIN-WAIT-1
3435 */
6d2010ae 3436 tp->t_starttime = tcp_now;
316670eb 3437 tcp_sbrcv_tstmp_check(tp);
1c79356b 3438 if (tp->t_flags & TF_NEEDFIN) {
6d2010ae
A
3439 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3440 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
1c79356b
A
3441 tp->t_state = TCPS_FIN_WAIT_1;
3442 tp->t_flags &= ~TF_NEEDFIN;
3443 } else {
6d2010ae
A
3444 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3445 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
1c79356b 3446 tp->t_state = TCPS_ESTABLISHED;
39236c6e
A
3447 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3448 TCP_CONN_KEEPIDLE(tp));
6d2010ae
A
3449 if (nstat_collect)
3450 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
1c79356b
A
3451 }
3452 /*
3453 * If segment contains data or ACK, will call tcp_reass()
3454 * later; if not, do so now to pass queued data to user.
3455 */
9bccf70c 3456 if (tlen == 0 && (thflags & TH_FIN) == 0)
2d21ac55 3457 (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
39236c6e 3458 NULL, ifp);
1c79356b 3459 tp->snd_wl1 = th->th_seq - 1;
4a3eedf9 3460
8ad349bb 3461 /* FALLTHROUGH */
39236c6e
A
3462#if MPTCP
3463 /*
3464 * Do not send the connect notification for additional subflows
3465 * until ACK for 3-way handshake arrives.
3466 */
3467 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3468 (tp->t_mpflags & TMPF_SENT_JOIN)) {
3469 isconnected = FALSE;
3470 } else
3471#endif /* MPTCP */
3472 isconnected = TRUE;
4a3eedf9 3473
1c79356b
A
3474 /*
3475 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
3476 * ACKs. If the ack is in the range
3477 * tp->snd_una < th->th_ack <= tp->snd_max
3478 * then advance tp->snd_una to th->th_ack and drop
3479 * data from the retransmission queue. If this ACK reflects
3480 * more up to date window information we update our window information.
3481 */
3482 case TCPS_ESTABLISHED:
3483 case TCPS_FIN_WAIT_1:
3484 case TCPS_FIN_WAIT_2:
3485 case TCPS_CLOSE_WAIT:
3486 case TCPS_CLOSING:
3487 case TCPS_LAST_ACK:
3488 case TCPS_TIME_WAIT:
8ad349bb
A
3489 if (SEQ_GT(th->th_ack, tp->snd_max)) {
3490 tcpstat.tcps_rcvacktoomuch++;
3491 goto dropafterack;
3492 }
39236c6e 3493 if (SACK_ENABLED(tp) &&
8ad349bb 3494 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
fe8ab488
A
3495 tcp_sack_doack(tp, &to, th, &sack_bytes_acked);
3496
39236c6e
A
3497#if MPTCP
3498 if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) {
39236c6e
A
3499 if (tp->t_mpflags & TMPF_PREESTABLISHED) {
3500 /* MP TCP establishment succeeded */
3501 tp->t_mpuna = 0;
3502 if (tp->t_mpflags & TMPF_JOINED_FLOW) {
3503 if (tp->t_mpflags & TMPF_SENT_JOIN) {
3504 tp->t_mpflags &=
3505 ~TMPF_PREESTABLISHED;
3506 tp->t_mpflags |=
3507 TMPF_MPTCP_TRUE;
3508 so->so_flags |= SOF_MPTCP_TRUE;
3509 if (mptcp_dbg >= MP_ERR_DEBUG)
3510 printf("MPTCP SUCCESS"
fe8ab488 3511 " %s \n",__func__);
39236c6e
A
3512 tp->t_timer[TCPT_JACK_RXMT] = 0;
3513 tp->t_mprxtshift = 0;
3514 isconnected = TRUE;
3515 } else {
3516 isconnected = FALSE;
3517 }
3518 } else {
3519 isconnected = TRUE;
3520 tp->t_mpflags &= ~TMPF_SENT_KEYS;
39236c6e
A
3521 }
3522 }
3523 }
3524#endif /* MPTCP */
3525 /*
3526 * If we have outstanding data (other than
3527 * a window probe), this is a completely
3528 * duplicate ack (ie, window info didn't
3529 * change) and the ack is the biggest we've seen.
3530 */
1c79356b 3531 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
9bccf70c 3532 if (tlen == 0 && tiwin == tp->snd_wnd) {
8a3053a0
A
3533 /*
3534 * If both ends send FIN at the same time,
3535 * then the ack will be a duplicate ack
3536 * but we have to process the FIN. Check
3537 * for this condition and process the FIN
3538 * instead of the dupack
3539 */
3540 if ((thflags & TH_FIN) &&
3541 (tp->t_flags & TF_SENTFIN) &&
3542 !TCPS_HAVERCVDFIN(tp->t_state) &&
3543 (th->th_ack + 1) == tp->snd_max) {
3544 break;
3545 }
39236c6e
A
3546process_dupack:
3547#if MPTCP
3548 /*
3549 * MPTCP options that are ignored must
3550 * not be treated as duplicate ACKs.
3551 */
3552 if (to.to_flags & TOF_MPTCP) {
3553 goto drop;
3554 }
fe8ab488
A
3555
3556 if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
3557 if (mptcp_dbg >= MP_ERR_DEBUG)
3558 printf("%s: bypass ack recovery\n",__func__);
3559 break;
3560 }
39236c6e 3561#endif /* MPTCP */
fe8ab488
A
3562 /*
3563 * If a duplicate acknowledgement was seen
3564 * after ECN, it indicates packet loss in
3565 * addition to ECN. Reset INRECOVERY flag
3566 * so that we can process partial acks
3567 * correctly
3568 */
3569 if (tp->ecn_flags & TE_INRECOVERY)
3570 tp->ecn_flags &= ~TE_INRECOVERY;
3571
1c79356b 3572 tcpstat.tcps_rcvdupack++;
39236c6e 3573 ++tp->t_dupacks;
fe8ab488
A
3574
3575 /*
3576 * Check if we need to reset the limit on
3577 * early retransmit
39236c6e 3578 */
fe8ab488
A
3579 if (tp->t_early_rexmt_count > 0 &&
3580 TSTMP_GEQ(tcp_now,
3581 (tp->t_early_rexmt_win +
3582 TCP_EARLY_REXMT_WIN)))
39236c6e
A
3583 tp->t_early_rexmt_count = 0;
3584
3585 /*
3586 * Is early retransmit needed? We check for
3587 * this when the connection is waiting for
fe8ab488 3588 * duplicate acks to enter fast recovery.
39236c6e 3589 */
fe8ab488
A
3590 if (!IN_FASTRECOVERY(tp))
3591 tcp_early_rexmt_check(tp, th);
39236c6e 3592
1c79356b 3593 /*
fe8ab488 3594 * If we've seen exactly rexmt threshold
39236c6e 3595 * of duplicate acks, assume a packet
1c79356b
A
3596 * has been dropped and retransmit it.
3597 * Kludge snd_nxt & the congestion
3598 * window so we send only this one
3599 * packet.
3600 *
3601 * We know we're losing at the current
3602 * window size so do congestion avoidance
3603 * (set ssthresh to half the current window
3604 * and pull our congestion window back to
3605 * the new ssthresh).
3606 *
3607 * Dup acks mean that packets have left the
3608 * network (they're now cached at the receiver)
3609 * so bump cwnd by the amount in the receiver
3610 * to keep a constant cwnd packets in the
3611 * network.
3612 */
3613 if (tp->t_timer[TCPT_REXMT] == 0 ||
fe8ab488
A
3614 (th->th_ack != tp->snd_una
3615 && sack_bytes_acked == 0)) {
1c79356b 3616 tp->t_dupacks = 0;
39236c6e
A
3617 tp->t_rexmtthresh = tcprexmtthresh;
3618 } else if (tp->t_dupacks > tp->t_rexmtthresh ||
fe8ab488
A
3619 IN_FASTRECOVERY(tp)) {
3620
3621 /*
3622 * If this connection was seeing packet
3623 * reordering, then recovery might be
3624 * delayed to disambiguate between
3625 * reordering and loss
3626 */
3627 if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
3628 (tp->t_flagsext &
3629 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) ==
3630 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
3631 /*
3632 * Since the SACK information is already
3633 * updated, this ACK will be dropped
3634 */
3635 break;
3636 }
3637
3638 if (SACK_ENABLED(tp)
3639 && IN_FASTRECOVERY(tp)) {
8ad349bb
A
3640 int awnd;
3641
3642 /*
3643 * Compute the amount of data in flight first.
3644 * We can inject new data into the pipe iff
3645 * we have less than 1/2 the original window's
3646 * worth of data in flight.
9bccf70c 3647 */
8ad349bb
A
3648 awnd = (tp->snd_nxt - tp->snd_fack) +
3649 tp->sackhint.sack_bytes_rexmit;
3650 if (awnd < tp->snd_ssthresh) {
3651 tp->snd_cwnd += tp->t_maxseg;
3652 if (tp->snd_cwnd > tp->snd_ssthresh)
3653 tp->snd_cwnd = tp->snd_ssthresh;
3654 }
3655 } else
9bccf70c 3656 tp->snd_cwnd += tp->t_maxseg;
6d2010ae 3657
fe8ab488 3658 tcp_ccdbg_trace(tp, th, TCP_CC_IN_FASTRECOVERY);
6d2010ae 3659
8ad349bb
A
3660 (void) tcp_output(tp);
3661 goto drop;
39236c6e 3662 } else if (tp->t_dupacks == tp->t_rexmtthresh) {
8ad349bb 3663 tcp_seq onxt = tp->snd_nxt;
8ad349bb
A
3664
3665 /*
3666 * If we're doing sack, check to
3667 * see if we're already in sack
3668 * recovery. If we're not doing sack,
3669 * check to see if we're in newreno
3670 * recovery.
3671 */
39236c6e 3672 if (SACK_ENABLED(tp)) {
8ad349bb
A
3673 if (IN_FASTRECOVERY(tp)) {
3674 tp->t_dupacks = 0;
3675 break;
fe8ab488
A
3676 } else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
3677 break;
8ad349bb 3678 }
6d2010ae 3679 } else {
8ad349bb
A
3680 if (SEQ_LEQ(th->th_ack,
3681 tp->snd_recover)) {
3682 tp->t_dupacks = 0;
3683 break;
3684 }
9bccf70c 3685 }
6d2010ae 3686
fe8ab488
A
3687 tp->snd_recover = tp->snd_max;
3688 tp->t_timer[TCPT_PTO] = 0;
3689 tp->t_rtttime = 0;
3690
3691 /*
3692 * If the connection has seen pkt
3693 * reordering, delay recovery until
3694 * it is clear that the packet
3695 * was lost.
3696 */
3697 if (SACK_ENABLED(tp) &&
3698 (tp->t_flagsext &
3699 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
3700 == TF_PKTS_REORDERED &&
3701 !IN_FASTRECOVERY(tp) &&
3702 tp->t_reorderwin > 0 &&
3703 tp->t_state == TCPS_ESTABLISHED) {
3704 tp->t_timer[TCPT_DELAYFR] =
3705 OFFSET_FROM_START(tp,
3706 tp->t_reorderwin);
3707 tp->t_flagsext |= TF_DELAY_RECOVERY;
3708 tcpstat.tcps_delay_recovery++;
3709 tcp_ccdbg_trace(tp, th,
3710 TCP_CC_DELAY_FASTRECOVERY);
3711 break;
3712 }
3713
6d2010ae
A
3714 /*
3715 * If the current tcp cc module has
3716 * defined a hook for tasks to run
3717 * before entering FR, call it
3718 */
3719 if (CC_ALGO(tp)->pre_fr != NULL)
316670eb 3720 CC_ALGO(tp)->pre_fr(tp);
8ad349bb 3721 ENTER_FASTRECOVERY(tp);
1c79356b 3722 tp->t_timer[TCPT_REXMT] = 0;
fe8ab488
A
3723 if ((tp->ecn_flags & TE_ECN_ON)
3724 == TE_ECN_ON)
316670eb 3725 tp->ecn_flags |= TE_SENDCWR;
fe8ab488 3726
39236c6e 3727 if (SACK_ENABLED(tp)) {
8ad349bb
A
3728 tcpstat.tcps_sack_recovery_episode++;
3729 tp->sack_newdata = tp->snd_nxt;
3730 tp->snd_cwnd = tp->t_maxseg;
6d2010ae 3731
fe8ab488
A
3732 /*
3733 * Enable probe timeout to detect
3734 * a tail loss in the recovery
3735 * window.
3736 */
3737 tp->t_timer[TCPT_PTO] =
3738 OFFSET_FROM_START(tp,
3739 max(10, (tp->t_srtt >> TCP_RTT_SHIFT)));
3740
3741 tcp_ccdbg_trace(tp, th,
3742 TCP_CC_ENTER_FASTRECOVERY);
6d2010ae 3743
8ad349bb
A
3744 (void) tcp_output(tp);
3745 goto drop;
3746 }
1c79356b
A
3747 tp->snd_nxt = th->th_ack;
3748 tp->snd_cwnd = tp->t_maxseg;
3749 (void) tcp_output(tp);
3750 tp->snd_cwnd = tp->snd_ssthresh +
8ad349bb 3751 tp->t_maxseg * tp->t_dupacks;
1c79356b
A
3752 if (SEQ_GT(onxt, tp->snd_nxt))
3753 tp->snd_nxt = onxt;
fe8ab488
A
3754 tcp_ccdbg_trace(tp, th,
3755 TCP_CC_ENTER_FASTRECOVERY);
1c79356b 3756 goto drop;
39236c6e
A
3757 } else if (limited_txmt &&
3758 ALLOW_LIMITED_TRANSMIT(tp) &&
3759 (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
3760 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
3761 u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
3762
3763 /* Use Limited Transmit algorithm on the first two
3764 * duplicate acks when there is new data to transmit
3765 */
3766 tp->snd_cwnd += incr;
3767 tcpstat.tcps_limited_txt++;
3768 (void) tcp_output(tp);
3769
fe8ab488 3770 tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
39236c6e
A
3771
3772 /* Reset snd_cwnd back to normal */
3773 tp->snd_cwnd -= incr;
1c79356b 3774 }
39236c6e 3775 } else {
1c79356b 3776 tp->t_dupacks = 0;
39236c6e
A
3777 tp->t_rexmtthresh = tcprexmtthresh;
3778 }
1c79356b
A
3779 break;
3780 }
b0d623f7
A
3781 /*
3782 * If the congestion window was inflated to account
3783 * for the other side's cached packets, retract it.
3784 */
6d2010ae
A
3785 if (IN_FASTRECOVERY(tp)) {
3786 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
fe8ab488
A
3787 /*
3788 * If we received an ECE and entered
3789 * recovery, the subsequent ACKs should
3790 * not be treated as partial acks.
3791 */
3792 if (tp->ecn_flags & TE_INRECOVERY)
3793 goto process_ACK;
3794
39236c6e 3795 if (SACK_ENABLED(tp))
6d2010ae
A
3796 tcp_sack_partialack(tp, th);
3797 else
3798 tcp_newreno_partial_ack(tp, th);
fe8ab488 3799 tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
6d2010ae
A
3800 } else {
3801 EXIT_FASTRECOVERY(tp);
3802 if (CC_ALGO(tp)->post_fr != NULL)
3803 CC_ALGO(tp)->post_fr(tp, th);
fe8ab488
A
3804
3805 tcp_ccdbg_trace(tp, th,
3806 TCP_CC_EXIT_FASTRECOVERY);
3807 }
3808 } else if ((tp->t_flagsext &
3809 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
3810 == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
3811 /*
3812 * If the ack acknowledges upto snd_recover or if
3813 * it acknowledges all the snd holes, exit
3814 * recovery and cancel the timer. Otherwise,
3815 * this is a partial ack. Wait for recovery timer
3816 * to enter recovery. The snd_holes have already
3817 * been updated.
3818 */
3819 if (SEQ_GEQ(th->th_ack, tp->snd_recover) ||
3820 TAILQ_EMPTY(&tp->snd_holes)) {
3821 tp->t_timer[TCPT_DELAYFR] = 0;
3822 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
3823 EXIT_FASTRECOVERY(tp);
3824 tcp_ccdbg_trace(tp, th,
3825 TCP_CC_EXIT_FASTRECOVERY);
6d2010ae
A
3826 }
3827 } else {
593a1d5f 3828 /*
fe8ab488
A
3829 * We were not in fast recovery. Reset the
3830 * duplicate ack counter.
593a1d5f
A
3831 */
3832 tp->t_dupacks = 0;
39236c6e 3833 tp->t_rexmtthresh = tcprexmtthresh;
593a1d5f 3834 }
593a1d5f
A
3835
3836
1c79356b 3837 /*
8ad349bb 3838 * If we reach this point, ACK is not a duplicate,
1c79356b
A
3839 * i.e., it ACKs something we sent.
3840 */
3841 if (tp->t_flags & TF_NEEDSYN) {
3842 /*
3843 * T/TCP: Connection was half-synchronized, and our
3844 * SYN has been ACK'd (so connection is now fully
3845 * synchronized). Go to non-starred state,
3846 * increment snd_una for ACK of SYN, and check if
3847 * we can do window scaling.
3848 */
3849 tp->t_flags &= ~TF_NEEDSYN;
3850 tp->snd_una++;
3851 /* Do window scaling? */
3852 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3853 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3854 tp->snd_scale = tp->requested_s_scale;
3855 tp->rcv_scale = tp->request_r_scale;
3856 }
3857 }
3858
3859process_ACK:
39236c6e 3860 acked = BYTES_ACKED(th, tp);
1c79356b
A
3861 tcpstat.tcps_rcvackpack++;
3862 tcpstat.tcps_rcvackbyte += acked;
3863
9bccf70c 3864 /*
39236c6e
A
3865 * If the last packet was a retransmit, make sure
3866 * it was not spurious.
3867 *
fe8ab488
A
3868 * This will also take care of congestion window
3869 * adjustment if a last packet was recovered due to a
3870 * tail loss probe.
9bccf70c 3871 */
fe8ab488 3872 tcp_bad_rexmt_check(tp, th, &to);
9bccf70c 3873
39236c6e
A
3874 /* Recalculate the RTT */
3875 tcp_compute_rtt(tp, &to, th);
1c79356b
A
3876
3877 /*
3878 * If all outstanding data is acked, stop retransmit
3879 * timer and remember to restart (more output or persist).
3880 * If there is more data to be acked, restart retransmit
3881 * timer, using current (possibly backed-off) value.
3882 */
3883 if (th->th_ack == tp->snd_max) {
3884 tp->t_timer[TCPT_REXMT] = 0;
fe8ab488 3885 tp->t_timer[TCPT_PTO] = 0;
1c79356b
A
3886 needoutput = 1;
3887 } else if (tp->t_timer[TCPT_PERSIST] == 0)
fe8ab488
A
3888 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
3889 tp->t_rxtcur);
1c79356b
A
3890
3891 /*
fe8ab488
A
3892 * If no data (only SYN) was ACK'd, skip rest of ACK
3893 * processing.
1c79356b
A
3894 */
3895 if (acked == 0)
3896 goto step6;
3897
fe8ab488 3898
2d21ac55 3899 if ((thflags & TH_ECE) != 0 &&
316670eb 3900 ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON)) {
2d21ac55 3901 /*
fe8ab488
A
3902 * Reduce the congestion window if we haven't
3903 * done so.
2d21ac55 3904 */
fe8ab488 3905 if (!IN_FASTRECOVERY(tp)) {
316670eb 3906 tcp_reduce_congestion_window(tp);
fe8ab488
A
3907 tp->ecn_flags |= (TE_INRECOVERY|TE_SENDCWR);
3908 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
2d21ac55 3909 }
6d2010ae 3910 }
b0d623f7 3911
6d2010ae
A
3912 /*
3913 * When new data is acked, open the congestion window.
3914 * The specifics of how this is achieved are up to the
3915 * congestion control algorithm in use for this connection.
3916 *
3917 * The calculations in this function assume that snd_una is
3918 * not updated yet.
3919 */
3920 if (!IN_FASTRECOVERY(tp)) {
3921 if (CC_ALGO(tp)->ack_rcvd != NULL)
3922 CC_ALGO(tp)->ack_rcvd(tp, th);
fe8ab488 3923 tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
1c79356b
A
3924 }
3925 if (acked > so->so_snd.sb_cc) {
3926 tp->snd_wnd -= so->so_snd.sb_cc;
3927 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
39236c6e
A
3928 if (so->so_flags & SOF_ENABLE_MSGS) {
3929 so->so_msg_state->msg_serial_bytes -=
3930 (int)so->so_snd.sb_cc;
3931 }
1c79356b
A
3932 ourfinisacked = 1;
3933 } else {
3934 sbdrop(&so->so_snd, acked);
39236c6e
A
3935 if (so->so_flags & SOF_ENABLE_MSGS) {
3936 so->so_msg_state->msg_serial_bytes -=
3937 acked;
3938 }
316670eb 3939 tcp_sbsnd_trim(&so->so_snd);
1c79356b
A
3940 tp->snd_wnd -= acked;
3941 ourfinisacked = 0;
3942 }
91447636 3943 /* detect una wraparound */
6d2010ae 3944 if ( !IN_FASTRECOVERY(tp) &&
8ad349bb
A
3945 SEQ_GT(tp->snd_una, tp->snd_recover) &&
3946 SEQ_LEQ(th->th_ack, tp->snd_recover))
3947 tp->snd_recover = th->th_ack - 1;
6d2010ae
A
3948
3949 if (IN_FASTRECOVERY(tp) &&
8ad349bb
A
3950 SEQ_GEQ(th->th_ack, tp->snd_recover))
3951 EXIT_FASTRECOVERY(tp);
6d2010ae 3952
1c79356b 3953 tp->snd_una = th->th_ack;
39236c6e 3954 if (SACK_ENABLED(tp)) {
8ad349bb
A
3955 if (SEQ_GT(tp->snd_una, tp->snd_recover))
3956 tp->snd_recover = tp->snd_una;
3957 }
1c79356b
A
3958 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3959 tp->snd_nxt = tp->snd_una;
316670eb
A
3960 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
3961 tp->t_bwmeas != NULL)
3962 tcp_bwmeas_check(tp);
3963
4a3eedf9
A
3964 /*
3965 * sowwakeup must happen after snd_una, et al. are updated so that
3966 * the sequence numbers are in sync with so_snd
3967 */
3968 sowwakeup(so);
1c79356b
A
3969
3970 switch (tp->t_state) {
3971
3972 /*
3973 * In FIN_WAIT_1 STATE in addition to the processing
3974 * for the ESTABLISHED state if our FIN is now acknowledged
3975 * then enter FIN_WAIT_2.
3976 */
3977 case TCPS_FIN_WAIT_1:
3978 if (ourfinisacked) {
3979 /*
3980 * If we can't receive any more
3981 * data, then closing user can proceed.
39236c6e 3982 * Starting the TCPT_2MSL timer is contrary to the
1c79356b
A
3983 * specification, but if we don't get a FIN
3984 * we'll hang forever.
3985 */
3986 if (so->so_state & SS_CANTRCVMORE) {
39236c6e
A
3987 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
3988 TCP_CONN_MAXIDLE(tp));
6d2010ae
A
3989 isconnected = FALSE;
3990 isdisconnected = TRUE;
1c79356b 3991 }
39236c6e
A
3992 DTRACE_TCP4(state__change, void, NULL,
3993 struct inpcb *, inp,
3994 struct tcpcb *, tp,
3995 int32_t, TCPS_FIN_WAIT_2);
1c79356b 3996 tp->t_state = TCPS_FIN_WAIT_2;
39236c6e
A
3997 /* fall through and make sure we also recognize
3998 * data ACKed with the FIN
3999 */
1c79356b 4000 }
c910b4d9 4001 tp->t_flags |= TF_ACKNOW;
1c79356b
A
4002 break;
4003
4004 /*
4005 * In CLOSING STATE in addition to the processing for
4006 * the ESTABLISHED state if the ACK acknowledges our FIN
4007 * then enter the TIME-WAIT state, otherwise ignore
4008 * the segment.
4009 */
4010 case TCPS_CLOSING:
4011 if (ourfinisacked) {
39236c6e
A
4012 DTRACE_TCP4(state__change, void, NULL,
4013 struct inpcb *, inp,
4014 struct tcpcb *, tp,
4015 int32_t, TCPS_TIME_WAIT);
1c79356b
A
4016 tp->t_state = TCPS_TIME_WAIT;
4017 tcp_canceltimers(tp);
fe8ab488
A
4018 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4019 tp->t_flags |= TF_CLOSING;
4020 } else {
4021 add_to_time_wait(tp, 2 * tcp_msl);
4022 }
6d2010ae
A
4023 isconnected = FALSE;
4024 isdisconnected = TRUE;
1c79356b 4025 }
c910b4d9 4026 tp->t_flags |= TF_ACKNOW;
1c79356b
A
4027 break;
4028
4029 /*
4030 * In LAST_ACK, we may still be waiting for data to drain
4031 * and/or to be acked, as well as for the ack of our FIN.
4032 * If our FIN is now acknowledged, delete the TCB,
4033 * enter the closed state and return.
4034 */
4035 case TCPS_LAST_ACK:
4036 if (ourfinisacked) {
4037 tp = tcp_close(tp);
4038 goto drop;
4039 }
4040 break;
4041
4042 /*
4043 * In TIME_WAIT state the only thing that should arrive
4044 * is a retransmission of the remote FIN. Acknowledge
4045 * it and restart the finack timer.
4046 */
4047 case TCPS_TIME_WAIT:
6d2010ae 4048 add_to_time_wait(tp, 2 * tcp_msl);
1c79356b
A
4049 goto dropafterack;
4050 }
39236c6e
A
4051
4052 /*
4053 * If there is a SACK option on the ACK and we
4054 * haven't seen any duplicate acks before, count
4055 * it as a duplicate ack even if the cumulative
4056 * ack is advanced. If the receiver delayed an
4057 * ack and detected loss afterwards, then the ack
4058 * will advance cumulative ack and will also have
4059 * a SACK option. So counting it as one duplicate
4060 * ack is ok.
4061 */
4062 if (sack_ackadv == 1 &&
fe8ab488
A
4063 tp->t_state == TCPS_ESTABLISHED &&
4064 SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
4065 to.to_nsacks > 0 && tp->t_dupacks == 0 &&
4066 SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
4067 !(tp->t_flagsext & TF_PKTS_REORDERED)) {
39236c6e
A
4068 tcpstat.tcps_sack_ackadv++;
4069 goto process_dupack;
4070 }
1c79356b
A
4071 }
4072
4073step6:
4074 /*
4075 * Update window information.
4076 * Don't look at window if no ACK: TAC's send garbage on first SYN.
4077 */
4078 if ((thflags & TH_ACK) &&
4079 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4080 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4081 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4082 /* keep track of pure window updates */
9bccf70c 4083 if (tlen == 0 &&
1c79356b
A
4084 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4085 tcpstat.tcps_rcvwinupd++;
4086 tp->snd_wnd = tiwin;
4087 tp->snd_wl1 = th->th_seq;
4088 tp->snd_wl2 = th->th_ack;
4089 if (tp->snd_wnd > tp->max_sndwnd)
4090 tp->max_sndwnd = tp->snd_wnd;
4091 needoutput = 1;
4092 }
4093
4094 /*
4095 * Process segments with URG.
4096 */
4097 if ((thflags & TH_URG) && th->th_urp &&
4098 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4099 /*
4100 * This is a kludge, but if we receive and accept
4101 * random urgent pointers, we'll crash in
4102 * soreceive. It's hard to imagine someone
4103 * actually wanting to send this much urgent data.
4104 */
4105 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
4106 th->th_urp = 0; /* XXX */
4107 thflags &= ~TH_URG; /* XXX */
4108 goto dodata; /* XXX */
4109 }
4110 /*
4111 * If this segment advances the known urgent pointer,
4112 * then mark the data stream. This should not happen
4113 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
4114 * a FIN has been received from the remote side.
4115 * In these states we ignore the URG.
4116 *
4117 * According to RFC961 (Assigned Protocols),
4118 * the urgent pointer points to the last octet
4119 * of urgent data. We continue, however,
4120 * to consider it to indicate the first octet
4121 * of data past the urgent section as the original
4122 * spec states (in one of two places).
4123 */
4124 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
4125 tp->rcv_up = th->th_seq + th->th_urp;
4126 so->so_oobmark = so->so_rcv.sb_cc +
4127 (tp->rcv_up - tp->rcv_nxt) - 1;
4128 if (so->so_oobmark == 0) {
4129 so->so_state |= SS_RCVATMARK;
4130 postevent(so, 0, EV_OOB);
4131 }
4132 sohasoutofband(so);
4133 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4134 }
4135 /*
4136 * Remove out of band data so doesn't get presented to user.
4137 * This can happen independent of advancing the URG pointer,
4138 * but if two URG's are pending at once, some out-of-band
4139 * data may creep in... ick.
4140 */
b0d623f7 4141 if (th->th_urp <= (u_int32_t)tlen
1c79356b
A
4142#if SO_OOBINLINE
4143 && (so->so_options & SO_OOBINLINE) == 0
4144#endif
4145 )
9bccf70c
A
4146 tcp_pulloutofband(so, th, m,
4147 drop_hdrlen); /* hdr drop is delayed */
6d2010ae 4148 } else {
1c79356b
A
4149 /*
4150 * If no out of band data is expected,
4151 * pull receive urgent pointer along
4152 * with the receive window.
4153 */
4154 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4155 tp->rcv_up = tp->rcv_nxt;
6d2010ae
A
4156 }
4157dodata:
1c79356b 4158
6d2010ae
A
4159 /* Set socket's connect or disconnect state correcly before doing data.
4160 * The following might unlock the socket if there is an upcall or a socket
4161 * filter.
4162 */
4163 if (isconnected) {
4164 soisconnected(so);
4165 } else if (isdisconnected) {
4166 soisdisconnected(so);
4167 }
4168
4169 /* Let's check the state of pcb just to make sure that it did not get closed
4170 * when we unlocked above
4171 */
4172 if (inp->inp_state == INPCB_STATE_DEAD) {
4173 /* Just drop the packet that we are processing and return */
4174 goto drop;
4175 }
4176
1c79356b
A
4177 /*
4178 * Process the segment text, merging it into the TCP sequencing queue,
4179 * and arranging for acknowledgment of receipt if necessary.
4180 * This process logically involves adjusting tp->rcv_wnd as data
4181 * is presented to the user (this happens in tcp_usrreq.c,
4182 * case PRU_RCVD). If a FIN has already been received on this
4183 * connection then we just ignore the text.
4184 */
c910b4d9 4185 if ((tlen || (thflags & TH_FIN)) &&
1c79356b 4186 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8ad349bb
A
4187 tcp_seq save_start = th->th_seq;
4188 tcp_seq save_end = th->th_seq + tlen;
9bccf70c
A
4189 m_adj(m, drop_hdrlen); /* delayed header drop */
4190 /*
8ad349bb
A
4191 * Insert segment which includes th into TCP reassembly queue
4192 * with control block tp. Set thflags to whether reassembly now
4193 * includes a segment with FIN. This handles the common case
4194 * inline (segment is the next to be received on an established
4195 * connection, and the queue is empty), avoiding linkage into
4196 * and removal from the queue and repetition of various
4197 * conversions.
4198 * Set DELACK for segments received in order, but ack
4199 * immediately when segments are out of order (so
4200 * fast retransmit can work).
9bccf70c
A
4201 */
4202 if (th->th_seq == tp->rcv_nxt &&
4203 LIST_EMPTY(&tp->t_segq) &&
4204 TCPS_HAVEESTABLISHED(tp->t_state)) {
316670eb 4205 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
39236c6e
A
4206 /*
4207 * Calculate the RTT on the receiver only if the
4208 * connection is in streaming mode and the last
4209 * packet was not an end-of-write
4210 */
4211 if ((tp->t_flags & TF_STRETCHACK) &&
4212 !(tp->t_flagsext & TF_STREAMEOW))
4213 tcp_compute_rtt(tp, &to, th);
4214
316670eb
A
4215 if (DELAY_ACK(tp, th) &&
4216 ((tp->t_flags & TF_ACKNOW) == 0) ) {
6d2010ae
A
4217 if ((tp->t_flags & TF_DELACK) == 0) {
4218 tp->t_flags |= TF_DELACK;
39236c6e
A
4219 tp->t_timer[TCPT_DELACK] =
4220 OFFSET_FROM_START(tp, tcp_delack);
6d2010ae 4221 }
9bccf70c 4222 }
91447636 4223 else {
9bccf70c 4224 tp->t_flags |= TF_ACKNOW;
91447636 4225 }
9bccf70c
A
4226 tp->rcv_nxt += tlen;
4227 thflags = th->th_flags & TH_FIN;
316670eb 4228 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
9bccf70c 4229 tcpstat.tcps_rcvbyte += tlen;
6d2010ae 4230 if (nstat_collect) {
39236c6e 4231 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
fe8ab488
A
4232 INP_ADD_STAT(inp, cell, wifi, wired,
4233 rxpackets, m->m_pkthdr.lro_npkts);
39236c6e 4234 } else {
fe8ab488
A
4235 INP_ADD_STAT(inp, cell, wifi, wired,
4236 rxpackets, 1);
316670eb 4237 }
fe8ab488
A
4238 INP_ADD_STAT(inp, cell, wifi, wired,
4239 rxbytes, tlen);
6d2010ae 4240 }
316670eb 4241 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
6d2010ae 4242 so_recv_data_stat(so, m, drop_hdrlen);
39236c6e
A
4243
4244 if (sbappendstream_rcvdemux(so, m,
4245 th->th_seq - (tp->irs + 1), 0)) {
91447636 4246 sorwakeup(so);
39236c6e 4247 }
9bccf70c 4248 } else {
39236c6e 4249 thflags = tcp_reass(tp, th, &tlen, m, ifp);
9bccf70c
A
4250 tp->t_flags |= TF_ACKNOW;
4251 }
1c79356b 4252
39236c6e 4253 if (tlen > 0 && SACK_ENABLED(tp))
8ad349bb
A
4254 tcp_update_sack_list(tp, save_start, save_end);
4255
39236c6e
A
4256 tcp_adaptive_rwtimo_check(tp, tlen);
4257
1c79356b
A
4258 if (tp->t_flags & TF_DELACK)
4259 {
9bccf70c
A
4260#if INET6
4261 if (isipv6) {
4262 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4263 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
4264 th->th_seq, th->th_ack, th->th_win);
4265 }
4266 else
4267#endif
4268 {
4269 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4270 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
4271 th->th_seq, th->th_ack, th->th_win);
4272 }
4273
1c79356b 4274 }
1c79356b
A
4275 } else {
4276 m_freem(m);
4277 thflags &= ~TH_FIN;
4278 }
4279
4280 /*
4281 * If FIN is received ACK the FIN and let the user know
4282 * that the connection is closing.
4283 */
4284 if (thflags & TH_FIN) {
4285 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4286 socantrcvmore(so);
4287 postevent(so, 0, EV_FIN);
4288 /*
8ad349bb
A
4289 * If connection is half-synchronized
4290 * (ie NEEDSYN flag on) then delay ACK,
4291 * so it may be piggybacked when SYN is sent.
4292 * Otherwise, since we received a FIN then no
4293 * more input can be expected, send ACK now.
1c79356b 4294 */
316670eb 4295 TCP_INC_VAR(tp->t_unacksegs, nlropkts);
6d2010ae
A
4296 if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
4297 if ((tp->t_flags & TF_DELACK) == 0) {
4298 tp->t_flags |= TF_DELACK;
4299 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
4300 }
1c79356b 4301 }
91447636 4302 else {
1c79356b 4303 tp->t_flags |= TF_ACKNOW;
91447636 4304 }
1c79356b
A
4305 tp->rcv_nxt++;
4306 }
4307 switch (tp->t_state) {
4308
4309 /*
4310 * In SYN_RECEIVED and ESTABLISHED STATES
4311 * enter the CLOSE_WAIT state.
4312 */
4313 case TCPS_SYN_RECEIVED:
6d2010ae 4314 tp->t_starttime = tcp_now;
1c79356b 4315 case TCPS_ESTABLISHED:
6d2010ae
A
4316 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4317 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
1c79356b
A
4318 tp->t_state = TCPS_CLOSE_WAIT;
4319 break;
4320
4321 /*
4322 * If still in FIN_WAIT_1 STATE FIN has not been acked so
4323 * enter the CLOSING state.
4324 */
4325 case TCPS_FIN_WAIT_1:
6d2010ae
A
4326 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4327 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
1c79356b
A
4328 tp->t_state = TCPS_CLOSING;
4329 break;
4330
4331 /*
4332 * In FIN_WAIT_2 state enter the TIME_WAIT state,
4333 * starting the time-wait timer, turning off the other
4334 * standard timers.
4335 */
4336 case TCPS_FIN_WAIT_2:
39236c6e
A
4337 DTRACE_TCP4(state__change, void, NULL,
4338 struct inpcb *, inp,
4339 struct tcpcb *, tp,
4340 int32_t, TCPS_TIME_WAIT);
1c79356b
A
4341 tp->t_state = TCPS_TIME_WAIT;
4342 tcp_canceltimers(tp);
fe8ab488
A
4343 tp->t_flags |= TF_ACKNOW;
4344 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4345 tp->t_flags |= TF_CLOSING;
4346 } else {
4347 add_to_time_wait(tp, 2 * tcp_msl);
1c79356b 4348 }
1c79356b
A
4349 soisdisconnected(so);
4350 break;
4351
4352 /*
4353 * In TIME_WAIT state restart the 2 MSL time_wait timer.
4354 */
4355 case TCPS_TIME_WAIT:
6d2010ae 4356 add_to_time_wait(tp, 2 * tcp_msl);
1c79356b
A
4357 break;
4358 }
4359 }
4360#if TCPDEBUG
9bccf70c
A
4361 if (so->so_options & SO_DEBUG)
4362 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
1c79356b 4363 &tcp_savetcp, 0);
1c79356b
A
4364#endif
4365
4366 /*
4367 * Return any desired output.
4368 */
2d21ac55 4369 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
1c79356b 4370 (void) tcp_output(tp);
2d21ac55 4371 }
6d2010ae
A
4372
4373 tcp_check_timer_state(tp);
4374
4375
91447636 4376 tcp_unlock(so, 1, 0);
1c79356b
A
4377 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4378 return;
4379
4380dropafterack:
4381 /*
4382 * Generate an ACK dropping incoming segment if it occupies
4383 * sequence space, where the ACK reflects our state.
4384 *
4385 * We can now skip the test for the RST flag since all
4386 * paths to this code happen after packets containing
4387 * RST have been dropped.
4388 *
4389 * In the SYN-RECEIVED state, don't send an ACK unless the
4390 * segment we received passes the SYN-RECEIVED ACK test.
4391 * If it fails send a RST. This breaks the loop in the
4392 * "LAND" DoS attack, and also prevents an ACK storm
4393 * between two listening ports that have been sent forged
4394 * SYN segments, each with the source address of the other.
4395 */
4396 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
4397 (SEQ_GT(tp->snd_una, th->th_ack) ||
9bccf70c
A
4398 SEQ_GT(th->th_ack, tp->snd_max)) ) {
4399 rstreason = BANDLIM_RST_OPENPORT;
39236c6e 4400 IF_TCP_STATINC(ifp, dospacket);
1c79356b 4401 goto dropwithreset;
9bccf70c 4402 }
1c79356b 4403#if TCPDEBUG
9bccf70c
A
4404 if (so->so_options & SO_DEBUG)
4405 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1c79356b 4406 &tcp_savetcp, 0);
1c79356b
A
4407#endif
4408 m_freem(m);
4409 tp->t_flags |= TF_ACKNOW;
4410 (void) tcp_output(tp);
6d2010ae
A
4411
4412 /* Don't need to check timer state as we should have done it during tcp_output */
91447636 4413 tcp_unlock(so, 1, 0);
1c79356b
A
4414 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4415 return;
91447636
A
4416dropwithresetnosock:
4417 nosock = 1;
1c79356b
A
4418dropwithreset:
4419 /*
4420 * Generate a RST, dropping incoming segment.
4421 * Make ACK acceptable to originator of segment.
4422 * Don't bother to respond if destination was broadcast/multicast.
4423 */
4424 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
4425 goto drop;
4426#if INET6
4427 if (isipv6) {
9bccf70c
A
4428 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
4429 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
4430 goto drop;
1c79356b
A
4431 } else
4432#endif /* INET6 */
14353aa8
A
4433 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
4434 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
4435 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
4436 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1c79356b 4437 goto drop;
9bccf70c
A
4438 /* IPv6 anycast check is done at tcp6_input() */
4439
4440 /*
4441 * Perform bandwidth limiting.
4442 */
4443#if ICMP_BANDLIM
4444 if (badport_bandlim(rstreason) < 0)
4445 goto drop;
4446#endif
4447
1c79356b 4448#if TCPDEBUG
9bccf70c
A
4449 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4450 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1c79356b 4451 &tcp_savetcp, 0);
1c79356b 4452#endif
fe8ab488
A
4453 bzero(&tra, sizeof(tra));
4454 tra.ifscope = ifscope;
4455 tra.awdl_unrestricted = 1;
1c79356b 4456 if (thflags & TH_ACK)
9bccf70c
A
4457 /* mtod() below is safe as long as hdr dropping is delayed */
4458 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
fe8ab488 4459 TH_RST, &tra);
1c79356b
A
4460 else {
4461 if (thflags & TH_SYN)
9bccf70c
A
4462 tlen++;
4463 /* mtod() below is safe as long as hdr dropping is delayed */
4464 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
fe8ab488 4465 (tcp_seq)0, TH_RST|TH_ACK, &tra);
1c79356b
A
4466 }
4467 /* destroy temporarily created socket */
91447636
A
4468 if (dropsocket) {
4469 (void) soabort(so);
4470 tcp_unlock(so, 1, 0);
39236c6e 4471 } else if ((inp != NULL) && (nosock == 0)) {
6d2010ae
A
4472 tcp_unlock(so, 1, 0);
4473 }
1c79356b
A
4474 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4475 return;
91447636
A
4476dropnosock:
4477 nosock = 1;
1c79356b
A
4478drop:
4479 /*
4480 * Drop space held by incoming segment and return.
4481 */
4482#if TCPDEBUG
9bccf70c
A
4483 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4484 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1c79356b 4485 &tcp_savetcp, 0);
1c79356b
A
4486#endif
4487 m_freem(m);
1c79356b 4488 /* destroy temporarily created socket */
91447636
A
4489 if (dropsocket) {
4490 (void) soabort(so);
4491 tcp_unlock(so, 1, 0);
4492 }
6d2010ae
A
4493 else if (nosock == 0) {
4494 tcp_unlock(so, 1, 0);
4495 }
1c79356b
A
4496 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4497 return;
4498}
4499
4500static void
c910b4d9 4501tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
8ad349bb
A
4502/*
4503 * Parse TCP options and place in tcpopt.
4504 */
1c79356b
A
4505 struct tcpcb *tp;
4506 u_char *cp;
4507 int cnt;
4508 struct tcphdr *th;
4509 struct tcpopt *to;
c910b4d9 4510 unsigned int input_ifscope;
1c79356b
A
4511{
4512 u_short mss = 0;
4513 int opt, optlen;
4514
4515 for (; cnt > 0; cnt -= optlen, cp += optlen) {
4516 opt = cp[0];
4517 if (opt == TCPOPT_EOL)
4518 break;
4519 if (opt == TCPOPT_NOP)
4520 optlen = 1;
4521 else {
9bccf70c
A
4522 if (cnt < 2)
4523 break;
1c79356b 4524 optlen = cp[1];
9bccf70c 4525 if (optlen < 2 || optlen > cnt)
1c79356b
A
4526 break;
4527 }
4528 switch (opt) {
4529
4530 default:
4531 continue;
4532
4533 case TCPOPT_MAXSEG:
4534 if (optlen != TCPOLEN_MAXSEG)
4535 continue;
4536 if (!(th->th_flags & TH_SYN))
4537 continue;
4538 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
9bccf70c 4539 NTOHS(mss);
1c79356b
A
4540 break;
4541
4542 case TCPOPT_WINDOW:
4543 if (optlen != TCPOLEN_WINDOW)
4544 continue;
4545 if (!(th->th_flags & TH_SYN))
4546 continue;
39236c6e 4547 to->to_flags |= TOF_SCALE;
1c79356b
A
4548 tp->t_flags |= TF_RCVD_SCALE;
4549 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
4550 break;
4551
4552 case TCPOPT_TIMESTAMP:
4553 if (optlen != TCPOLEN_TIMESTAMP)
4554 continue;
8ad349bb 4555 to->to_flags |= TOF_TS;
1c79356b
A
4556 bcopy((char *)cp + 2,
4557 (char *)&to->to_tsval, sizeof(to->to_tsval));
4558 NTOHL(to->to_tsval);
4559 bcopy((char *)cp + 6,
4560 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
4561 NTOHL(to->to_tsecr);
1c79356b
A
4562 /*
4563 * A timestamp received in a SYN makes
4564 * it ok to send timestamp requests and replies.
4565 */
4566 if (th->th_flags & TH_SYN) {
4567 tp->t_flags |= TF_RCVD_TSTMP;
4568 tp->ts_recent = to->to_tsval;
4569 tp->ts_recent_age = tcp_now;
4570 }
4571 break;
8ad349bb
A
4572 case TCPOPT_SACK_PERMITTED:
4573 if (!tcp_do_sack ||
4574 optlen != TCPOLEN_SACK_PERMITTED)
1c79356b 4575 continue;
1c79356b 4576 if (th->th_flags & TH_SYN)
8ad349bb 4577 to->to_flags |= TOF_SACK;
1c79356b 4578 break;
8ad349bb
A
4579 case TCPOPT_SACK:
4580 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
1c79356b 4581 continue;
8ad349bb
A
4582 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
4583 to->to_sacks = cp + 2;
4584 tcpstat.tcps_sack_rcv_blocks++;
4585
1c79356b 4586 break;
39236c6e
A
4587
4588#if MPTCP
4589 case TCPOPT_MULTIPATH:
4590 tcp_do_mptcp_options(tp, cp, th, to, optlen);
4591 break;
4592#endif /* MPTCP */
1c79356b
A
4593 }
4594 }
9bccf70c 4595 if (th->th_flags & TH_SYN)
c910b4d9 4596 tcp_mss(tp, mss, input_ifscope); /* sets t_maxseg */
1c79356b
A
4597}
4598
4599/*
4600 * Pull out of band byte out of a segment so
4601 * it doesn't appear in the user's data queue.
4602 * It is still reflected in the segment length for
4603 * sequencing purposes.
4604 */
4605static void
9bccf70c 4606tcp_pulloutofband(so, th, m, off)
1c79356b
A
4607 struct socket *so;
4608 struct tcphdr *th;
4609 register struct mbuf *m;
9bccf70c 4610 int off; /* delayed to be droped hdrlen */
1c79356b 4611{
9bccf70c 4612 int cnt = off + th->th_urp - 1;
1c79356b
A
4613
4614 while (cnt >= 0) {
4615 if (m->m_len > cnt) {
4616 char *cp = mtod(m, caddr_t) + cnt;
4617 struct tcpcb *tp = sototcpcb(so);
4618
4619 tp->t_iobc = *cp;
4620 tp->t_oobflags |= TCPOOB_HAVEDATA;
4621 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
4622 m->m_len--;
9bccf70c
A
4623 if (m->m_flags & M_PKTHDR)
4624 m->m_pkthdr.len--;
1c79356b
A
4625 return;
4626 }
4627 cnt -= m->m_len;
4628 m = m->m_next;
4629 if (m == 0)
4630 break;
4631 }
4632 panic("tcp_pulloutofband");
4633}
4634
6d2010ae
A
4635uint32_t
4636get_base_rtt(struct tcpcb *tp)
4637{
4638 uint32_t base_rtt = 0, i;
4639 for (i = 0; i < N_RTT_BASE; ++i) {
4640 if (tp->rtt_hist[i] != 0 &&
4641 (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
4642 base_rtt = tp->rtt_hist[i];
4643 }
4644 return base_rtt;
4645}
4646
/* Each value of RTT base represents the minimum RTT seen in a minute.
 * We keep up to N_RTT_BASE minutes' worth of history.
 */
4650void
4651update_base_rtt(struct tcpcb *tp, uint32_t rtt)
4652{
39236c6e
A
4653 int32_t i, qdelay;
4654 u_int32_t base_rtt;
4655
6d2010ae 4656 if (++tp->rtt_count >= rtt_samples_per_slot) {
39236c6e
A
4657#if TRAFFIC_MGT
4658 /*
4659 * If the recv side is being throttled, check if the
4660 * current RTT is closer to the base RTT seen in
4661 * first (recent) two slots. If so, unthrottle the stream.
4662 */
4663 if (tp->t_flagsext & TF_RECV_THROTTLE) {
4664 base_rtt = min(tp->rtt_hist[0], tp->rtt_hist[1]);
4665 qdelay = tp->t_rttcur - base_rtt;
4666 if (qdelay < target_qdelay)
4667 tp->t_flagsext &= ~(TF_RECV_THROTTLE);
4668 }
4669#endif /* TRAFFIC_MGT */
4670
6d2010ae
A
4671 for (i = (N_RTT_BASE-1); i > 0; --i) {
4672 tp->rtt_hist[i] = tp->rtt_hist[i-1];
4673 }
4674 tp->rtt_hist[0] = rtt;
4675 tp->rtt_count = 0;
4676 } else {
4677 tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
4678 }
4679}
4680
39236c6e
A
/*
 * If we have a timestamp reply, update the smoothed RTT.  If no
 * timestamp is present but the transmit timer is running and the timed
 * sequence number was acked, update the smoothed RTT.
 *
 * If timestamps are supported, a receiver can update RTT even if
 * there is no outstanding data.
 *
 * Some boxes send broken timestamp replies during the SYN+ACK phase;
 * ignore timestamps of 0, or we could calculate a huge RTT and blow up
 * the retransmit timer.
 */
4693static void
4694tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4695{
4696 VERIFY(to != NULL && th != NULL);
4697 if (((to->to_flags & TOF_TS) != 0) &&
4698 (to->to_tsecr != 0) &&
4699 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
4700 tcp_xmit_timer(tp, tcp_now - to->to_tsecr,
4701 to->to_tsecr, th->th_ack);
fe8ab488 4702 } else if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
39236c6e
A
4703 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime, 0,
4704 th->th_ack);
4705 }
4706}
4707
1c79356b
A
4708/*
4709 * Collect new round-trip time estimate
4710 * and update averages and current timeout.
4711 */
4712static void
39236c6e
A
4713tcp_xmit_timer(register struct tcpcb *tp, int rtt,
4714 u_int32_t tsecr, tcp_seq th_ack)
1c79356b
A
4715{
4716 register int delta;
4717
39236c6e
A
4718 if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
4719 if (SEQ_GT(th_ack, tp->snd_una) &&
4720 SEQ_LEQ(th_ack, tp->snd_max) &&
4721 (tsecr == 0 ||
4722 TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
4723 /*
4724 * We received a new ACk after a
4725 * spurious timeout. Adapt retransmission
4726 * timer as described in rfc 4015.
4727 */
4728 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
4729 tp->t_badrexmt_time = 0;
4730 tp->t_srtt = max(tp->t_srtt_prev, rtt);
4731 tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
4732 tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
4733 tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
4734
4735 if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
4736 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
4737
4738 goto compute_rto;
4739 } else {
4740 return;
4741 }
4742 }
4743
1c79356b
A
4744 tcpstat.tcps_rttupdated++;
4745 tp->t_rttupdated++;
6d2010ae
A
4746
4747 if (rtt > 0) {
4748 tp->t_rttcur = rtt;
4749 update_base_rtt(tp, rtt);
4750 }
4751
1c79356b
A
4752 if (tp->t_srtt != 0) {
4753 /*
4754 * srtt is stored as fixed point with 5 bits after the
6d2010ae 4755 * binary point (i.e., scaled by 32). The following magic
1c79356b
A
4756 * is equivalent to the smoothing algorithm in rfc793 with
4757 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
6d2010ae
A
4758 * point).
4759 *
39236c6e
A
4760 * Freebsd adjusts rtt to origin 0 by subtracting 1
4761 * from the provided rtt value. This was required because
4762 * of the way t_rtttime was initiailised to 1 before.
4763 * Since we changed t_rtttime to be based on
6d2010ae 4764 * tcp_now, this extra adjustment is not needed.
1c79356b 4765 */
6d2010ae 4766 delta = (rtt << TCP_DELTA_SHIFT)
1c79356b
A
4767 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
4768
4769 if ((tp->t_srtt += delta) <= 0)
4770 tp->t_srtt = 1;
4771
4772 /*
4773 * We accumulate a smoothed rtt variance (actually, a
4774 * smoothed mean difference), then set the retransmit
4775 * timer to smoothed rtt + 4 times the smoothed variance.
4776 * rttvar is stored as fixed point with 4 bits after the
4777 * binary point (scaled by 16). The following is
4778 * equivalent to rfc793 smoothing with an alpha of .75
4779 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
4780 * rfc793's wired-in beta.
4781 */
4782 if (delta < 0)
4783 delta = -delta;
4784 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
4785 if ((tp->t_rttvar += delta) <= 0)
4786 tp->t_rttvar = 1;
316670eb
A
4787 if (tp->t_rttbest == 0 ||
4788 tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
4789 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
1c79356b
A
4790 } else {
4791 /*
4792 * No rtt measurement yet - use the unsmoothed rtt.
4793 * Set the variance to half the rtt (so our first
4794 * retransmit happens at 3*rtt).
4795 */
4796 tp->t_srtt = rtt << TCP_RTT_SHIFT;
4797 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
4798 }
39236c6e
A
4799
4800compute_rto:
4801 nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
4802 tp->t_rttvar);
9bccf70c 4803 tp->t_rtttime = 0;
1c79356b 4804 tp->t_rxtshift = 0;
39236c6e 4805 tp->t_rxtstart = 0;
1c79356b
A
4806
4807 /*
4808 * the retransmit should happen at rtt + 4 * rttvar.
4809 * Because of the way we do the smoothing, srtt and rttvar
4810 * will each average +1/2 tick of bias. When we compute
4811 * the retransmit timer, we want 1/2 tick of rounding and
4812 * 1 extra tick because of +-1/2 tick uncertainty in the
4813 * firing of the timer. The bias will give us exactly the
4814 * 1.5 tick we need. But, because the bias is
4815 * statistical, we have to test that we don't drop below
4816 * the minimum feasible timer (which is 2 ticks).
4817 */
4818 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
6d2010ae
A
4819 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
4820 TCP_ADD_REXMTSLOP(tp));
1c79356b
A
4821
4822 /*
4823 * We received an ack for a packet that wasn't retransmitted;
4824 * it is probably safe to discard any error indications we've
4825 * received recently. This isn't quite right, but close enough
4826 * for now (a route might have failed after we sent a segment,
4827 * and the return path might not be symmetrical).
4828 */
4829 tp->t_softerror = 0;
4830}
4831
2d21ac55
A
4832static inline unsigned int
4833tcp_maxmtu(struct rtentry *rt)
4834{
4835 unsigned int maxmtu;
4836
b0d623f7 4837 RT_LOCK_ASSERT_HELD(rt);
2d21ac55
A
4838 if (rt->rt_rmx.rmx_mtu == 0)
4839 maxmtu = rt->rt_ifp->if_mtu;
4840 else
4841 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
4842
4843 return (maxmtu);
4844}
4845
#if INET6
/*
 * IPv6 flavor of tcp_maxmtu(): effective MTU for this (locked) route,
 * consulting the ND link MTU.  Takes the ND ifinfo lock around the
 * IN6_LINKMTU evaluation when the ifinfo is initialized.
 */
static inline unsigned int
tcp_maxmtu6(struct rtentry *rt)
{
	unsigned int maxmtu;
	struct nd_ifinfo *ndi;

	RT_LOCK_ASSERT_HELD(rt);
	lck_rw_lock_shared(nd_if_rwlock);
	/* Treat an uninitialized ifinfo as absent. */
	if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized)
		ndi = NULL;
	if (ndi != NULL)
		lck_mtx_lock(&ndi->lock);

	if (rt->rt_rmx.rmx_mtu == 0)
		maxmtu = IN6_LINKMTU(rt->rt_ifp);
	else
		maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));

	if (ndi != NULL)
		lck_mtx_unlock(&ndi->lock);
	lck_rw_done(nd_if_rwlock);

	return (maxmtu);
}
#endif
4870
1c79356b
A
4871/*
4872 * Determine a reasonable value for maxseg size.
4873 * If the route is known, check route for mtu.
4874 * If none, use an mss that can be handled on the outgoing
4875 * interface without forcing IP to fragment; if bigger than
4876 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
4877 * to utilize large mbufs. If no route is found, route has no mtu,
4878 * or the destination isn't local, use a default, hopefully conservative
4879 * size (usually 512 or the default IP max size, but no more than the mtu
4880 * of the interface), as we can't discover anything about intervening
4881 * gateways or networks. We also initialize the congestion/slow start
fe8ab488
A
4882 * window. While looking at the routing entry, we also initialize
4883 * other path-dependent parameters from pre-set or cached values
4884 * in the routing entry.
1c79356b
A
4885 *
4886 * Also take into account the space needed for options that we
4887 * send regularly. Make maxseg shorter by that amount to assure
4888 * that we can send maxseg amount of data even when the options
4889 * are present. Store the upper limit of the length of options plus
4890 * data in maxopd.
4891 *
4892 * NOTE that this routine is only called when we process an incoming
4893 * segment, for outgoing segments only tcp_mssopt is called.
4894 *
1c79356b
A
4895 */
4896void
c910b4d9 4897tcp_mss(tp, offer, input_ifscope)
1c79356b
A
4898 struct tcpcb *tp;
4899 int offer;
c910b4d9 4900 unsigned int input_ifscope;
1c79356b
A
4901{
4902 register struct rtentry *rt;
4903 struct ifnet *ifp;
4904 register int rtt, mss;
b0d623f7 4905 u_int32_t bufsize;
1c79356b
A
4906 struct inpcb *inp;
4907 struct socket *so;
4908 struct rmxp_tao *taop;
4909 int origoffer = offer;
b0d623f7 4910 u_int32_t sb_max_corrected;
2d21ac55 4911 int isnetlocal = 0;
1c79356b 4912#if INET6
9bccf70c
A
4913 int isipv6;
4914 int min_protoh;
4915#endif
1c79356b
A
4916
4917 inp = tp->t_inpcb;
9bccf70c
A
4918#if INET6
4919 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
4920 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
4921 : sizeof (struct tcpiphdr);
4922#else
4923#define min_protoh (sizeof (struct tcpiphdr))
4924#endif
b0d623f7 4925
1c79356b 4926#if INET6
2d21ac55 4927 if (isipv6) {
6d2010ae 4928 rt = tcp_rtlookup6(inp, input_ifscope);
2d21ac55 4929 }
1c79356b
A
4930 else
4931#endif /* INET6 */
2d21ac55 4932 {
c910b4d9 4933 rt = tcp_rtlookup(inp, input_ifscope);
2d21ac55 4934 }
6d2010ae
A
4935 isnetlocal = (tp->t_flags & TF_LOCAL);
4936
1c79356b
A
4937 if (rt == NULL) {
4938 tp->t_maxopd = tp->t_maxseg =
4939#if INET6
4940 isipv6 ? tcp_v6mssdflt :
4941#endif /* INET6 */
4942 tcp_mssdflt;
4943 return;
4944 }
4945 ifp = rt->rt_ifp;
d12e1678
A
4946 /*
4947 * Slower link window correction:
fe8ab488
A
4948 * If a value is specificied for slowlink_wsize use it for
4949 * PPP links believed to be on a serial modem (speed <128Kbps).
4950 * Excludes 9600bps as it is the default value adversized
4951 * by pseudo-devices over ppp.
d12e1678
A
4952 */
4953 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
4954 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
4955 tp->t_flags |= TF_SLOWLINK;
4956 }
1c79356b
A
4957 so = inp->inp_socket;
4958
4959 taop = rmx_taop(rt->rt_rmx);
4960 /*
4961 * Offer == -1 means that we didn't receive SYN yet,
4962 * use cached value in that case;
4963 */
4964 if (offer == -1)
4965 offer = taop->tao_mssopt;
4966 /*
4967 * Offer == 0 means that there was no MSS on the SYN segment,
4968 * in this case we use tcp_mssdflt.
4969 */
4970 if (offer == 0)
4971 offer =
4972#if INET6
4973 isipv6 ? tcp_v6mssdflt :
4974#endif /* INET6 */
4975 tcp_mssdflt;
e5568f75
A
4976 else {
4977 /*
4978 * Prevent DoS attack with too small MSS. Round up
4979 * to at least minmss.
4980 */
4981 offer = max(offer, tcp_minmss);
1c79356b
A
4982 /*
4983 * Sanity check: make sure that maxopd will be large
4984 * enough to allow some data on segments even is the
4985 * all the option space is used (40bytes). Otherwise
4986 * funny things may happen in tcp_output.
4987 */
4988 offer = max(offer, 64);
e5568f75 4989 }
1c79356b
A
4990 taop->tao_mssopt = offer;
4991
4992 /*
4993 * While we're here, check if there's an initial rtt
4994 * or rttvar. Convert from the route-table units
4995 * to scaled multiples of the slow timeout timer.
4996 */
316670eb
A
4997 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
4998 tcp_getrt_rtt(tp, rt);
4999 } else {
6d2010ae 5000 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
316670eb 5001 }
2d21ac55 5002
9bccf70c 5003#if INET6
2d21ac55
A
5004 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5005#else
5006 mss = tcp_maxmtu(rt);
9bccf70c 5007#endif
2d21ac55
A
5008 mss -= min_protoh;
5009
5010 if (rt->rt_rmx.rmx_mtu == 0) {
1c79356b
A
5011#if INET6
5012 if (isipv6) {
2d21ac55 5013 if (!isnetlocal)
1c79356b
A
5014 mss = min(mss, tcp_v6mssdflt);
5015 } else
5016#endif /* INET6 */
2d21ac55 5017 if (!isnetlocal)
1c79356b
A
5018 mss = min(mss, tcp_mssdflt);
5019 }
2d21ac55 5020
1c79356b
A
5021 mss = min(mss, offer);
5022 /*
5023 * maxopd stores the maximum length of data AND options
5024 * in a segment; maxseg is the amount of data in a normal
5025 * segment. We need to store this value (maxopd) apart
5026 * from maxseg, because now every segment carries options
5027 * and thus we normally have somewhat less data in segments.
5028 */
5029 tp->t_maxopd = mss;
5030
5031 /*
8ad349bb
A
5032 * origoffer==-1 indicates, that no segments were received yet.
5033 * In this case we just guess.
1c79356b 5034 */
8ad349bb 5035 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1c79356b
A
5036 (origoffer == -1 ||
5037 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
5038 mss -= TCPOLEN_TSTAMP_APPA;
1c79356b 5039
39236c6e
A
5040#if MPTCP
5041 mss -= mptcp_adj_mss(tp, FALSE);
5042#endif /* MPTCP */
5043 tp->t_maxseg = mss;
5044
2d21ac55
A
5045 /*
5046 * Calculate corrected value for sb_max; ensure to upgrade the
5047 * numerator for large sb_max values else it will overflow.
5048 */
5049 sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
5050
1c79356b 5051 /*
55e303ae
A
5052 * If there's a pipesize (ie loopback), change the socket
5053 * buffer to that size only if it's bigger than the current
5054 * sockbuf size. Make the socket buffers an integral
1c79356b
A
5055 * number of mss units; if the mss is larger than
5056 * the socket buffer, decrease the mss.
5057 */
5058#if RTV_SPIPE
55e303ae
A
5059 bufsize = rt->rt_rmx.rmx_sendpipe;
5060 if (bufsize < so->so_snd.sb_hiwat)
1c79356b
A
5061#endif
5062 bufsize = so->so_snd.sb_hiwat;
5063 if (bufsize < mss)
5064 mss = bufsize;
5065 else {
2d21ac55
A
5066 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5067 if (bufsize > sb_max_corrected)
5068 bufsize = sb_max_corrected;
1c79356b
A
5069 (void)sbreserve(&so->so_snd, bufsize);
5070 }
5071 tp->t_maxseg = mss;
5072
5073#if RTV_RPIPE
55e303ae
A
5074 bufsize = rt->rt_rmx.rmx_recvpipe;
5075 if (bufsize < so->so_rcv.sb_hiwat)
1c79356b
A
5076#endif
5077 bufsize = so->so_rcv.sb_hiwat;
5078 if (bufsize > mss) {
2d21ac55
A
5079 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5080 if (bufsize > sb_max_corrected)
5081 bufsize = sb_max_corrected;
1c79356b
A
5082 (void)sbreserve(&so->so_rcv, bufsize);
5083 }
9bccf70c 5084
6d2010ae 5085 set_tcp_stream_priority(so);
1c79356b
A
5086
5087 if (rt->rt_rmx.rmx_ssthresh) {
5088 /*
5089 * There's some sort of gateway or interface
5090 * buffer limit on the path. Use this to set
fe8ab488
A
5091 * slow-start threshold, but set the threshold to
5092 * no less than 2*mss.
1c79356b
A
5093 */
5094 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
5095 tcpstat.tcps_usedssthresh++;
b0d623f7 5096 } else {
cf7d32b8 5097 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
b0d623f7 5098 }
cf7d32b8 5099
6d2010ae
A
5100 /*
5101 * Set the slow-start flight size depending on whether this
5102 * is a local network or not.
5103 */
5104 if (CC_ALGO(tp)->cwnd_init != NULL)
5105 CC_ALGO(tp)->cwnd_init(tp);
5106
fe8ab488 5107 tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);
6d2010ae 5108
b0d623f7
A
5109 /* Route locked during lookup above */
5110 RT_UNLOCK(rt);
1c79356b
A
5111}
5112
5113/*
5114 * Determine the MSS option to send on an outgoing SYN.
5115 */
5116int
9bccf70c 5117tcp_mssopt(tp)
1c79356b 5118 struct tcpcb *tp;
1c79356b
A
5119{
5120 struct rtentry *rt;
8ad349bb 5121 int mss;
1c79356b 5122#if INET6
9bccf70c
A
5123 int isipv6;
5124 int min_protoh;
5125#endif
1c79356b 5126
9bccf70c
A
5127#if INET6
5128 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5129 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5130 : sizeof (struct tcpiphdr);
5131#else
5132#define min_protoh (sizeof (struct tcpiphdr))
5133#endif
b0d623f7 5134
1c79356b
A
5135#if INET6
5136 if (isipv6)
6d2010ae 5137 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
1c79356b
A
5138 else
5139#endif /* INET6 */
c910b4d9 5140 rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
2d21ac55 5141 if (rt == NULL) {
2d21ac55 5142 return (
1c79356b
A
5143#if INET6
5144 isipv6 ? tcp_v6mssdflt :
5145#endif /* INET6 */
2d21ac55
A
5146 tcp_mssdflt);
5147 }
d12e1678
A
5148 /*
5149 * Slower link window correction:
5150 * If a value is specificied for slowlink_wsize use it for PPP links
5151 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
5152 * it is the default value adversized by pseudo-devices over ppp.
5153 */
5154 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5155 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
5156 tp->t_flags |= TF_SLOWLINK;
5157 }
1c79356b 5158
8ad349bb 5159#if INET6
2d21ac55
A
5160 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5161#else
5162 mss = tcp_maxmtu(rt);
8ad349bb 5163#endif
b0d623f7
A
5164 /* Route locked during lookup above */
5165 RT_UNLOCK(rt);
8ad349bb 5166 return (mss - min_protoh);
9bccf70c
A
5167}
5168
/*
 * When a partial ACK arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, the retransmission timer is forced
 * to be started again.
 */
8ad349bb
A
5175static void
5176tcp_newreno_partial_ack(tp, th)
9bccf70c
A
5177 struct tcpcb *tp;
5178 struct tcphdr *th;
5179{
9bccf70c 5180 tcp_seq onxt = tp->snd_nxt;
b0d623f7 5181 u_int32_t ocwnd = tp->snd_cwnd;
9bccf70c 5182 tp->t_timer[TCPT_REXMT] = 0;
fe8ab488 5183 tp->t_timer[TCPT_PTO] = 0;
9bccf70c
A
5184 tp->t_rtttime = 0;
5185 tp->snd_nxt = th->th_ack;
5186 /*
5187 * Set snd_cwnd to one segment beyond acknowledged offset
5188 * (tp->snd_una has not yet been updated when this function
5189 * is called)
5190 */
39236c6e 5191 tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
91447636 5192 tp->t_flags |= TF_ACKNOW;
9bccf70c
A
5193 (void) tcp_output(tp);
5194 tp->snd_cwnd = ocwnd;
5195 if (SEQ_GT(onxt, tp->snd_nxt))
5196 tp->snd_nxt = onxt;
5197 /*
5198 * Partial window deflation. Relies on fact that tp->snd_una
5199 * not updated yet.
5200 */
39236c6e
A
5201 if (tp->snd_cwnd > BYTES_ACKED(th, tp))
5202 tp->snd_cwnd -= BYTES_ACKED(th, tp);
2d21ac55
A
5203 else
5204 tp->snd_cwnd = 0;
5205 tp->snd_cwnd += tp->t_maxseg;
5206
1c79356b 5207}
91447636
A
5208
/*
 * Drop a random TCP connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * The listening TCP socket "head" must be locked.
 */
5218static int
2d21ac55 5219tcp_dropdropablreq(struct socket *head)
91447636 5220{
2d21ac55 5221 struct socket *so, *sonext;
91447636 5222 unsigned int i, j, qlen;
39236c6e
A
5223 static u_int32_t rnd = 0;
5224 static u_int64_t old_runtime;
91447636 5225 static unsigned int cur_cnt, old_cnt;
39236c6e 5226 u_int64_t now_sec;
91447636 5227 struct inpcb *inp = NULL;
3a60a9f5 5228 struct tcpcb *tp;
2d21ac55
A
5229
5230 if ((head->so_options & SO_ACCEPTCONN) == 0)
39236c6e
A
5231 return (0);
5232
5233 if (TAILQ_EMPTY(&head->so_incomp))
5234 return (0);
5235
5236 /*
5237 * Check if there is any socket in the incomp queue
5238 * that is closed because of a reset from the peer and is
5239 * waiting to be garbage collected. If so, pick that as
5240 * the victim
5241 */
5242 TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
5243 inp = sotoinpcb(so);
5244 tp = intotcpcb(inp);
5245 if (tp != NULL && tp->t_state == TCPS_CLOSED &&
5246 so->so_head != NULL &&
5247 (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
5248 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) {
5249 /*
5250 * The listen socket is already locked but we
5251 * can lock this socket here without lock ordering
5252 * issues because it is in the incomp queue and
5253 * is not visible to others.
5254 */
5255 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5256 so->so_usecount++;
5257 goto found_victim;
5258 } else {
5259 continue;
5260 }
5261 }
5262 }
2d21ac55
A
5263
5264 so = TAILQ_FIRST(&head->so_incomp);
2d21ac55 5265
39236c6e
A
5266 now_sec = net_uptime();
5267 if ((i = (now_sec - old_runtime)) != 0) {
5268 old_runtime = now_sec;
91447636
A
5269 old_cnt = cur_cnt / i;
5270 cur_cnt = 0;
5271 }
5272
91447636
A
5273
5274 qlen = head->so_incqlen;
39236c6e
A
5275 if (rnd == 0)
5276 rnd = RandomULong();
5277
91447636
A
5278 if (++cur_cnt > qlen || old_cnt > qlen) {
5279 rnd = (314159 * rnd + 66329) & 0xffff;
5280 j = ((qlen + 1) * rnd) >> 16;
5281
5282 while (j-- && so)
5283 so = TAILQ_NEXT(so, so_list);
5284 }
2d21ac55 5285 /* Find a connection that is not already closing (or being served) */
91447636
A
5286 while (so) {
5287 inp = (struct inpcb *)so->so_pcb;
5288
2d21ac55
A
5289 sonext = TAILQ_NEXT(so, so_list);
5290
39236c6e
A
5291 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0)
5292 != WNT_STOPUSING) {
5293 /*
5294 * Avoid the issue of a socket being accepted
5295 * by one input thread and being dropped by
5296 * another input thread. If we can't get a hold
5297 * on this mutex, then grab the next socket in
5298 * line.
2d21ac55 5299 */
6d2010ae 5300 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
2d21ac55 5301 so->so_usecount++;
6d2010ae 5302 if ((so->so_usecount == 2) &&
39236c6e
A
5303 (so->so_state & SS_INCOMP) &&
5304 !(so->so_flags & SOF_INCOMP_INPROGRESS)) {
2d21ac55 5305 break;
39236c6e
A
5306 } else {
5307 /*
5308 * don't use if being accepted or
5309 * used in any other way
5310 */
2d21ac55
A
5311 in_pcb_checkstate(inp, WNT_RELEASE, 1);
5312 tcp_unlock(so, 1, 0);
5313 }
39236c6e
A
5314 } else {
5315 /*
5316 * do not try to lock the inp in
5317 * in_pcb_checkstate because the lock
5318 * is already held in some other thread.
b0d623f7
A
5319 * Only drop the inp_wntcnt reference.
5320 */
5321 in_pcb_checkstate(inp, WNT_RELEASE, 1);
5322 }
2d21ac55
A
5323 }
5324 so = sonext;
91447636 5325
91447636 5326 }
39236c6e
A
5327 if (so == NULL) {
5328 return (0);
5329 }
2d21ac55 5330
2d21ac55
A
5331 /* Makes sure socket is still in the right state to be discarded */
5332
91447636
A
5333 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
5334 tcp_unlock(so, 1, 0);
39236c6e 5335 return (0);
91447636 5336 }
2d21ac55 5337
39236c6e 5338found_victim:
2d21ac55 5339 if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
6d2010ae 5340 /* do not discard: that socket is being accepted */
2d21ac55 5341 tcp_unlock(so, 1, 0);
39236c6e 5342 return (0);
2d21ac55
A
5343 }
5344
6d2010ae
A
5345 TAILQ_REMOVE(&head->so_incomp, so, so_list);
5346 tcp_unlock(head, 0, 0);
91447636 5347
6d2010ae 5348 lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
3a60a9f5 5349 tp = sototcpcb(so);
2d21ac55 5350 so->so_flags |= SOF_OVERFLOW;
6d2010ae
A
5351 so->so_head = NULL;
5352
5353 tcp_close(tp);
6d2010ae 5354 if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
39236c6e
A
5355 /*
5356 * Some one has a wantcnt on this pcb. Since WNT_ACQUIRE
6d2010ae
A
5357 * doesn't require a lock, it could have happened while
5358 * we are holding the lock. This pcb will have to
5359 * be garbage collected later.
5360 * Release the reference held for so_incomp queue
5361 */
5362 so->so_usecount--;
6d2010ae
A
5363 tcp_unlock(so, 1, 0);
5364 } else {
39236c6e
A
5365 /*
5366 * Unlock this socket and leave the reference on.
5367 * We need to acquire the pcbinfo lock in order to
5368 * fully dispose it off
6d2010ae
A
5369 */
5370 tcp_unlock(so, 0, 0);
5371
39236c6e 5372 lck_rw_lock_exclusive(tcbinfo.ipi_lock);
6d2010ae
A
5373
5374 tcp_lock(so, 0, 0);
6d2010ae
A
5375 /* Release the reference held for so_incomp queue */
5376 so->so_usecount--;
5377
5378 if (so->so_usecount != 1 ||
39236c6e
A
5379 (inp->inp_wantcnt > 0 &&
5380 inp->inp_wantcnt != WNT_STOPUSING)) {
5381 /*
5382 * There is an extra wantcount or usecount
5383 * that must have been added when the socket
5384 * was unlocked. This socket will have to be
5385 * garbage collected later
6d2010ae
A
5386 */
5387 tcp_unlock(so, 1, 0);
5388 } else {
5389
5390 /* Drop the reference held for this function */
5391 so->so_usecount--;
5392
5393 in_pcbdispose(inp);
5394 }
39236c6e 5395 lck_rw_done(tcbinfo.ipi_lock);
6d2010ae 5396 }
3a60a9f5 5397 tcpstat.tcps_drops++;
6d2010ae 5398
3a60a9f5 5399 tcp_lock(head, 0, 0);
2d21ac55
A
5400 head->so_incqlen--;
5401 head->so_qlen--;
6d2010ae
A
5402 return(1);
5403}
5404
5405/* Set background congestion control on a socket */
5406void
5407tcp_set_background_cc(struct socket *so)
5408{
5409 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
5410}
5411
5412/* Set foreground congestion control on a socket */
5413void
5414tcp_set_foreground_cc(struct socket *so)
5415{
fe8ab488
A
5416 if (tcp_use_newreno)
5417 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
5418 else
5419 tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
6d2010ae
A
5420}
5421
5422static void
5423tcp_set_new_cc(struct socket *so, uint16_t cc_index)
5424{
5425 struct inpcb *inp = sotoinpcb(so);
5426 struct tcpcb *tp = intotcpcb(inp);
39236c6e 5427 u_char old_cc_index = 0;
6d2010ae
A
5428 if (tp->tcp_cc_index != cc_index) {
5429
5430 old_cc_index = tp->tcp_cc_index;
5431
5432 if (CC_ALGO(tp)->cleanup != NULL)
5433 CC_ALGO(tp)->cleanup(tp);
5434 tp->tcp_cc_index = cc_index;
5435
fe8ab488
A
5436 tcp_cc_allocate_state(tp);
5437
5438 if (CC_ALGO(tp)->switch_to != NULL)
5439 CC_ALGO(tp)->switch_to(tp, old_cc_index);
5440
5441 tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
6d2010ae 5442 }
91447636
A
5443}
5444
316670eb
A
5445void
5446tcp_set_recv_bg(struct socket *so)
5447{
5448 if (!IS_TCP_RECV_BG(so))
5449 so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG;
39236c6e
A
5450
5451 /* Unset Large Receive Offload on background sockets */
5452 so_set_lro(so, SO_TC_BK);
316670eb
A
5453}
5454
5455void
5456tcp_clear_recv_bg(struct socket *so)
5457{
5458 if (IS_TCP_RECV_BG(so))
5459 so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
39236c6e
A
5460
5461 /*
5462 * Set/unset use of Large Receive Offload depending on
5463 * the traffic class
5464 */
5465 so_set_lro(so, so->so_traffic_class);
316670eb
A
5466}
5467
5468void
5469inp_fc_unthrottle_tcp(struct inpcb *inp)
5470{
5471 struct tcpcb *tp = inp->inp_ppcb;
5472 /*
5473 * Back off the slow-start threshold and enter
5474 * congestion avoidance phase
5475 */
5476 if (CC_ALGO(tp)->pre_fr != NULL)
5477 CC_ALGO(tp)->pre_fr(tp);
5478
5479 tp->snd_cwnd = tp->snd_ssthresh;
5480
5481 /*
5482 * Restart counting for ABC as we changed the
5483 * congestion window just now.
5484 */
5485 tp->t_bytes_acked = 0;
5486
5487 /* Reset retransmit shift as we know that the reason
5488 * for delay in sending a packet is due to flow
5489 * control on the outgoing interface. There is no need
5490 * to backoff retransmit timer.
5491 */
5492 tp->t_rxtshift = 0;
5493
5494 /*
5495 * Start the output stream again. Since we are
5496 * not retransmitting data, do not reset the
5497 * retransmit timer or rtt calculation.
5498 */
5499 tcp_output(tp);
5500}
39236c6e 5501
8ad349bb
A
5502static int
5503tcp_getstat SYSCTL_HANDLER_ARGS
5504{
2d21ac55 5505#pragma unused(oidp, arg1, arg2)
8ad349bb
A
5506
5507 int error;
5508
39236c6e
A
5509 proc_t caller = PROC_NULL;
5510 proc_t caller_parent = PROC_NULL;
5511 char command_name[MAXCOMLEN + 1] = "";
5512 char parent_name[MAXCOMLEN + 1] = "";
5513
5514 if ((caller = proc_self()) != PROC_NULL) {
5515 /* get process name */
5516 strlcpy(command_name, caller->p_comm, sizeof(command_name));
5517
5518 /* get parent process name if possible */
5519 if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
5520 strlcpy(parent_name, caller_parent->p_comm,
5521 sizeof(parent_name));
5522 proc_rele(caller_parent);
5523 }
5524
5525 if ((escape_str(command_name, strlen(command_name),
5526 sizeof(command_name)) == 0) &&
5527 (escape_str(parent_name, strlen(parent_name),
5528 sizeof(parent_name)) == 0)) {
5529 kern_asl_msg(LOG_DEBUG, "messagetracer",
5530 5,
5531 "com.apple.message.domain",
5532 "com.apple.kernel.tcpstat", /* 1 */
5533 "com.apple.message.signature",
5534 "tcpstat", /* 2 */
5535 "com.apple.message.signature2", command_name, /* 3 */
5536 "com.apple.message.signature3", parent_name, /* 4 */
5537 "com.apple.message.summarize", "YES", /* 5 */
5538 NULL);
5539 }
5540 }
5541 if (caller != PROC_NULL)
5542 proc_rele(caller);
5543
8ad349bb
A
5544 if (req->oldptr == 0) {
5545 req->oldlen= (size_t)sizeof(struct tcpstat);
5546 }
5547
2d21ac55 5548 error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
8ad349bb
A
5549
5550 return (error);
5551
5552}
5553
39236c6e
A
/*
 * Checksum extended TCP header and data.
 *
 * Validate (or finish computing) the checksum of an inbound TCP segment.
 *
 *	af	address family of the packet (AF_INET or AF_INET6)
 *	m	received mbuf chain; m_pkthdr carries any hardware checksum
 *		state (csum_flags, csum_rx_val, csum_rx_start)
 *	th	pointer to the TCP header within the packet
 *	off	offset of the TCP header from the start of the packet
 *	tlen	length of the TCP header plus payload
 *
 * Returns 0 if the checksum verifies, -1 (after bumping tcps_rcvbadsum and
 * the per-interface badformat counter) if it does not.
 *
 * On exit th->th_sum holds the folded 1's-complement result; a value of 0
 * means the segment checksummed correctly — that is what the final test
 * at the bottom of the function checks.
 */
int
tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
{
	struct ifnet *ifp = m->m_pkthdr.rcvif;

	switch (af) {
	case AF_INET: {
		struct ip *ip = mtod(m, struct ip *);
		/*
		 * ipovly overlays the IP header so that zeroing ih_x1 and
		 * writing ih_len turns the header in place into the TCP
		 * pseudo-header for the software-checksum path below.
		 */
		struct ipovly *ipov = (struct ipovly *)ip;

		/* Software LRO already verified the checksum for us. */
		if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
			return (0);

		/*
		 * Trust a hardware-provided checksum value only when RX
		 * hardware checksumming is enabled, or the packet arrived
		 * over loopback (where the "hardware" value is synthesized
		 * in software), and the driver marked the value valid.
		 */
		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				/* Full checksum incl. pseudo-header: use as-is. */
				th->th_sum = m->m_pkthdr.csum_rx_val;
			} else {
				/*
				 * Partial sum: hardware summed from
				 * csum_rx_start, which may not coincide with
				 * the TCP header at `off`; adjust, then add
				 * the pseudo-header ourselves.
				 */
				uint16_t sum = m->m_pkthdr.csum_rx_val;
				uint16_t start = m->m_pkthdr.csum_rx_start;

				/*
				 * Perform 1's complement adjustment of octets
				 * that got included/excluded in the hardware-
				 * calculated checksum value.  Ignore cases
				 * where the value includes or excludes the IP
				 * header span, as the sum for those octets
				 * would already be 0xffff and thus no-op.
				 *
				 * NOTE(review): `(off - start) != off` is
				 * arithmetically equivalent to `start != 0`,
				 * so the second clause is redundant — confirm
				 * intent against later xnu revisions.
				 */
				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
				    start != 0 && (off - start) != off) {
#if BYTE_ORDER != BIG_ENDIAN
					/*
					 * If the adjustment span reaches into
					 * the IP header, restore ip_len/ip_off
					 * to network order for the summation;
					 * assumes input processing byteswapped
					 * them to host order earlier — TODO
					 * confirm caller state.
					 */
					if (start < off) {
						HTONS(ip->ip_len);
						HTONS(ip->ip_off);
					}
#endif
					/* callee folds in sum */
					sum = m_adj_sum16(m, start, off, sum);
#if BYTE_ORDER != BIG_ENDIAN
					/* Swap back to host order for the caller. */
					if (start < off) {
						NTOHS(ip->ip_off);
						NTOHS(ip->ip_len);
					}
#endif
				}

				/* callee folds in sum */
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    sum + htonl(tlen + IPPROTO_TCP));
			}
			/*
			 * A correct segment sums to 0xffff; XOR-fold so that
			 * "good" becomes 0 for the common test at the end.
			 */
			th->th_sum ^= 0xffff;
		} else {
			/*
			 * Software checksum: build the pseudo-header in
			 * place via the ipovly overlay (zero the fields
			 * overlapping ih_x1, substitute the TCP length for
			 * ih_len), run in_cksum() over IP header span plus
			 * payload, then restore the saved bytes.
			 */
			uint16_t ip_sum;
			int len;
			char b[9];

			bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
			bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
			ip_sum = ipov->ih_len;
			ipov->ih_len = (u_short)tlen;
#if BYTE_ORDER != BIG_ENDIAN
			HTONS(ipov->ih_len);
#endif
			len = sizeof (struct ip) + tlen;
			th->th_sum = in_cksum(m, len);
			bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
			ipov->ih_len = ip_sum;

			tcp_in_cksum_stats(len);
		}
		break;
	}
#if INET6
	case AF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

		/* Software LRO already verified the checksum for us. */
		if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
			return (0);

		/* Same hardware-checksum trust policy as the IPv4 case. */
		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				th->th_sum = m->m_pkthdr.csum_rx_val;
			} else {
				uint16_t sum = m->m_pkthdr.csum_rx_val;
				uint16_t start = m->m_pkthdr.csum_rx_start;

				/*
				 * Perform 1's complement adjustment of octets
				 * that got included/excluded in the hardware-
				 * calculated checksum value.
				 *
				 * The kernel stores scoped-address zone IDs
				 * in s6_addr16[1]; that word is not on the
				 * wire, so clear it while adjusting and
				 * restore it afterwards.  `s'/`d' are written
				 * and read under the same IN6_IS_SCOPE_EMBED
				 * conditions, so no uninitialized read occurs
				 * — assumes IN6_IS_SCOPE_EMBED() does not
				 * depend on the zone word just cleared; TODO
				 * confirm.
				 */
				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
				    start != off) {
					uint16_t s, d;

					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
						s = ip6->ip6_src.s6_addr16[1];
						ip6->ip6_src.s6_addr16[1] = 0 ;
					}
					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
						d = ip6->ip6_dst.s6_addr16[1];
						ip6->ip6_dst.s6_addr16[1] = 0;
					}

					/* callee folds in sum */
					sum = m_adj_sum16(m, start, off, sum);

					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
						ip6->ip6_src.s6_addr16[1] = s;
					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
						ip6->ip6_dst.s6_addr16[1] = d;
				}

				th->th_sum = in6_pseudo(
				    &ip6->ip6_src, &ip6->ip6_dst,
				    sum + htonl(tlen + IPPROTO_TCP));
			}
			/* Fold so that a valid segment reads as 0 (see above). */
			th->th_sum ^= 0xffff;
		} else {
			/* Software path: in6_cksum handles the pseudo-header. */
			tcp_in6_cksum_stats(tlen);
			th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
		}
		break;
	}
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/* Non-zero folded sum means the segment is corrupt: count and drop. */
	if (th->th_sum != 0) {
		tcpstat.tcps_rcvbadsum++;
		IF_TCP_STATINC(ifp, badformat);
		return (-1);
	}

	return (0);
}
5700
fe8ab488
A
/*
 * net.inet.tcp.stats: read-only struct tcpstat snapshot, served by
 * tcp_getstat (defined earlier in this file).
 */
SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
5704
2d21ac55
A
5705static int
5706sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
5707{
5708#pragma unused(arg1, arg2)
5709
5710 int error, val = tcprexmtthresh;
5711
5712 error = sysctl_handle_int(oidp, &val, 0, req);
5713 if (error || !req->newptr)
5714 return (error);
5715
5716 /*
5717 * Constrain the number of duplicate ACKs
5718 * to consider for TCP fast retransmit
5719 * to either 2 or 3
5720 */
5721
5722 if (val < 2 || val > 3)
5723 return (EINVAL);
5724
5725 tcprexmtthresh = val;
5726
5727 return (0);
5728}
91447636 5729
/*
 * net.inet.tcp.rexmt_thresh: read/write tunable backed by tcprexmtthresh;
 * writes are validated by sysctl_rexmtthresh above (only 2 or 3 accepted).
 */
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit");