bsd/netinet/tcp_input.c

   1 /*
   2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/kernel.h>
  73 #include <sys/sysctl.h>
  74 #include <sys/malloc.h>
  75 #include <sys/mbuf.h>
  76 #include <sys/proc.h>           /* for proc0 declaration */
  77 #include <sys/protosw.h>
  78 #include <sys/socket.h>
  79 #include <sys/socketvar.h>
  80 #include <sys/syslog.h>
  81 #include <sys/mcache.h>
  82 #include <sys/kasl.h>
  83 #include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */
  84
  85 #include <machine/endian.h>
  86
  87 #include <net/if.h>
  88 #include <net/if_types.h>
  89 #include <net/route.h>
  90 #include <net/ntstat.h>
  91 #include <net/dlil.h>
  92
  93 #include <netinet/in.h>
  94 #include <netinet/in_systm.h>
  95 #include <netinet/ip.h>
  96 #include <netinet/ip_icmp.h>    /* for ICMP_BANDLIM             */
  97 #include <netinet/in_var.h>
  98 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM     */
  99 #include <netinet/in_pcb.h>
 100 #include <netinet/ip_var.h>
 101 #include <mach/sdt.h>
 102 #if INET6
 103 #include <netinet/ip6.h>
 104 #include <netinet/icmp6.h>
 105 #include <netinet6/nd6.h>
 106 #include <netinet6/ip6_var.h>
 107 #include <netinet6/in6_pcb.h>
 108 #endif
 109 #include <netinet/tcp.h>
 110 #include <netinet/tcp_cache.h>
 111 #include <netinet/tcp_fsm.h>
 112 #include <netinet/tcp_seq.h>
 113 #include <netinet/tcp_timer.h>
 114 #include <netinet/tcp_var.h>
 115 #include <netinet/tcp_cc.h>
 116 #include <dev/random/randomdev.h>
 117 #include <kern/zalloc.h>
 118 #if INET6
 119 #include <netinet6/tcp6_var.h>
 120 #endif
 121 #include <netinet/tcpip.h>
 122 #if TCPDEBUG
 123 #include <netinet/tcp_debug.h>
 124 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
 125 struct tcphdr tcp_savetcp;
 126 #endif /* TCPDEBUG */
 127
 128 #if IPSEC
 129 #include <netinet6/ipsec.h>
 130 #if INET6
 131 #include <netinet6/ipsec6.h>
 132 #endif
 133 #include <netkey/key.h>
 134 #endif /*IPSEC*/
 135
 136 #if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
 137 #include <security/mac_framework.h>
 138 #endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
 139
 140 #include <sys/kdebug.h>
 141 #include <netinet/lro_ext.h>
 142 #if MPTCP
 143 #include <netinet/mptcp_var.h>
 144 #include <netinet/mptcp.h>
 145 #include <netinet/mptcp_opt.h>
 146 #endif /* MPTCP */
 147
 148 #include <corecrypto/ccaes.h>
 149
  /*
   * Trace-point codes for this file, all built with NETDBG_CODE() in the
   * DBG_NETTCP class.
   * NOTE(review): the trace calls that consume them are outside this chunk.
   */
  150 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 0)
  151 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 2)
  152 #define DBG_FNC_TCP_INPUT       NETDBG_CODE(DBG_NETTCP, (3 << 8))
  153 #define DBG_FNC_TCP_NEWCONN     NETDBG_CODE(DBG_NETTCP, (7 << 8))
  154
  /* NOTE(review): no use of tcp_ccgen is visible in this chunk -- confirm it is still needed. */
  155 tcp_cc  tcp_ccgen;
  156
  /* Global TCP statistics; code in this file increments its tcps_* counters. */
  157 struct  tcpstat tcpstat;
 158
  /*
   * Tunables registered under the _net_inet_tcp sysctl node.  Each
   * SYSCTL_INT() below binds one variable; all of them are read-write
   * (CTLFLAG_RW).
   */

  /*
   * Logging selector.  Besides plain on/off, the log_in_vain_log() macro in
   * this file treats the values 3 and 4 specially when IPFIREWALL is built
   * in and fw_verbose == 2.
   */
  159 static int log_in_vain = 0;
  160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain,
  161     CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0,
  162     "Log all incoming TCP connections");
  163
  164 static int blackhole = 0;
  165 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole,
  166     CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0,
  167     "Do not send RST when dropping refused connections");
  168
  /*
   * NOTE(review): the default is 3, so this is presumably a mode selector
   * rather than a boolean; the meaning of each value is not defined in this
   * chunk -- confirm.
   */
  169 int tcp_delack_enabled = 3;
  170 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack,
  171     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_delack_enabled, 0,
  172     "Delay ACK to try and piggyback it onto a data packet");
  173
  174 int tcp_lq_overflow = 1;
  175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow,
  176     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_lq_overflow, 0,
  177     "Listen Queue Overflow");
  178
  179 int tcp_recv_bg = 0;
  180 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
  181     &tcp_recv_bg, 0, "Receive background");
  182
  /* Only present when the kernel is built with TCP_DROP_SYNFIN. */
  183 #if TCP_DROP_SYNFIN
  184 static int drop_synfin = 1;
  185 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin,
  186     CTLFLAG_RW | CTLFLAG_LOCKED, &drop_synfin, 0,
  187     "Drop TCP packets with SYN+FIN set");
  188 #endif
 189
  /* Sub-node "reass" of _net_inet_tcp; holds the reassembly-queue counter below. */
  190 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
  191     "TCP Segment Reassembly Queue");
  192
  /*
   * Incremented by tcp_reass() each time it drops a segment because the
   * per-connection queue limit was reached.  Exported read-only (CTLFLAG_RD).
   */
  193 static int tcp_reass_overflows = 0;
  194 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
  195     CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0,
  196     "Global number of TCP Segment Reassembly Queue Overflows");
  197
  198
  199 __private_extern__ int slowlink_wsize = 8192;
  200 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize,
  201     CTLFLAG_RW | CTLFLAG_LOCKED,
  202     &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
  203
  204 int maxseg_unacked = 8;
  205 SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked,
  206     CTLFLAG_RW | CTLFLAG_LOCKED, &maxseg_unacked, 0,
  207     "Maximum number of outstanding segments left unacked");
  208
  /*
   * NOTE(review): registered with an empty description string; going by the
   * name and the sibling knob below this presumably enables RFC 3465
   * Appropriate Byte Counting -- confirm.
   */
  209 int tcp_do_rfc3465 = 1;
  210 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
  211     &tcp_do_rfc3465, 0, "");
  212
  213 int tcp_do_rfc3465_lim2 = 1;
  214 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2,
  215     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465_lim2, 0,
  216     "Appropriate bytes counting w/ L=2*SMSS");
 217
  /* NOTE(review): not exported through sysctl here and not used in this chunk. */
  218 int rtt_samples_per_slot = 20;
  219
  /*
   * Inter-arrival jitter (IAJ) tunables:
   *  - tcp_allowed_iaj is the floor compute_iaj_meat() applies to the
   *    tolerated jitter; always adjustable via recv_allowed_iaj.
   *  - tcp_acc_iaj_high_thresh is one of the two terms of MAX_ACC_IAJ, the
   *    cap on the accumulated jitter.
   *  - tcp_autorcvbuf_inc_shift is described by its sysctl string below.
   * The last two are adjustable only on DEVELOPMENT or DEBUG kernels.
   */
  220 int tcp_allowed_iaj = ALLOWED_IAJ;
  221 int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
  222 u_int32_t tcp_autorcvbuf_inc_shift = 3;
  223 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj,
  224     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_allowed_iaj, 0,
  225     "Allowed inter-packet arrival jiter");
  226 #if (DEVELOPMENT || DEBUG)
  227 SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh,
  228     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_high_thresh, 0,
  229     "Used in calculating maximum accumulated IAJ");
  230
  231 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift,
  232     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_inc_shift, 0,
  233     "Shift for increment in receive socket buffer size");
  234 #endif /* (DEVELOPMENT || DEBUG) */
 235
  /* Receive-buffer auto-tuning switch (see the sysctl description). */
  236 u_int32_t tcp_do_autorcvbuf = 1;
  237 SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf,
  238     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_autorcvbuf, 0,
  239     "Enable automatic socket buffer tuning");
  240
  /*
   * Upper bound for the receive socket buffer, 512 KiB by default.
   * tcp_reass() also uses (tcp_autorcvbuf_max >> 10) as the ceiling of its
   * per-connection reassembly queue limit.
   */
  241 u_int32_t tcp_autorcvbuf_max = 512 * 1024;
  242 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax,
  243     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max, 0,
  244     "Maximum receive socket buffer size");
  245
  /* Software LRO (large receive offload) knobs; all default to values set here. */
  246 int sw_lro = 0;
  247 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
  248         &sw_lro, 0, "Used to coalesce TCP packets");
  249
  250 int lrodebug = 0;
  251 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg,
  252     CTLFLAG_RW | CTLFLAG_LOCKED, &lrodebug, 0,
  253     "Used to debug SW LRO");
  254
  255 int lro_start = 4;
  256 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt,
  257     CTLFLAG_RW | CTLFLAG_LOCKED, &lro_start, 0,
  258     "Segments for starting LRO computed as power of 2");
 259
  /* Defined in another file; not referenced in this chunk. */
  260 extern int tcp_do_autosendbuf;
  261
  /*
   * Feature switches, all enabled by default.  They are always compiled in,
   * but the sysctls that change them exist only on DEVELOPMENT or DEBUG
   * kernels (see the #if below).
   */
  262 int limited_txmt = 1;
  263 int early_rexmt = 1;
  264 int sack_ackadv = 1;
  265 int tcp_dsack_enable = 1;
  266
  267 #if (DEVELOPMENT || DEBUG)
  268 SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit,
  269     CTLFLAG_RW | CTLFLAG_LOCKED, &limited_txmt, 0,
  270     "Enable limited transmit");
  271
  272 SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt,
  273     CTLFLAG_RW | CTLFLAG_LOCKED, &early_rexmt, 0,
  274     "Enable Early Retransmit");
  275
  276 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv,
  277     CTLFLAG_RW | CTLFLAG_LOCKED, &sack_ackadv, 0,
  278     "Use SACK with cumulative ack advancement as a dupack");
  279
  280 SYSCTL_INT(_net_inet_tcp, OID_AUTO, dsack_enable,
  281     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_dsack_enable, 0,
  282     "use DSACK TCP option to report duplicate segments");
  283 #endif /* (DEVELOPMENT || DEBUG) */
  284
  /*
   * Only present when built with CONFIG_IFEF_NOWINDOWSCALE.  The sysctl has
   * an empty description string.
   * NOTE(review): presumably makes TCP honour a per-interface
   * "no window scale" flag -- confirm against the code that reads it.
   */
  285 #if CONFIG_IFEF_NOWINDOWSCALE
  286 int tcp_obey_ifef_nowindowscale = 0;
  287 SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale,
  288     CTLFLAG_RW | CTLFLAG_LOCKED,
  289     &tcp_obey_ifef_nowindowscale, 0, "");
  290 #endif
 291
  /*
   * Defined elsewhere.  tcp_acc_iaj_react_limit is the second term of
   * MAX_ACC_IAJ in compute_iaj_meat(); the other two are not used in this
   * chunk.
   */
  292 extern int tcp_TCPTV_MIN;
  293 extern int tcp_acc_iaj_high;
  294 extern int tcp_acc_iaj_react_limit;
  295
  /*
   * NOTE(review): presumably the duplicate-ACK count that triggers fast
   * retransmit; its use is outside this chunk -- confirm.
   */
  296 int tcprexmtthresh = 3;
  297
  /*
   * TCP clock value.  Code in this file reads it to timestamp jitter samples
   * and to measure elapsed time for bandwidth estimation.
   */
  298 u_int32_t tcp_now;
  299 struct timeval tcp_uptime;      /* uptime when tcp_now was last updated */
  300 lck_spin_t *tcp_uptime_lock;    /* Used to synchronize updates to tcp_now */
  301
  /* Global PCB list head and PCB info for TCP; tcb6 is only an alias for tcb. */
  302 struct inpcbhead tcb;
  303 #define tcb6    tcb  /* for KAME src sync over BSD*'s */
  304 struct inpcbinfo tcbinfo;
 305
  /*
   * Forward declarations for helpers defined later in this file.  All are
   * file-local (static) except compute_iaj() and tcp_sbrcv_trim(), which
   * have external linkage.
   */
  306 static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
  307     struct tcpopt *);
  308 static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int);
  309 static void tcp_pulloutofband(struct socket *,
  310     struct tcphdr *, struct mbuf *, int);
  311 static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
  312     struct ifnet *);
  313 static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
  314 static inline unsigned int tcp_maxmtu(struct rtentry *);
  315 static inline int tcp_stretch_ack_enable(struct tcpcb *tp);
  316 static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
  317
  /*
   * Inter-arrival jitter helpers.  The parameter names in the
   * update_iaj_state() prototype (tlen, reset_size) differ from those in its
   * definition (size, rst_size); they denote the same arguments.
   */
  318 #if TRAFFIC_MGT
  319 static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
  320     int reset_size);
  321 void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
  322 static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
  323 #endif /* TRAFFIC_MGT */
  324
  325 #if INET6
  326 static inline unsigned int tcp_maxmtu6(struct rtentry *);
  327 #endif
  328
  /* Socket-buffer sizing helpers. */
  329 static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
  330     struct tcpopt *to, u_int32_t tlen);
  331
  332 void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
  333 static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
  334 static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
  335 static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
  336     u_int32_t newsize, u_int32_t idealsize);
  /* Retransmission and RTT helpers. */
  337 static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
  338 static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
  339     struct tcphdr *th);
  340 static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
  341 static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
  342     struct tcpopt *to);
  343 /*
  344  * Constants used for resizing the receive socket buffer
  345  * when timestamps are not supported.
  346  */
  /*
   * NOTE(review): units are not stated here; the names suggest a time
   * quantum and a byte threshold -- confirm at the point of use.
   */
  347 #define TCPTV_RCVNOTS_QUANTUM 100
  348 #define TCP_RCVNOTS_BYTELEVEL 204800
  349
  350 /*
  351  * Constants used for limiting early retransmits
  352  * to 10 per minute: the window is expressed in TCP_RETRANSHZ ticks.
  353  */
  354 #define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
  355 #define TCP_EARLY_REXMT_LIMIT 10
 356
  /* Firewall logging hooks defined outside this file. */
  357 extern void ipfwsyslog( int level, const char *format,...);
  358 extern int fw_verbose;
  359
  360 #if IPFIREWALL
  361 extern void ipfw_stealth_stats_incr_tcp(void);
  362
  /*
   * log_in_vain_log(a): emit one "connection in vain" record.
   *
   * `a' must be a complete, parenthesized argument list, because it is
   * pasted directly after the function name (ipfwsyslog a / log a).
   *
   * With IPFIREWALL:
   *   log_in_vain == 3 and fw_verbose == 2  -> ipfwsyslog a
   *   log_in_vain == 4 and fw_verbose == 2  -> nothing is logged; only
   *                                            ipfw_stealth_stats_incr_tcp()
   *                                            is called
   *   anything else                         -> log a
   * Without IPFIREWALL it is always log a.
   *
   * CAUTION: the macro expands to a bare { ... } block, not
   * do { ... } while (0).  Writing `log_in_vain_log((...));' as the
   * unbraced body of an `if' that has an `else' will not compile, because
   * the trailing `;' ends the if statement.
   */
  363 #define log_in_vain_log( a ) {            \
  364         if ( (log_in_vain == 3 ) && (fw_verbose == 2)) {        /* Apple logging, log to ipfw.log */ \
  365                 ipfwsyslog a ;  \
  366         } else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) {   \
  367                 ipfw_stealth_stats_incr_tcp();                    \
  368         }                       \
  369         else log a ;            \
  370 }
  371 #else
  372 #define log_in_vain_log( a ) { log a; }
  373 #endif
 374
  /*
   * Receive-side tunables initialised from compile-time constants.  Only
   * tcp_rcvsspktcnt is exported through sysctl here.
   */
  375 int tcp_rcvunackwin = TCPTV_UNACKWIN;
  376 int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
  377 int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
  378 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
  379         &tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");
  380
  /*
   * DELAY_ACK(tp, th) is true only when the connection's congestion-control
   * module (CC_ALGO(tp)) supplies a delay_ack hook and that hook returns
   * non-zero for this segment.  Note that `tp' is evaluated more than once.
   */
  381 #define DELAY_ACK(tp, th) \
  382         (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
  383
  /* Further forward declarations; the two without `static' have external linkage. */
  384 static int tcp_dropdropablreq(struct socket *head);
  385 static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
  386 static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
  387 void tcp_set_background_cc(struct socket *so);
  388 void tcp_set_foreground_cc(struct socket *so);
  389 static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
  390 static void tcp_bwmeas_check(struct tcpcb *tp);
 391
 392 #if TRAFFIC_MGT
 393 void
 394 reset_acc_iaj(struct tcpcb *tp)
 395 {
 396         tp->acc_iaj = 0;
 397         tp->iaj_rwintop = 0;
 398         CLEAR_IAJ_STATE(tp);
 399 }
 400
 401 static inline void
 402 update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
 403 {
 404         if (rst_size > 0)
 405                 tp->iaj_size = 0;
 406         if (tp->iaj_size == 0 || size >= tp->iaj_size) {
 407                 tp->iaj_size = size;
 408                 tp->iaj_rcv_ts = tcp_now;
 409                 tp->iaj_small_pkt = 0;
 410         }
 411 }
 412
 413 /* For every 32 bit unsigned integer(v), this function will find the
 414  * largest integer n such that (n*n <= v). This takes at most 16 iterations
 415  * irrespective of the value of v and does not involve multiplications.
 416  */
 417 static inline int
 418 isqrt(unsigned int val) {
 419         unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
 420         unsigned int temp, g=0, b=0x8000, bshft=15;
 421         if ( val <= 100) {
 422                 for (g = 0; g <= 10; ++g) {
 423                         if (sqrt_cache[g] > val) {
 424                                 g--;
 425                                 break;
 426                         } else if (sqrt_cache[g] == val) {
 427                                 break;
 428                         }
 429                 }
 430         } else {
 431                 do {
 432                         temp = (((g << 1) + b) << (bshft--));
 433                         if (val >= temp) {
 434                                 g += b;
 435                                 val -= temp;
 436                         }
 437                         b >>= 1;
 438                 } while ( b > 0 && val > 0);
 439         }
 440         return(g);
 441 }
 442
 443 /*
 444 * With LRO, roughly estimate the inter arrival time between
 445 * each sub coalesced packet as an average. Count the delay
 446 * cur_iaj to be the delay between the last packet received
 447 * and the first packet of the LRO stream. Due to round off errors
 448 * cur_iaj may be the same as lro_delay_factor. Averaging has
 449 * round off errors too. lro_delay_factor may be close to 0
 450 * in steady state leading to lower values fed to compute_iaj_meat.
 451 */
 452 void
 453 compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor)
 454 {
 455         uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
 456         uint32_t timediff = 0;
 457
 458         if (cur_iaj >= lro_delay_factor) {
 459                 cur_iaj = cur_iaj - lro_delay_factor;
 460         }
 461
 462         compute_iaj_meat(tp, cur_iaj);
 463
 464         if (nlropkts <= 1)
 465                 return;
 466
 467         nlropkts--;
 468
 469         timediff = lro_delay_factor/nlropkts;
 470
 471         while (nlropkts > 0)
 472         {
 473                 compute_iaj_meat(tp, timediff);
 474                 nlropkts--;
 475         }
 476 }
 477
 478 static
 479 void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
 480 {
 481         /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
 482          * throttle the receive window to a minimum of MIN_IAJ_WIN packets
 483          */
 484 #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
 485 #define IAJ_DIV_SHIFT 4
 486 #define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
 487
 488         uint32_t allowed_iaj, acc_iaj = 0;
 489
 490         uint32_t mean, temp;
 491         int32_t cur_iaj_dev;
 492
 493         cur_iaj_dev = (cur_iaj - tp->avg_iaj);
 494
 495         /* Allow a jitter of "allowed_iaj" milliseconds. Some connections
 496          * may have a constant jitter more than that. We detect this by
 497          * using standard deviation.
 498          */
 499         allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
 500         if (allowed_iaj < tcp_allowed_iaj)
 501                 allowed_iaj = tcp_allowed_iaj;
 502
 503         /* Initially when the connection starts, the senders congestion
 504          * window is small. During this period we avoid throttling a
 505          * connection because we do not have a good starting point for
 506          * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
 507          * the first few packets.
 508          */
 509         if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
 510                 if ( cur_iaj <= allowed_iaj ) {
 511                         if (tp->acc_iaj >= 2)
 512                                 acc_iaj = tp->acc_iaj - 2;
 513                         else
 514                                 acc_iaj = 0;
 515
 516                 } else {
 517                         acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
 518                 }
 519
 520                 if (acc_iaj > MAX_ACC_IAJ)
 521                         acc_iaj = MAX_ACC_IAJ;
 522                 tp->acc_iaj = acc_iaj;
 523         }
 524
 525         /* Compute weighted average where the history has a weight of
 526          * 15 out of 16 and the current value has a weight of 1 out of 16.
 527          * This will make the short-term measurements have more weight.
 528          *
 529          * The addition of 8 will help to round-up the value
 530          * instead of round-down
 531          */
 532         tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
 533                 + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
 534
 535         /* Compute Root-mean-square of deviation where mean is a weighted
 536          * average as described above.
 537          */
 538         temp = tp->std_dev_iaj * tp->std_dev_iaj;
 539         mean = (((temp << IAJ_DIV_SHIFT) - temp)
 540                 + (cur_iaj_dev * cur_iaj_dev)
 541                 + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
 542
 543         tp->std_dev_iaj = isqrt(mean);
 544
 545         DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
 546                 uint32_t, allowed_iaj);
 547
 548         return;
 549 }
 550 #endif /* TRAFFIC_MGT */
 551
 552 /* Check if enough amount of data has been acknowledged since
 553  * bw measurement was started
 554  */
 555 static void
 556 tcp_bwmeas_check(struct tcpcb *tp)
 557 {
 558         int32_t bw_meas_bytes;
 559         uint32_t bw, bytes, elapsed_time;
 560         bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
 561         if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 &&
 562             bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
 563                 bytes = bw_meas_bytes;
 564                 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
 565                 if (elapsed_time > 0) {
 566                         bw = bytes / elapsed_time;
 567                         if ( bw > 0) {
 568                                 if (tp->t_bwmeas->bw_sndbw > 0) {
 569                                         tp->t_bwmeas->bw_sndbw =
 570                                             (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3;
 571                                 } else {
 572                                         tp->t_bwmeas->bw_sndbw = bw;
 573                                 }
 574                         }
 575                 }
 576                 tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
 577         }
 578 }
 579
 580 static int
 581 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
 582     struct ifnet *ifp)
 583 {
 584         struct tseg_qent *q;
 585         struct tseg_qent *p = NULL;
 586         struct tseg_qent *nq;
 587         struct tseg_qent *te = NULL;
 588         struct inpcb *inp = tp->t_inpcb;
 589         struct socket *so = inp->inp_socket;
 590         int flags = 0;
 591         int dowakeup = 0;
 592         struct mbuf *oodata = NULL;
 593         int copy_oodata = 0;
 594         u_int16_t qlimit;
 595         boolean_t cell = IFNET_IS_CELLULAR(ifp);
 596         boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
 597         boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
 598         boolean_t dsack_set = FALSE;
 599
 600         /*
  601          * Call with th == NULL once the connection is established to
 602          * force pre-ESTABLISHED data up to user socket.
 603          */
 604         if (th == NULL)
 605                 goto present;
 606
 607         /*
 608          * If the reassembly queue already has entries or if we are going
 609          * to add a new one, then the connection has reached a loss state.
 610          * Reset the stretch-ack algorithm at this point.
 611          */
 612         tcp_reset_stretch_ack(tp);
 613
 614 #if TRAFFIC_MGT
 615         if (tp->acc_iaj > 0)
 616                 reset_acc_iaj(tp);
 617 #endif /* TRAFFIC_MGT */
 618
 619         /*
 620          * Limit the number of segments in the reassembly queue to prevent
 621          * holding on to too many segments (and thus running out of mbufs).
 622          * Make sure to let the missing segment through which caused this
 623          * queue.  Always keep one global queue entry spare to be able to
 624          * process the missing segment.
 625          */
 626         qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
 627             tcp_autorcvbuf_max >> 10);
 628         if (th->th_seq != tp->rcv_nxt &&
 629             (tp->t_reassqlen + 1) >= qlimit) {
 630                 tcp_reass_overflows++;
 631                 tcpstat.tcps_rcvmemdrop++;
 632                 m_freem(m);
 633                 *tlenp = 0;
 634                 return (0);
 635         }
 636
 637         /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
 638         te = (struct tseg_qent *) zalloc(tcp_reass_zone);
 639         if (te == NULL) {
 640                 tcpstat.tcps_rcvmemdrop++;
 641                 m_freem(m);
 642                 return (0);
 643         }
 644         tp->t_reassqlen++;
 645
 646         /*
 647          * Find a segment which begins after this one does.
 648          */
 649         LIST_FOREACH(q, &tp->t_segq, tqe_q) {
 650                 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
 651                         break;
 652                 p = q;
 653         }
 654
 655         /*
 656          * If there is a preceding segment, it may provide some of
 657          * our data already.  If so, drop the data from the incoming
 658          * segment.  If it provides all of our data, drop us.
 659          */
 660         if (p != NULL) {
 661                 int i;
 662                 /* conversion to int (in i) handles seq wraparound */
 663                 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
 664                 if (i > 0) {
 665                         if (TCP_DSACK_ENABLED(tp) && i > 1) {
 666                                 /*
  667                                  * Note duplicate data sequence numbers
 668                                  * to report in DSACK option
 669                                  */
 670                                 tp->t_dsack_lseq = th->th_seq;
 671                                 tp->t_dsack_rseq = th->th_seq +
 672                                     min(i, *tlenp);
 673
 674                                 /*
 675                                  * Report only the first part of partial/
 676                                  * non-contiguous duplicate sequence space
 677                                  */
 678                                 dsack_set = TRUE;
 679                         }
 680                         if (i >= *tlenp) {
 681                                 tcpstat.tcps_rcvduppack++;
 682                                 tcpstat.tcps_rcvdupbyte += *tlenp;
 683                                 if (nstat_collect) {
 684                                         nstat_route_rx(inp->inp_route.ro_rt,
 685                                             1, *tlenp,
 686                                             NSTAT_RX_FLAG_DUPLICATE);
 687                                         INP_ADD_STAT(inp, cell, wifi, wired,
 688                                             rxpackets, 1);
 689                                         INP_ADD_STAT(inp, cell, wifi, wired,
 690                                             rxbytes, *tlenp);
 691                                         tp->t_stat.rxduplicatebytes += *tlenp;
 692                                 }
 693                                 m_freem(m);
 694                                 zfree(tcp_reass_zone, te);
 695                                 te = NULL;
 696                                 tp->t_reassqlen--;
 697                                 /*
 698                                  * Try to present any queued data
 699                                  * at the left window edge to the user.
 700                                  * This is needed after the 3-WHS
 701                                  * completes.
 702                                  */
 703                                 goto present;
 704                         }
 705                         m_adj(m, i);
 706                         *tlenp -= i;
 707                         th->th_seq += i;
 708                 }
 709         }
 710         tp->t_rcvoopack++;
 711         tcpstat.tcps_rcvoopack++;
 712         tcpstat.tcps_rcvoobyte += *tlenp;
 713         if (nstat_collect) {
 714                 nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
 715                     NSTAT_RX_FLAG_OUT_OF_ORDER);
 716                 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
 717                 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
 718                 tp->t_stat.rxoutoforderbytes += *tlenp;
 719         }
 720
 721         /*
 722          * While we overlap succeeding segments trim them or,
 723          * if they are completely covered, dequeue them.
 724          */
 725         while (q) {
 726                 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
 727                 if (i <= 0)
 728                         break;
 729
 730                 /*
 731                  * Report only the first part of partial/non-contiguous
 732                  * duplicate segment in dsack option. The variable
 733                  * dsack_set will be true if a previous entry has some of
 734                  * the duplicate sequence space.
 735                  */
 736                 if (TCP_DSACK_ENABLED(tp) && i > 1 && !dsack_set) {
 737                         if (tp->t_dsack_lseq == 0) {
 738                                 tp->t_dsack_lseq = q->tqe_th->th_seq;
 739                                 tp->t_dsack_rseq =
 740                                     tp->t_dsack_lseq + min(i, q->tqe_len);
 741                         } else {
 742                                 /*
  743                                  * this segment overlaps data in multiple
 744                                  * entries in the reassembly queue, move
 745                                  * the right sequence number further.
 746                                  */
 747                                 tp->t_dsack_rseq =
 748                                     tp->t_dsack_rseq + min(i, q->tqe_len);
 749                         }
 750                 }
 751                 if (i < q->tqe_len) {
 752                         q->tqe_th->th_seq += i;
 753                         q->tqe_len -= i;
 754                         m_adj(q->tqe_m, i);
 755                         break;
 756                 }
 757
 758                 nq = LIST_NEXT(q, tqe_q);
 759                 LIST_REMOVE(q, tqe_q);
 760                 m_freem(q->tqe_m);
 761                 zfree(tcp_reass_zone, q);
 762                 tp->t_reassqlen--;
 763                 q = nq;
 764         }
 765
 766         /* Insert the new segment queue entry into place. */
 767         te->tqe_m = m;
 768         te->tqe_th = th;
 769         te->tqe_len = *tlenp;
 770
 771         if (p == NULL) {
 772                 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
 773         } else {
 774                 LIST_INSERT_AFTER(p, te, tqe_q);
 775         }
 776
 777         /*
 778          * New out-of-order data exists, and is pointed to by
 779          * queue entry te. Set copy_oodata to 1 so out-of-order data
 780          * can be copied off to sockbuf after in-order data
 781          * is copied off.
 782          */
 783         if (!(so->so_state & SS_CANTRCVMORE))
 784                 copy_oodata = 1;
 785
 786 present:
 787         /*
 788          * Present data to user, advancing rcv_nxt through
 789          * completed sequence space.
 790          */
 791         if (!TCPS_HAVEESTABLISHED(tp->t_state))
 792                 return (0);
 793         q = LIST_FIRST(&tp->t_segq);
 794         if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
 795                 /* Stop using LRO once out of order packets arrive */
 796                 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
 797                         tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
 798                                 th->th_dport, th->th_sport);
 799                         tp->t_flagsext &= ~TF_LRO_OFFLOADED;
 800                 }
 801
 802                 /*
 803                  * continue processing if out-of-order data
 804                  * can be delivered
 805                  */
 806                 if (q && (so->so_flags & SOF_ENABLE_MSGS))
 807                         goto msg_unordered_delivery;
 808
 809                 return (0);
 810         }
 811
 812         /* lost packet was recovered, so ooo data can be returned */
 813         tcpstat.tcps_recovered_pkts++;
 814
 815         do {
 816                 tp->rcv_nxt += q->tqe_len;
 817                 flags = q->tqe_th->th_flags & TH_FIN;
 818                 nq = LIST_NEXT(q, tqe_q);
 819                 LIST_REMOVE(q, tqe_q);
 820                 if (so->so_state & SS_CANTRCVMORE) {
 821                         m_freem(q->tqe_m);
 822                 } else {
 823                         so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
 824                         if (so->so_flags & SOF_ENABLE_MSGS) {
 825                                 /*
 826                                  * Append the inorder data as a message to the
 827                                  * receive socket buffer. Also check to see if
 828                                  * the data we are about to deliver is the same
 829                                  * data that we wanted to pass up to the user
 830                                  * out of order. If so, reset copy_oodata --
 831                                  * the received data filled a gap, and
 832                                  * is now in order!
 833                                  */
 834                                 if (q == te)
 835                                         copy_oodata = 0;
 836                         }
 837                         if (sbappendstream_rcvdemux(so, q->tqe_m,
 838                             q->tqe_th->th_seq - (tp->irs + 1), 0))
 839                                 dowakeup = 1;
 840                         if (tp->t_flagsext & TF_LRO_OFFLOADED) {
 841                                 tcp_update_lro_seq(tp->rcv_nxt,
 842                                  inp->inp_laddr, inp->inp_faddr,
 843                                  th->th_dport, th->th_sport);
 844                         }
 845                 }
 846                 zfree(tcp_reass_zone, q);
 847                 tp->t_reassqlen--;
 848                 q = nq;
 849         } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
 850
 851 #if INET6
 852         if ((inp->inp_vflag & INP_IPV6) != 0) {
 853
 854                 KERNEL_DEBUG(DBG_LAYER_BEG,
 855                      ((inp->inp_fport << 16) | inp->inp_lport),
 856                      (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
 857                       (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
 858                      0,0,0);
 859         }
 860         else
 861 #endif
 862         {
 863                 KERNEL_DEBUG(DBG_LAYER_BEG,
 864                      ((inp->inp_fport << 16) | inp->inp_lport),
 865                      (((inp->inp_laddr.s_addr & 0xffff) << 16) |
 866                       (inp->inp_faddr.s_addr & 0xffff)),
 867                      0,0,0);
 868         }
 869
 870 msg_unordered_delivery:
 871         /* Deliver out-of-order data as a message */
 872         if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
 873                 /*
 874                  * make a copy of the mbuf to be delivered up to
 875                  * the user, and add it to the sockbuf
 876                  */
 877                 oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
 878                 if (oodata != NULL) {
 879                         if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
 880                                 te->tqe_th->th_seq - (tp->irs + 1), 1)) {
 881                                 dowakeup = 1;
 882                                 tcpstat.tcps_msg_unopkts++;
 883                         } else {
 884                                 tcpstat.tcps_msg_unoappendfail++;
 885                         }
 886                 }
 887         }
 888
 889         if (dowakeup)
 890                 sorwakeup(so); /* done with socket lock held */
 891         return (flags);
 892 }
 893
 894 /*
 895  * Reduce congestion window -- used when ECN is seen or when a tail loss
 896  * probe recovers the last packet.
 897  */
 898 static void
 899 tcp_reduce_congestion_window(
 900         struct tcpcb    *tp)
 901 {
 902         /*
 903          * If the current tcp cc module has
 904          * defined a hook for tasks to run
 905          * before entering FR, call it
 906          */
 907         if (CC_ALGO(tp)->pre_fr != NULL)
 908                 CC_ALGO(tp)->pre_fr(tp);
 909         ENTER_FASTRECOVERY(tp);
 910         if (tp->t_flags & TF_SENTFIN)
 911                 tp->snd_recover = tp->snd_max - 1;
 912         else
 913                 tp->snd_recover = tp->snd_max;
 914         tp->t_timer[TCPT_REXMT] = 0;
 915         tp->t_timer[TCPT_PTO] = 0;
 916         tp->t_rtttime = 0;
 917         if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
 918                 tcp_cc_adjust_nonvalidated_cwnd(tp);
 919         } else {
 920                 tp->snd_cwnd = tp->snd_ssthresh +
 921                     tp->t_maxseg * tcprexmtthresh;
 922         }
 923 }
 924
 925 /*
 926  * This function is called upon reception of data on a socket. It's purpose is
 927  * to handle the adaptive keepalive timers that monitor whether the connection
 928  * is making progress. First the adaptive read-timer, second the TFO probe-timer.
 929  *
 930  * The application wants to get an event if there is a stall during read.
 931  * Set the initial keepalive timeout to be equal to twice RTO.
 932  *
 933  * If the outgoing interface is in marginal conditions, we need to
 934  * enable read probes for that too.
 935  */
 936 static inline void
 937 tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
 938 {
 939         struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
 940
 941         if ((tp->t_adaptive_rtimo > 0 ||
 942             (outifp != NULL &&
 943             (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)))
 944             && tlen > 0 &&
 945             tp->t_state == TCPS_ESTABLISHED) {
 946                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
 947                         (TCP_REXMTVAL(tp) << 1));
 948                 tp->t_flagsext |= TF_DETECT_READSTALL;
 949                 tp->t_rtimo_probes = 0;
 950         }
 951 }
 952
 953 inline void
 954 tcp_keepalive_reset(struct tcpcb *tp)
 955 {
 956         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
 957                 TCP_CONN_KEEPIDLE(tp));
 958         tp->t_flagsext &= ~(TF_DETECT_READSTALL);
 959         tp->t_rtimo_probes = 0;
 960 }
 961
 962 /*
 963  * TCP input routine, follows pages 65-76 of the
 964  * protocol specification dated September, 1981 very closely.
 965  */
 966 #if INET6
 967 int
 968 tcp6_input(struct mbuf **mp, int *offp, int proto)
 969 {
 970 #pragma unused(proto)
 971         register struct mbuf *m = *mp;
 972         uint32_t ia6_flags;
 973         struct ifnet *ifp = m->m_pkthdr.rcvif;
 974
 975         IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
 976
 977         /* Expect 32-bit aligned data pointer on strict-align platforms */
 978         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
 979
 980         /*
 981          * draft-itojun-ipv6-tcp-to-anycast
 982          * better place to put this in?
 983          */
 984         if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
 985                 if (ia6_flags & IN6_IFF_ANYCAST) {
 986                         struct ip6_hdr *ip6;
 987
 988                         ip6 = mtod(m, struct ip6_hdr *);
 989                         icmp6_error(m, ICMP6_DST_UNREACH,
 990                             ICMP6_DST_UNREACH_ADDR,
 991                             (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 992
 993                         IF_TCP_STATINC(ifp, icmp6unreach);
 994
 995                         return (IPPROTO_DONE);
 996                 }
 997         }
 998
 999         tcp_input(m, *offp);
1000         return (IPPROTO_DONE);
1001 }
1002 #endif
1003
1004 /* Depending on the usage of mbuf space in the system, this function
1005  * will return true or false. This is used to determine if a socket
1006  * buffer can take more memory from the system for auto-tuning or not.
1007  */
1008 u_int8_t
1009 tcp_cansbgrow(struct sockbuf *sb)
1010 {
1011         /* Calculate the host level space limit in terms of MSIZE buffers.
1012          * We can use a maximum of half of the available mbuf space for
1013          * socket buffers.
1014          */
1015         u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
1016
1017         /* Calculate per sb limit in terms of bytes. We optimize this limit
1018          * for upto 16 socket buffers.
1019          */
1020
1021         u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
1022
1023         if ((total_sbmb_cnt < mblim) &&
1024                 (sb->sb_hiwat < sbspacelim)) {
1025                 return(1);
1026         } else {
1027                 OSIncrementAtomic64(&sbmb_limreached);
1028         }
1029         return(0);
1030 }
1031
1032 static void
1033 tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
1034         u_int32_t newsize, u_int32_t idealsize)
1035 {
1036
1037         /* newsize should not exceed max */
1038         newsize = min(newsize, tcp_autorcvbuf_max);
1039
1040         /* The receive window scale negotiated at the
1041          * beginning of the connection will also set a
1042          * limit on the socket buffer size
1043          */
1044         newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
1045
1046         /* Set new socket buffer size */
1047         if (newsize > sbrcv->sb_hiwat &&
1048                 (sbreserve(sbrcv, newsize) == 1)) {
1049                 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
1050                         (idealsize != 0) ? idealsize : newsize),
1051                         tcp_autorcvbuf_max);
1052
1053                 /* Again check the limit set by the advertised
1054                  * window scale
1055                  */
1056                 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
1057                         TCP_MAXWIN << tp->rcv_scale);
1058         }
1059 }
1060
1061 /*
1062  * This function is used to grow  a receive socket buffer. It
1063  * will take into account system-level memory usage and the
1064  * bandwidth available on the link to make a decision.
1065  */
1066 static void
1067 tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
1068         struct tcpopt *to, u_int32_t pktlen)
1069 {
1070         struct socket *so = sbrcv->sb_so;
1071
1072         /*
1073          * Do not grow the receive socket buffer if
1074          * - auto resizing is disabled, globally or on this socket
1075          * - the high water mark already reached the maximum
1076          * - the stream is in background and receive side is being
1077          * throttled
1078          * - if there are segments in reassembly queue indicating loss,
1079          * do not need to increase recv window during recovery as more
1080          * data is not going to be sent. A duplicate ack sent during
1081          * recovery should not change the receive window
1082          */
1083         if (tcp_do_autorcvbuf == 0 ||
1084                 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1085                 tcp_cansbgrow(sbrcv) == 0 ||
1086                 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
1087                 (tp->t_flagsext & TF_RECV_THROTTLE) ||
1088                 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
1089                 !LIST_EMPTY(&tp->t_segq)) {
1090                 /* Can not resize the socket buffer, just return */
1091                 goto out;
1092         }
1093
1094         if (TSTMP_GT(tcp_now,
1095                 tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
1096                 /* If there has been an idle period in the
1097                  * connection, just restart the measurement
1098                  */
1099                 goto out;
1100         }
1101
1102         if (!TSTMP_SUPPORTED(tp)) {
1103                 /*
1104                  * Timestamp option is not supported on this connection.
1105                  * If the connection reached a state to indicate that
1106                  * the receive socket buffer needs to grow, increase
1107                  * the high water mark.
1108                  */
1109                 if (TSTMP_GEQ(tcp_now,
1110                         tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
1111                         if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
1112                                 tcp_sbrcv_reserve(tp, sbrcv,
1113                                         tcp_autorcvbuf_max, 0);
1114                         }
1115                         goto out;
1116                 } else {
1117                         tp->rfbuf_cnt += pktlen;
1118                         return;
1119                 }
1120         } else if (to->to_tsecr != 0) {
1121                 /*
1122                  * If the timestamp shows that one RTT has
1123                  * completed, we can stop counting the
1124                  * bytes. Here we consider increasing
1125                  * the socket buffer if the bandwidth measured in
1126                  * last rtt, is more than half of sb_hiwat, this will
1127                  * help to scale the buffer according to the bandwidth
1128                  * on the link.
1129                  */
1130                 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
1131                         if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
1132                                 (sbrcv->sb_hiwat >> 1))) {
1133                                 int32_t rcvbuf_inc, min_incr;
1134                                 /*
1135                                  * Increment the receive window by a
1136                                  * multiple of maximum sized segments.
1137                                  * This will prevent a connection from
1138                                  * sending smaller segments on wire if it
1139                                  * is limited by the receive window.
1140                                  *
1141                                  * Set the ideal size based on current
1142                                  * bandwidth measurements. We set the
1143                                  * ideal size on receive socket buffer to
1144                                  * be twice the bandwidth delay product.
1145                                  */
1146                                 rcvbuf_inc = (tp->rfbuf_cnt << 1)
1147                                     - sbrcv->sb_hiwat;
1148
1149                                 /*
1150                                  * Make the increment equal to 8 segments
1151                                  * at least
1152                                  */
1153                                 min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1154                                 if (rcvbuf_inc < min_incr)
1155                                     rcvbuf_inc = min_incr;
1156
1157                                 rcvbuf_inc =
1158                                     (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
1159                                 tcp_sbrcv_reserve(tp, sbrcv,
1160                                         sbrcv->sb_hiwat + rcvbuf_inc,
1161                                         (tp->rfbuf_cnt * 2));
1162                         }
1163                         goto out;
1164                 } else {
1165                         tp->rfbuf_cnt += pktlen;
1166                         return;
1167                 }
1168         }
1169 out:
1170         /* Restart the measurement */
1171         tp->rfbuf_ts = 0;
1172         tp->rfbuf_cnt = 0;
1173         return;
1174 }
1175
1176 /* This function will trim the excess space added to the socket buffer
1177  * to help a slow-reading app. The ideal-size of a socket buffer depends
1178  * on the link bandwidth or it is set by an application and we aim to
1179  * reach that size.
1180  */
1181 void
1182 tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
1183         if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1184                 sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1185                 int32_t trim;
1186                 /* compute the difference between ideal and current sizes */
1187                 u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1188
1189                 /* Compute the maximum advertised window for
1190                  * this connection.
1191                  */
1192                 u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1193
1194                 /* How much can we trim the receive socket buffer?
1195                  * 1. it can not be trimmed beyond the max rcv win advertised
1196                  * 2. if possible, leave 1/16 of bandwidth*delay to
1197                  * avoid closing the win completely
1198                  */
1199                 u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1200
1201                 /* Sometimes leave can be zero, in that case leave at least
1202                  * a few segments worth of space.
1203                  */
1204                 if (leave == 0)
1205                         leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1206
1207                 trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
1208                 trim = imin(trim, (int32_t)diff);
1209
1210                 if (trim > 0)
1211                         sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1212         }
1213 }
1214
1215 /* We may need to trim the send socket buffer size for two reasons:
1216  * 1. if the rtt seen on the connection is climbing up, we do not
1217  * want to fill the buffers any more.
1218  * 2. if the congestion win on the socket backed off, there is no need
1219  * to hold more mbufs for that connection than what the cwnd will allow.
1220  */
1221 void
1222 tcp_sbsnd_trim(struct sockbuf *sbsnd) {
1223         if (tcp_do_autosendbuf == 1 &&
1224                 ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1225                         (SB_AUTOSIZE | SB_TRIM)) &&
1226                 (sbsnd->sb_idealsize > 0) &&
1227                 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1228                 u_int32_t trim = 0;
1229                 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1230                         trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1231                 } else {
1232                         trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1233                 }
1234                 sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1235         }
1236         if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
1237                 sbsnd->sb_flags &= ~(SB_TRIM);
1238 }
1239
1240 /*
1241  * If timestamp option was not negotiated on this connection
1242  * and this connection is on the receiving side of a stream
1243  * then we can not measure the delay on the link accurately.
1244  * Instead of enabling automatic receive socket buffer
1245  * resizing, just give more space to the receive socket buffer.
1246  */
1247 static inline void
1248 tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
1249         struct socket *so = tp->t_inpcb->inp_socket;
1250         u_int32_t newsize = 2 * tcp_recvspace;
1251         struct sockbuf *sbrcv = &so->so_rcv;
1252
1253         if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1254                 (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1255                 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
1256                 tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
1257         }
1258 }
1259
1260 /* A receiver will evaluate the flow of packets on a connection
1261  * to see if it can reduce ack traffic. The receiver will start
1262  * stretching acks if all of the following conditions are met:
1263  * 1. tcp_delack_enabled is set to 3
1264  * 2. If the bytes received in the last 100ms is greater than a threshold
1265  *      defined by maxseg_unacked
1266  * 3. If the connection has not been idle for tcp_maxrcvidle period.
1267  * 4. If the connection has seen enough packets to let the slow-start
1268  *      finish after connection establishment or after some packet loss.
1269  *
1270  * The receiver will stop stretching acks if there is congestion/reordering
1271  * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1272  * timer fires while stretching acks, it means that the packet flow has gone
1273  * below the threshold defined by maxseg_unacked and the receiver will stop
1274  * stretching acks. The receiver gets no indication when slow-start is completed
1275  * or when the connection reaches an idle state. That is why we use
1276  * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1277  * state.
1278  */
1279 static inline int
1280 tcp_stretch_ack_enable(struct tcpcb *tp)
1281 {
1282         if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) &&
1283                 tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
1284                 TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
1285                 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
1286                 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1287                 return(1);
1288         }
1289
1290         return(0);
1291 }
1292
1293 /*
1294  * Reset the state related to stretch-ack algorithm. This will make
1295  * the receiver generate an ack every other packet. The receiver
1296  * will start re-evaluating the rate at which packets come to decide
1297  * if it can benefit by lowering the ack traffic.
1298  */
1299 void
1300 tcp_reset_stretch_ack(struct tcpcb *tp)
1301 {
1302         tp->t_flags &= ~(TF_STRETCHACK);
1303         tp->rcv_by_unackwin = 0;
1304         tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1305
1306         /*
1307          * When there is packet loss or packet re-ordering or CWR due to
1308          * ECN, the sender's congestion window is reduced. In these states,
1309          * generate an ack for every other packet for some time to allow
1310          * the sender's congestion window to grow.
1311          */
1312         tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1313         tp->rcv_waitforss = 0;
1314 }
1315
1316 /*
1317  * The last packet was a retransmission, check if this ack
1318  * indicates that the retransmission was spurious.
1319  *
1320  * If the connection supports timestamps, we could use it to
1321  * detect if the last retransmit was not needed. Otherwise,
1322  * we check if the ACK arrived within RTT/2 window, then it
1323  * was a mistake to do the retransmit in the first place.
1324  *
1325  * This function will return 1 if it is a spurious retransmit,
1326  * 0 otherwise.
1327  */
1328 int
1329 tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
1330         struct tcpopt *to, u_int32_t rxtime)
1331 {
1332         int32_t tdiff, bad_rexmt_win;
1333         bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
1334
1335         /* If the ack has ECN CE bit, then cwnd has to be adjusted */
1336         if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE))
1337                 return (0);
1338         if (TSTMP_SUPPORTED(tp)) {
1339                 if (rxtime > 0 && (to->to_flags & TOF_TS)
1340                     && to->to_tsecr != 0
1341                     && TSTMP_LT(to->to_tsecr, rxtime))
1342                     return (1);
1343         } else {
1344                 if ((tp->t_rxtshift == 1
1345                     || (tp->t_flagsext & TF_SENT_TLPROBE))
1346                     && rxtime > 0) {
1347                         tdiff = (int32_t)(tcp_now - rxtime);
1348                         if (tdiff < bad_rexmt_win)
1349                                 return(1);
1350                 }
1351         }
1352         return(0);
1353 }
1354
1355
1356 /*
1357  * Restore congestion window state if a spurious timeout
1358  * was detected.
1359  */
1360 static void
1361 tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1362 {
1363         if (TSTMP_SUPPORTED(tp)) {
1364                 u_int32_t fsize, acked;
1365                 fsize = tp->snd_max - th->th_ack;
1366                 acked = BYTES_ACKED(th, tp);
1367
1368                 /*
1369                  * Implement bad retransmit recovery as
1370                  * described in RFC 4015.
1371                  */
1372                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1373
1374                 /* Initialize cwnd to the initial window */
1375                 if (CC_ALGO(tp)->cwnd_init != NULL)
1376                         CC_ALGO(tp)->cwnd_init(tp);
1377
1378                 tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1379
1380         } else {
1381                 tp->snd_cwnd = tp->snd_cwnd_prev;
1382                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1383                 if (tp->t_flags & TF_WASFRECOVERY)
1384                         ENTER_FASTRECOVERY(tp);
1385
1386                 /* Do not use the loss flight size in this case */
1387                 tp->t_lossflightsize = 0;
1388         }
1389         tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
1390         tp->snd_recover = tp->snd_recover_prev;
1391         tp->snd_nxt = tp->snd_max;
1392         tp->t_rxtshift = 0;
1393         tp->t_rxtstart = 0;
1394
1395         /* Fix send socket buffer to reflect the change in cwnd */
1396         tcp_bad_rexmt_fix_sndbuf(tp);
1397
1398         /*
1399          * This RTT might reflect the extra delay induced
1400          * by the network. Skip using this sample for RTO
1401          * calculation and mark the connection so we can
1402          * recompute RTT when the next eligible sample is
1403          * found.
1404          */
1405         tp->t_flagsext |= TF_RECOMPUTE_RTT;
1406         tp->t_badrexmt_time = tcp_now;
1407         tp->t_rtttime = 0;
1408 }
1409
1410 /*
1411  * If the previous packet was sent in retransmission timer, and it was
1412  * not needed, then restore the congestion window to the state before that
1413  * transmission.
1414  *
1415  * If the last packet was sent in tail loss probe timeout, check if that
1416  * recovered the last packet. If so, that will indicate a real loss and
1417  * the congestion window needs to be lowered.
1418  */
1419 static void
1420 tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
1421 {
1422         if (tp->t_rxtshift > 0 &&
1423             tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1424                 ++tcpstat.tcps_sndrexmitbad;
1425                 tcp_bad_rexmt_restore_state(tp, th);
1426                 tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1427         } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
1428             && tp->t_tlphighrxt > 0
1429             && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
1430             && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1431                 /*
1432                  * check DSACK information also to make sure that
1433                  * the TLP was indeed needed
1434                  */
1435                 if (tcp_rxtseg_dsack_for_tlp(tp)) {
1436                         /*
1437                          * received a DSACK to indicate that TLP was
1438                          * not needed
1439                          */
1440                         tcp_rxtseg_clean(tp);
1441                         goto out;
1442                 }
1443
1444                 /*
1445                  * The tail loss probe recovered the last packet and
1446                  * we need to adjust the congestion window to take
1447                  * this loss into account.
1448                  */
1449                 ++tcpstat.tcps_tlp_recoverlastpkt;
1450                 if (!IN_FASTRECOVERY(tp)) {
1451                         tcp_reduce_congestion_window(tp);
1452                         EXIT_FASTRECOVERY(tp);
1453                 }
1454                 tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1455         } else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
1456                 /*
1457                  * All of the retransmitted segments were duplicated, this
1458                  * can be an indication of bad fast retransmit.
1459                  */
1460                 tcpstat.tcps_dsack_badrexmt++;
1461                 tcp_bad_rexmt_restore_state(tp, th);
1462                 tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
1463                 tcp_rxtseg_clean(tp);
1464         }
1465 out:
1466         tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1467         tp->t_tlphighrxt = 0;
1468         tp->t_tlpstart = 0;
1469
1470         /*
1471          * check if the latest ack was for a segment sent during PMTU
1472          * blackhole detection. If the timestamp on the ack is before
1473          * PMTU blackhole detection, then revert the size of the max
1474          * segment to previous size.
1475          */
1476         if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
1477             tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
1478                 if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
1479                     && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
1480                         tcp_pmtud_revert_segment_size(tp);
1481                 }
1482         }
1483         if (tp->t_pmtud_start_ts > 0)
1484                 tp->t_pmtud_start_ts = 0;
1485 }
1486
1487 /*
1488  * Check if early retransmit can be attempted according to RFC 5827.
1489  *
1490  * If packet reordering is detected on a connection, fast recovery will
1491  * be delayed until it is clear that the packet was lost and not reordered.
1492  * But reordering detection is done only when SACK is enabled.
1493  *
1494  * On connections that do not support SACK, there is a limit on the number
1495  * of early retransmits that can be done per minute. This limit is needed
1496  * to make sure that too many packets are not retransmitted when there is
1497  * packet reordering.
1498  */
1499 static void
1500 tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
1501 {
1502         u_int32_t obytes, snd_off;
1503         int32_t snd_len;
1504         struct socket *so = tp->t_inpcb->inp_socket;
1505
1506         if (early_rexmt && (SACK_ENABLED(tp) ||
1507             tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
1508             SEQ_GT(tp->snd_max, tp->snd_una) &&
1509             (tp->t_dupacks == 1 ||
1510             (SACK_ENABLED(tp) &&
1511             !TAILQ_EMPTY(&tp->snd_holes)))) {
1512                 /*
1513                  * If there are only a few outstanding
1514                  * segments on the connection, we might need
1515                  * to lower the retransmit threshold. This
1516                  * will allow us to do Early Retransmit as
1517                  * described in RFC 5827.
1518                  */
1519                 if (SACK_ENABLED(tp) &&
1520                     !TAILQ_EMPTY(&tp->snd_holes)) {
1521                         obytes = (tp->snd_max - tp->snd_fack) +
1522                                 tp->sackhint.sack_bytes_rexmit;
1523                 } else {
1524                         obytes = (tp->snd_max - tp->snd_una);
1525                 }
1526
1527                 /*
1528                  * In order to lower retransmit threshold the
1529                  * following two conditions must be met.
1530                  * 1. the amount of outstanding data is less
1531                  * than 4*SMSS bytes
1532                  * 2. there is no unsent data ready for
1533                  * transmission or the advertised window
1534                  * will limit sending new segments.
1535                  */
1536                 snd_off = tp->snd_max - tp->snd_una;
1537                 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1538                 if (obytes < (tp->t_maxseg << 2) &&
1539                     snd_len <= 0) {
1540                         u_int32_t osegs;
1541
1542                         osegs = obytes / tp->t_maxseg;
1543                         if ((osegs * tp->t_maxseg) < obytes)
1544                                 osegs++;
1545
1546                         /*
1547                          * Since the connection might have already
1548                          * received some dupacks, we add them to
1549                          * to the outstanding segments count to get
1550                          * the correct retransmit threshold.
1551                          *
1552                          * By checking for early retransmit after
1553                          * receiving some duplicate acks when SACK
1554                          * is supported, the connection will
1555                          * enter fast recovery even if multiple
1556                          * segments are lost in the same window.
1557                          */
1558                         osegs += tp->t_dupacks;
1559                         if (osegs < 4) {
1560                                 tp->t_rexmtthresh =
1561                                     ((osegs - 1) > 1) ? (osegs - 1) : 1;
1562                                 tp->t_rexmtthresh =
1563                                     min(tp->t_rexmtthresh, tcprexmtthresh);
1564                                 tp->t_rexmtthresh =
1565                                     max(tp->t_rexmtthresh, tp->t_dupacks);
1566
1567                                 if (tp->t_early_rexmt_count == 0)
1568                                         tp->t_early_rexmt_win = tcp_now;
1569
1570                                 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1571                                         tcpstat.tcps_tlp_recovery++;
1572                                         tcp_ccdbg_trace(tp, th,
1573                                             TCP_CC_TLP_RECOVERY);
1574                                 } else {
1575                                         tcpstat.tcps_early_rexmt++;
1576                                         tp->t_early_rexmt_count++;
1577                                         tcp_ccdbg_trace(tp, th,
1578                                             TCP_CC_EARLY_RETRANSMIT);
1579                                 }
1580                         }
1581                 }
1582         }
1583
1584         /*
1585          * If we ever sent a TLP probe, the acknowledgement will trigger
1586          * early retransmit because the value of snd_fack will be close
1587          * to snd_max. This will take care of adjustments to the
1588          * congestion window. So we can reset TF_SENT_PROBE flag.
1589          */
1590         tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1591         tp->t_tlphighrxt = 0;
1592         tp->t_tlpstart = 0;
1593 }
1594
1595 static boolean_t
1596 tcp_tfo_syn(tp, to)
1597         struct tcpcb *tp;
1598         struct tcpopt *to;
1599 {
1600         u_char out[CCAES_BLOCK_SIZE];
1601         unsigned char len;
1602
1603         if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
1604             !(tcp_fastopen & TCP_FASTOPEN_SERVER))
1605                 return (FALSE);
1606
1607         if ((to->to_flags & TOF_TFOREQ)) {
1608                 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1609
1610                 tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
1611                 tcpstat.tcps_tfo_cookie_req_rcv++;
1612                 return (FALSE);
1613         }
1614
1615         /* Ok, then it must be an offered cookie. We need to check that ... */
1616         tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
1617
1618         len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1619         to->to_tfo++;
1620         if (memcmp(out, to->to_tfo, len)) {
1621                 /* Cookies are different! Let's return and offer a new cookie */
1622                 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1623
1624                 tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
1625                 tcpstat.tcps_tfo_cookie_invalid++;
1626                 return (FALSE);
1627         }
1628
1629         if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
1630                 /* Need to decrement again as we just increased it... */
1631                 OSDecrementAtomic(&tcp_tfo_halfcnt);
1632                 return (FALSE);
1633         }
1634
1635         tp->t_tfo_flags |= TFO_F_COOKIE_VALID;
1636
1637         tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
1638         tcpstat.tcps_tfo_syn_data_rcv++;
1639
1640         return (TRUE);
1641 }
1642
1643 static void
1644 tcp_tfo_synack(tp, to)
1645         struct tcpcb *tp;
1646         struct tcpopt *to;
1647 {
1648         if (to->to_flags & TOF_TFO) {
1649                 unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1650
1651                 /*
1652                  * If this happens, things have gone terribly wrong. len should
1653                  * have been checked in tcp_dooptions.
1654                  */
1655                 VERIFY(len <= TFO_COOKIE_LEN_MAX);
1656
1657                 to->to_tfo++;
1658
1659                 tcp_cache_set_cookie(tp, to->to_tfo, len);
1660                 tcp_heuristic_tfo_success(tp);
1661
1662                 tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
1663                 tcpstat.tcps_tfo_cookie_rcv++;
1664         } else {
1665                 /*
1666                  * Thus, no cookie in the response, but we either asked for one
1667                  * or sent SYN+DATA. Now, we need to check whether we had to
1668                  * rexmit the SYN. If that's the case, it's better to start
1669                  * backing of TFO-cookie requests.
1670                  */
1671                 if (tp->t_tfo_flags & TFO_F_SYN_LOSS)
1672                         tcp_heuristic_inc_loss(tp, 1, 0);
1673                 else
1674                         tcp_heuristic_reset_loss(tp, 1, 0);
1675         }
1676 }
1677
1678 static void
1679 tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
1680 {
1681         if (tlen == 0) {
1682                 tp->t_tfo_probe_state = TFO_PROBE_PROBING;
1683
1684                 /*
1685                  * We send the probe out rather quickly (after one RTO). It does not
1686                  * really hurt that much, it's only one additional segment on the wire.
1687                  */
1688                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));
1689         } else {
1690                 /* If SYN/ACK+data, don't probe. We got the data! */
1691                 tcp_heuristic_tfo_rcv_good(tp);
1692         }
1693 }
1694
1695 static void
1696 tcp_tfo_rcv_data(struct tcpcb *tp)
1697 {
1698         /* Transition from PROBING to NONE as data has been received */
1699         if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
1700                 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1701
1702                 /* Data has been received - we are good to go! */
1703                 tcp_heuristic_tfo_rcv_good(tp);
1704         }
1705 }
1706
1707 static void
1708 tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)
1709 {
1710         if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
1711             tp->t_tfo_probes > 0) {
1712                 if (th->th_seq == tp->rcv_nxt) {
1713                         /* No hole, so stop probing */
1714                         tp->t_tfo_probe_state = TFO_PROBE_NONE;
1715                 } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1716                         /* There is a hole! Wait a bit for data... */
1717                         tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
1718                         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1719                             TCP_REXMTVAL(tp));
1720                 }
1721         }
1722 }
1723
1724 void
1725 tcp_input(m, off0)
1726         struct mbuf *m;
1727         int off0;
1728 {
1729         register struct tcphdr *th;
1730         register struct ip *ip = NULL;
1731         register struct inpcb *inp;
1732         u_char *optp = NULL;
1733         int optlen = 0;
1734         int tlen, off;
1735         int drop_hdrlen;
1736         register struct tcpcb *tp = 0;
1737         register int thflags;
1738         struct socket *so = 0;
1739         int todrop, acked, ourfinisacked, needoutput = 0;
1740         struct in_addr laddr;
1741 #if INET6
1742         struct in6_addr laddr6;
1743 #endif
1744         int dropsocket = 0;
1745         int iss = 0, nosock = 0;
1746         u_int32_t tiwin, sack_bytes_acked = 0;
1747         struct tcpopt to;               /* options in this segment */
1748 #if TCPDEBUG
1749         short ostate = 0;
1750 #endif
1751 #if IPFIREWALL
1752         struct sockaddr_in *next_hop = NULL;
1753         struct m_tag *fwd_tag;
1754 #endif /* IPFIREWALL */
1755         u_char ip_ecn = IPTOS_ECN_NOTECT;
1756         unsigned int ifscope;
1757         uint8_t isconnected, isdisconnected;
1758         struct ifnet *ifp = m->m_pkthdr.rcvif;
1759         int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
1760         int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
1761         int turnoff_lro = 0, win;
1762 #if MPTCP
1763         struct mptcb *mp_tp = NULL;
1764 #endif /* MPTCP */
1765         boolean_t cell = IFNET_IS_CELLULAR(ifp);
1766         boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
1767         boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1768         boolean_t recvd_dsack = FALSE;
1769         struct tcp_respond_args tra;
1770
1771 #define TCP_INC_VAR(stat, npkts) do {                   \
1772                 stat += npkts;                          \
1773 } while (0)
1774
1775         TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
1776 #if IPFIREWALL
1777         /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
1778         if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1779                 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1780                     KERNEL_TAG_TYPE_IPFORWARD, NULL);
1781         } else {
1782                 fwd_tag = NULL;
1783         }
1784         if (fwd_tag != NULL) {
1785                 struct ip_fwd_tag *ipfwd_tag =
1786                         (struct ip_fwd_tag *)(fwd_tag+1);
1787
1788                 next_hop = ipfwd_tag->next_hop;
1789                 m_tag_delete(m, fwd_tag);
1790         }
1791 #endif /* IPFIREWALL */
1792
1793 #if INET6
1794         struct ip6_hdr *ip6 = NULL;
1795         int isipv6;
1796 #endif /* INET6 */
1797         int rstreason; /* For badport_bandlim accounting purposes */
1798         struct proc *proc0=current_proc();
1799
1800         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
1801
1802 #if INET6
1803         isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1804 #endif
1805         bzero((char *)&to, sizeof(to));
1806
1807 #if INET6
1808         if (isipv6) {
1809                 /*
1810                  * Expect 32-bit aligned data pointer on
1811                  * strict-align platforms
1812                  */
1813                 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1814
1815                 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1816                 ip6 = mtod(m, struct ip6_hdr *);
1817                 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
1818                 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1819
1820                 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
1821                         goto dropnosock;
1822
1823                 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1824                      (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1825                      th->th_seq, th->th_ack, th->th_win);
1826                 /*
1827                  * Be proactive about unspecified IPv6 address in source.
1828                  * As we use all-zero to indicate unbounded/unconnected pcb,
1829                  * unspecified IPv6 address can be used to confuse us.
1830                  *
1831                  * Note that packets with unspecified IPv6 destination is
1832                  * already dropped in ip6_input.
1833                  */
1834                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1835                         /* XXX stat */
1836                         IF_TCP_STATINC(ifp, unspecv6);
1837                         goto dropnosock;
1838                 }
1839                 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1840                         struct ip6_hdr *, ip6, struct tcpcb *, NULL,
1841                         struct tcphdr *, th);
1842
1843                 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
1844         } else
1845 #endif /* INET6 */
1846         {
1847         /*
1848          * Get IP and TCP header together in first mbuf.
1849          * Note: IP leaves IP header in first mbuf.
1850          */
1851         if (off0 > sizeof (struct ip)) {
1852                 ip_stripoptions(m, (struct mbuf *)0);
1853                 off0 = sizeof(struct ip);
1854         }
1855         if (m->m_len < sizeof (struct tcpiphdr)) {
1856                 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
1857                         tcpstat.tcps_rcvshort++;
1858                         return;
1859                 }
1860         }
1861
1862         /* Expect 32-bit aligned data pointer on strict-align platforms */
1863         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1864
1865         ip = mtod(m, struct ip *);
1866         th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1867         tlen = ip->ip_len;
1868
1869         if (tcp_input_checksum(AF_INET, m, th, off0, tlen))
1870                 goto dropnosock;
1871
1872 #if INET6
1873         /* Re-initialization for later version check */
1874         ip->ip_v = IPVERSION;
1875 #endif
1876         ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
1877
1878         DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1879                 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
1880
1881         KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1882                 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1883                   th->th_seq, th->th_ack, th->th_win);
1884
1885         }
1886
1887         /*
1888          * Check that TCP offset makes sense,
1889          * pull out TCP options and adjust length.              XXX
1890          */
1891         off = th->th_off << 2;
1892         if (off < sizeof (struct tcphdr) || off > tlen) {
1893                 tcpstat.tcps_rcvbadoff++;
1894                 IF_TCP_STATINC(ifp, badformat);
1895                 goto dropnosock;
1896         }
1897         tlen -= off;    /* tlen is used instead of ti->ti_len */
1898         if (off > sizeof (struct tcphdr)) {
1899 #if INET6
1900                 if (isipv6) {
1901                         IP6_EXTHDR_CHECK(m, off0, off, return);
1902                         ip6 = mtod(m, struct ip6_hdr *);
1903                         th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1904                 } else
1905 #endif /* INET6 */
1906                 {
1907                         if (m->m_len < sizeof(struct ip) + off) {
1908                                 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1909                                         tcpstat.tcps_rcvshort++;
1910                                         return;
1911                                 }
1912                                 ip = mtod(m, struct ip *);
1913                                 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1914                         }
1915                 }
1916                 optlen = off - sizeof (struct tcphdr);
1917                 optp = (u_char *)(th + 1);
1918                 /*
1919                  * Do quick retrieval of timestamp options ("options
1920                  * prediction?").  If timestamp is the only option and it's
1921                  * formatted as recommended in RFC 1323 appendix A, we
1922                  * quickly get the values now and not bother calling
1923                  * tcp_dooptions(), etc.
1924                  */
1925                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1926                         (optlen > TCPOLEN_TSTAMP_APPA &&
1927                         optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1928                         *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1929                         (th->th_flags & TH_SYN) == 0) {
1930                         to.to_flags |= TOF_TS;
1931                         to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
1932                         to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
1933                         optp = NULL;    /* we've parsed the options */
1934                 }
1935         }
1936         thflags = th->th_flags;
1937
1938 #if TCP_DROP_SYNFIN
1939         /*
1940          * If the drop_synfin option is enabled, drop all packets with
1941          * both the SYN and FIN bits set. This prevents e.g. nmap from
1942          * identifying the TCP/IP stack.
1943          *
1944          * This is a violation of the TCP specification.
1945          */
1946         if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) {
1947                 IF_TCP_STATINC(ifp, synfin);
1948                 goto dropnosock;
1949         }
1950 #endif
1951
1952         /*
1953          * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
1954          * until after ip6_savecontrol() is called and before other functions
1955          * which don't want those proto headers.
1956          * Because ip6_savecontrol() is going to parse the mbuf to
1957          * search for data to be passed up to user-land, it wants mbuf
1958          * parameters to be unchanged.
1959          */
1960         drop_hdrlen = off0 + off;
1961
1962         /* Since this is an entry point for input processing of tcp packets, we
1963          * can update the tcp clock here.
1964          */
1965         calculate_tcp_clock();
1966
1967         /*
1968          * Record the interface where this segment arrived on; this does not
1969          * affect normal data output (for non-detached TCP) as it provides a
1970          * hint about which route and interface to use for sending in the
1971          * absence of a PCB, when scoped routing (and thus source interface
1972          * selection) are enabled.
1973          */
1974         if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL)
1975                 ifscope = IFSCOPE_NONE;
1976         else
1977                 ifscope = m->m_pkthdr.rcvif->if_index;
1978
1979         /*
1980          * Convert TCP protocol specific fields to host format.
1981          */
1982
1983 #if BYTE_ORDER != BIG_ENDIAN
1984         NTOHL(th->th_seq);
1985         NTOHL(th->th_ack);
1986         NTOHS(th->th_win);
1987         NTOHS(th->th_urp);
1988 #endif
1989
1990         /*
1991          * Locate pcb for segment.
1992          */
1993 findpcb:
1994
1995         isconnected = FALSE;
1996         isdisconnected = FALSE;
1997
1998 #if IPFIREWALL_FORWARD
1999         if (next_hop != NULL
2000 #if INET6
2001             && isipv6 == 0 /* IPv6 support is not yet */
2002 #endif /* INET6 */
2003             ) {
2004                 /*
2005                  * Diverted. Pretend to be the destination.
2006                  * already got one like this?
2007                  */
2008                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2009                         ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
2010                 if (!inp) {
2011                         /*
2012                          * No, then it's new. Try find the ambushing socket
2013                          */
2014                         if (!next_hop->sin_port) {
2015                                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
2016                                     th->th_sport, next_hop->sin_addr,
2017                                     th->th_dport, 1, m->m_pkthdr.rcvif);
2018                         } else {
2019                                 inp = in_pcblookup_hash(&tcbinfo,
2020                                     ip->ip_src, th->th_sport,
2021                                     next_hop->sin_addr,
2022                                     ntohs(next_hop->sin_port), 1,
2023                                     m->m_pkthdr.rcvif);
2024                         }
2025                 }
2026         } else
2027 #endif  /* IPFIREWALL_FORWARD */
2028       {
2029 #if INET6
2030         if (isipv6)
2031                 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
2032                                          &ip6->ip6_dst, th->th_dport, 1,
2033                                          m->m_pkthdr.rcvif);
2034         else
2035 #endif /* INET6 */
2036         inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2037             ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
2038       }
2039
2040         /*
2041          * Use the interface scope information from the PCB for outbound
2042          * segments.  If the PCB isn't present and if scoped routing is
2043          * enabled, tcp_respond will use the scope of the interface where
2044          * the segment arrived on.
2045          */
2046         if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
2047                 ifscope = inp->inp_boundifp->if_index;
2048
2049         /*
2050          * If the state is CLOSED (i.e., TCB does not exist) then
2051          * all data in the incoming segment is discarded.
2052          * If the TCB exists but is in CLOSED state, it is embryonic,
2053          * but should either do a listen or a connect soon.
2054          */
2055         if (inp == NULL) {
2056                 if (log_in_vain) {
2057 #if INET6
2058                         char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
2059 #else /* INET6 */
2060                         char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
2061 #endif /* INET6 */
2062
2063 #if INET6
2064                         if (isipv6) {
2065                                 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
2066                                 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
2067                         } else
2068 #endif
2069                         {
2070                                 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
2071                                 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
2072                         }
2073                         switch (log_in_vain) {
2074                         case 1:
2075                                 if(thflags & TH_SYN)
2076                                         log(LOG_INFO,
2077                                                 "Connection attempt to TCP %s:%d from %s:%d\n",
2078                                                 dbuf, ntohs(th->th_dport),
2079                                                 sbuf,
2080                                                 ntohs(th->th_sport));
2081                                 break;
2082                         case 2:
2083                                 log(LOG_INFO,
2084                                         "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
2085                                         dbuf, ntohs(th->th_dport), sbuf,
2086                                         ntohs(th->th_sport), thflags);
2087                                 break;
2088                         case 3:
2089                         case 4:
2090                                 if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
2091                                         !(m->m_flags & (M_BCAST | M_MCAST)) &&
2092 #if INET6
2093                                         ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
2094                                          (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
2095 #else
2096                                         ip->ip_dst.s_addr != ip->ip_src.s_addr
2097 #endif
2098                                          )
2099                                         log_in_vain_log((LOG_INFO,
2100                                                 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
2101                                                 dbuf, ntohs(th->th_dport),
2102                                                 sbuf,
2103                                                 ntohs(th->th_sport)));
2104                                 break;
2105                         default:
2106                                 break;
2107                         }
2108                 }
2109                 if (blackhole) {
2110                         if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
2111
2112                                 switch (blackhole) {
2113                                 case 1:
2114                                         if (thflags & TH_SYN)
2115                                                 goto dropnosock;
2116                                         break;
2117                                 case 2:
2118                                         goto dropnosock;
2119                                 default:
2120                                         goto dropnosock;
2121                                 }
2122                 }
2123                 rstreason = BANDLIM_RST_CLOSEDPORT;
2124                 IF_TCP_STATINC(ifp, noconnnolist);
2125                 goto dropwithresetnosock;
2126         }
2127         so = inp->inp_socket;
2128         if (so == NULL) {
2129                 /* This case shouldn't happen  as the socket shouldn't be null
2130                  * if inp_state isn't set to INPCB_STATE_DEAD
2131                  * But just in case, we pretend we didn't find the socket if we hit this case
2132                  * as this isn't cause for a panic (the socket might be leaked however)...
2133                  */
2134                 inp = NULL;
2135 #if TEMPDEBUG
2136                 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
2137 #endif
2138                 goto dropnosock;
2139         }
2140
2141         tcp_lock(so, 1, 0);
2142         if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2143                 tcp_unlock(so, 1, (void *)2);
2144                 inp = NULL;     // pretend we didn't find it
2145                 goto dropnosock;
2146         }
2147
2148 #if NECP
2149 #if INET6
2150         if (isipv6) {
2151                 if (!necp_socket_is_allowed_to_send_recv_v6(inp, th->th_dport,
2152                                                             th->th_sport,
2153                                                             &ip6->ip6_dst,
2154                                                             &ip6->ip6_src,
2155                                                             ifp, NULL, NULL)) {
2156                         IF_TCP_STATINC(ifp, badformatipsec);
2157                         goto drop;
2158                 }
2159         } else
2160 #endif
2161         {
2162                 if (!necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport,
2163                                                             th->th_sport,
2164                                                             &ip->ip_dst,
2165                                                             &ip->ip_src,
2166                                                             ifp, NULL, NULL)) {
2167                         IF_TCP_STATINC(ifp, badformatipsec);
2168                         goto drop;
2169                 }
2170         }
2171 #endif /* NECP */
2172
2173         tp = intotcpcb(inp);
2174         if (tp == 0) {
2175                 rstreason = BANDLIM_RST_CLOSEDPORT;
2176                 IF_TCP_STATINC(ifp, noconnlist);
2177                 goto dropwithreset;
2178         }
2179         if (tp->t_state == TCPS_CLOSED)
2180                 goto drop;
2181
2182         /* Unscale the window into a 32-bit value. */
2183         if ((thflags & TH_SYN) == 0)
2184                 tiwin = th->th_win << tp->snd_scale;
2185         else
2186                 tiwin = th->th_win;
2187
2188 #if CONFIG_MACF_NET
2189         if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
2190                 goto drop;
2191 #endif
2192
2193         /* Avoid processing packets while closing a listen socket */
2194         if (tp->t_state == TCPS_LISTEN &&
2195                 (so->so_options & SO_ACCEPTCONN) == 0)
2196                 goto drop;
2197
2198         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
2199 #if TCPDEBUG
2200                 if (so->so_options & SO_DEBUG) {
2201                         ostate = tp->t_state;
2202 #if INET6
2203                         if (isipv6)
2204                                 bcopy((char *)ip6, (char *)tcp_saveipgen,
2205                                       sizeof(*ip6));
2206                         else
2207 #endif /* INET6 */
2208                         bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
2209                         tcp_savetcp = *th;
2210                 }
2211 #endif
2212                 if (so->so_options & SO_ACCEPTCONN) {
2213                     register struct tcpcb *tp0 = tp;
2214                         struct socket *so2;
2215                         struct socket *oso;
2216                         struct sockaddr_storage from;
2217 #if INET6
2218                         struct inpcb *oinp = sotoinpcb(so);
2219 #endif /* INET6 */
2220                         struct ifnet *head_ifscope;
2221                         unsigned int head_nocell, head_recvanyif,
2222                                      head_noexpensive, head_awdl_unrestricted;
2223
2224                         /* Get listener's bound-to-interface, if any */
2225                         head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2226                             inp->inp_boundifp : NULL;
2227                         /* Get listener's no-cellular information, if any */
2228                         head_nocell = INP_NO_CELLULAR(inp);
2229                         /* Get listener's recv-any-interface, if any */
2230                         head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
2231                         /* Get listener's no-expensive information, if any */
2232                         head_noexpensive = INP_NO_EXPENSIVE(inp);
2233                         head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
2234
2235                         /*
2236                          * If the state is LISTEN then ignore segment if it contains an RST.
2237                          * If the segment contains an ACK then it is bad and send a RST.
2238                          * If it does not contain a SYN then it is not interesting; drop it.
2239                          * If it is from this socket, drop it, it must be forged.
2240                          */
2241                         if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
2242                                 IF_TCP_STATINC(ifp, listbadsyn);
2243
2244                                 if (thflags & TH_RST) {
2245                                         goto drop;
2246                                 }
2247                                 if (thflags & TH_ACK) {
2248                                         tp = NULL;
2249                                         tcpstat.tcps_badsyn++;
2250                                         rstreason = BANDLIM_RST_OPENPORT;
2251                                         goto dropwithreset;
2252                                 }
2253
2254                                 /* We come here if there is no SYN set */
2255                                 tcpstat.tcps_badsyn++;
2256                                 goto drop;
2257                         }
2258                         KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
2259                         if (th->th_dport == th->th_sport) {
2260 #if INET6
2261                                 if (isipv6) {
2262                                         if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
2263                                                        &ip6->ip6_src))
2264                                                 goto drop;
2265                                 } else
2266 #endif /* INET6 */
2267                                         if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
2268                                                 goto drop;
2269                         }
2270                         /*
2271                          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
2272                          * in_broadcast() should never return true on a received
2273                          * packet with M_BCAST not set.
2274                          *
2275                          * Packets with a multicast source address should also
2276                          * be discarded.
2277                          */
2278                         if (m->m_flags & (M_BCAST|M_MCAST))
2279                                 goto drop;
2280 #if INET6
2281                         if (isipv6) {
2282                                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2283                                         IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2284                                         goto drop;
2285                         } else
2286 #endif
2287                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2288                                 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2289                                 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2290                                 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2291                                 goto drop;
2292
2293
2294 #if INET6
2295                         /*
2296                          * If deprecated address is forbidden,
2297                          * we do not accept SYN to deprecated interface
2298                          * address to prevent any new inbound connection from
2299                          * getting established.
2300                          * When we do not accept SYN, we send a TCP RST,
2301                          * with deprecated source address (instead of dropping
2302                          * it).  We compromise it as it is much better for peer
2303                          * to send a RST, and RST will be the final packet
2304                          * for the exchange.
2305                          *
2306                          * If we do not forbid deprecated addresses, we accept
2307                          * the SYN packet.  RFC 4862 forbids dropping SYN in
2308                          * this case.
2309                          */
2310                         if (isipv6 && !ip6_use_deprecated) {
2311                                 uint32_t ia6_flags;
2312
2313                                 if (ip6_getdstifaddr_info(m, NULL,
2314                                     &ia6_flags) == 0) {
2315                                         if (ia6_flags & IN6_IFF_DEPRECATED) {
2316                                                 tp = NULL;
2317                                                 rstreason = BANDLIM_RST_OPENPORT;
2318                                                 IF_TCP_STATINC(ifp, deprecate6);
2319                                                 goto dropwithreset;
2320                                         }
2321                                 }
2322                         }
2323 #endif
2324                         if (so->so_filt) {
2325 #if INET6
2326                                 if (isipv6) {
2327                                         struct sockaddr_in6     *sin6 = (struct sockaddr_in6*)&from;
2328
2329                                         sin6->sin6_len = sizeof(*sin6);
2330                                         sin6->sin6_family = AF_INET6;
2331                                         sin6->sin6_port = th->th_sport;
2332                                         sin6->sin6_flowinfo = 0;
2333                                         sin6->sin6_addr = ip6->ip6_src;
2334                                         sin6->sin6_scope_id = 0;
2335                                 }
2336                                 else
2337 #endif
2338                                 {
2339                                         struct sockaddr_in *sin = (struct sockaddr_in*)&from;
2340
2341                                         sin->sin_len = sizeof(*sin);
2342                                         sin->sin_family = AF_INET;
2343                                         sin->sin_port = th->th_sport;
2344                                         sin->sin_addr = ip->ip_src;
2345                                 }
2346                                 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2347                         } else {
2348                                 so2 = sonewconn(so, 0, NULL);
2349                         }
2350                         if (so2 == 0) {
2351                                 tcpstat.tcps_listendrop++;
2352                                 if (tcp_dropdropablreq(so)) {
2353                                         if (so->so_filt)
2354                                                 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2355                                         else
2356                                                 so2 = sonewconn(so, 0, NULL);
2357                                 }
2358                                 if (!so2)
2359                                         goto drop;
2360                         }
2361
2362                         /* Point "inp" and "tp" in tandem to new socket */
2363                         inp = (struct inpcb *)so2->so_pcb;
2364                         tp = intotcpcb(inp);
2365
2366                         oso = so;
2367                         tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
2368
2369                         so = so2;
2370                         tcp_lock(so, 1, 0);
2371                         /*
2372                          * Mark socket as temporary until we're
2373                          * committed to keeping it.  The code at
2374                          * ``drop'' and ``dropwithreset'' check the
2375                          * flag dropsocket to see if the temporary
2376                          * socket created here should be discarded.
2377                          * We mark the socket as discardable until
2378                          * we're committed to it below in TCPS_LISTEN.
2379                          * There are some error conditions in which we
2380                          * have to drop the temporary socket.
2381                          */
2382                         dropsocket++;
2383                         /*
2384                          * Inherit INP_BOUND_IF from listener; testing if
2385                          * head_ifscope is non-NULL is sufficient, since it
2386                          * can only be set to a non-zero value earlier if
2387                          * the listener has such a flag set.
2388                          */
2389                         if (head_ifscope != NULL) {
2390                                 inp->inp_flags |= INP_BOUND_IF;
2391                                 inp->inp_boundifp = head_ifscope;
2392                         } else {
2393                                 inp->inp_flags &= ~INP_BOUND_IF;
2394                         }
2395                         /*
2396                          * Inherit restrictions from listener.
2397                          */
2398                         if (head_nocell)
2399                                 inp_set_nocellular(inp);
2400                         if (head_noexpensive)
2401                                 inp_set_noexpensive(inp);
2402                         if (head_awdl_unrestricted)
2403                                 inp_set_awdl_unrestricted(inp);
2404                         /*
2405                          * Inherit {IN,IN6}_RECV_ANYIF from listener.
2406                          */
2407                         if (head_recvanyif)
2408                                 inp->inp_flags |= INP_RECV_ANYIF;
2409                         else
2410                                 inp->inp_flags &= ~INP_RECV_ANYIF;
2411 #if INET6
2412                         if (isipv6)
2413                                 inp->in6p_laddr = ip6->ip6_dst;
2414                         else {
2415                                 inp->inp_vflag &= ~INP_IPV6;
2416                                 inp->inp_vflag |= INP_IPV4;
2417 #endif /* INET6 */
2418                                 inp->inp_laddr = ip->ip_dst;
2419 #if INET6
2420                         }
2421 #endif /* INET6 */
2422                         inp->inp_lport = th->th_dport;
2423                         if (in_pcbinshash(inp, 0) != 0) {
2424                                 /*
2425                                  * Undo the assignments above if we failed to
2426                                  * put the PCB on the hash lists.
2427                                  */
2428 #if INET6
2429                                 if (isipv6)
2430                                         inp->in6p_laddr = in6addr_any;
2431                                 else
2432 #endif /* INET6 */
2433                                         inp->inp_laddr.s_addr = INADDR_ANY;
2434                                 inp->inp_lport = 0;
2435                                 tcp_lock(oso, 0, 0);    /* release ref on parent */
2436                                 tcp_unlock(oso, 1, 0);
2437                                 goto drop;
2438                         }
2439 #if INET6
2440                         if (isipv6) {
2441                                 /*
2442                                  * Inherit socket options from the listening
2443                                  * socket.
2444                                  * Note that in6p_inputopts are not (even
2445                                  * should not be) copied, since it stores
2446                                  * previously received options and is used to
2447                                  * detect if each new option is different than
2448                                  * the previous one and hence should be passed
2449                                  * to a user.
2450                                  * If we copied in6p_inputopts, a user would
2451                                  * not be able to receive options just after
2452                                  * calling the accept system call.
2453                                  */
2454                                 inp->inp_flags |=
2455                                         oinp->inp_flags & INP_CONTROLOPTS;
2456                                 if (oinp->in6p_outputopts)
2457                                         inp->in6p_outputopts =
2458                                                 ip6_copypktopts(oinp->in6p_outputopts,
2459                                                                 M_NOWAIT);
2460                         } else
2461 #endif /* INET6 */
2462                         {
2463                                 inp->inp_options = ip_srcroute();
2464                                 inp->inp_ip_tos = oinp->inp_ip_tos;
2465                         }
2466                         tcp_lock(oso, 0, 0);
2467 #if IPSEC
2468                         /* copy old policy into new socket's */
2469                         if (sotoinpcb(oso)->inp_sp)
2470                         {
2471                                 int error = 0;
2472                                 /* Is it a security hole here to silently fail to copy the policy? */
2473                                 if (inp->inp_sp != NULL)
2474                                         error = ipsec_init_policy(so, &inp->inp_sp);
2475                                 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
2476                                         printf("tcp_input: could not copy policy\n");
2477                         }
2478 #endif
2479                         /* inherit states from the listener */
2480                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2481                                 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
2482                         tp->t_state = TCPS_LISTEN;
2483                         tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
2484                         tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP|TF_NOTIMEWAIT|TF_FASTOPEN));
2485                         tp->t_keepinit = tp0->t_keepinit;
2486                         tp->t_keepcnt = tp0->t_keepcnt;
2487                         tp->t_keepintvl = tp0->t_keepintvl;
2488                         tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2489                         tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
2490                         tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
2491                         if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0)
2492                                 tp->t_notsent_lowat = tp0->t_notsent_lowat;
2493
2494                         /* now drop the reference on the listener */
2495                         tcp_unlock(oso, 1, 0);
2496
2497                         tcp_set_max_rwinscale(tp, so);
2498
2499                         KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
2500                 }
2501         }
2502         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2503                 LCK_MTX_ASSERT_OWNED);
2504
2505         if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
2506                 /*
2507                  * Evaluate the rate of arrival of packets to see if the
2508                  * receiver can reduce the ack traffic. The algorithm to
2509                  * stretch acks will be enabled if the connection meets
2510                  * certain criteria defined in tcp_stretch_ack_enable function.
2511                  */
2512                 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
2513                         TCP_INC_VAR(tp->rcv_waitforss, nlropkts);
2514                 }
2515                 if (tcp_stretch_ack_enable(tp)) {
2516                         tp->t_flags |= TF_STRETCHACK;
2517                         tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2518                         tp->rcv_waitforss = 0;
2519                 } else {
2520                         tp->t_flags &= ~(TF_STRETCHACK);
2521                 }
2522                 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
2523                         tp->rcv_by_unackwin += (tlen + off);
2524                 } else {
2525                         tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2526                         tp->rcv_by_unackwin = tlen + off;
2527                 }
2528         }
2529
2530         /*
2531          * Keep track of how many bytes were received in the LRO packet
2532          */
2533         if ((pktf_sw_lro_pkt) && (nlropkts > 2))  {
2534                 tp->t_lropktlen += tlen;
2535         }
2536         /*
2537          * Explicit Congestion Notification - Flag that we need to send ECE if
2538          *      + The IP Congestion experienced flag was set.
2539          *      + Socket is in established state
2540          *      + We negotiated ECN in the TCP setup
2541          *      + This isn't a pure ack (tlen > 0)
2542          *      + The data is in the valid window
2543          *
2544          *      TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2545          */
2546         if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2547             TCP_ECN_ENABLED(tp) && tlen > 0 &&
2548             SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2549             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2550                 tp->t_ecn_recv_ce++;
2551                 tcpstat.tcps_ecn_recv_ce++;
2552                 INP_INC_IFNET_STAT(inp, ecn_recv_ce);
2553                 /* Mark this connection as it received CE from network */
2554                 tp->ecn_flags |= TE_RECV_ECN_CE;
2555                 tp->ecn_flags |= TE_SENDECE;
2556         }
2557
2558         /*
2559          * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2560          * bother doing extensive checks for state and whatnot.
2561          */
2562         if (thflags & TH_CWR) {
2563                 tp->ecn_flags &= ~TE_SENDECE;
2564                 tp->t_ecn_recv_cwr++;
2565         }
2566
2567         /*
2568          * If we received an explicit notification of congestion in
2569          * ip tos ecn bits or by the CWR bit in TCP header flags, reset
2570          * the ack-stretching state. We need to handle ECN notification if
2571          * an ECN setup SYN was sent even once.
2572          */
2573         if (tp->t_state == TCPS_ESTABLISHED
2574             && (tp->ecn_flags & TE_SETUPSENT)
2575             && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) {
2576                 tcp_reset_stretch_ack(tp);
2577                 CLEAR_IAJ_STATE(tp);
2578         }
2579
2580         if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2581             !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2582                 tcpstat.tcps_ecn_fallback_ce++;
2583                 tcp_heuristic_ecn_aggressive(tp);
2584                 tp->ecn_flags |= TE_CEHEURI_SET;
2585         }
2586
2587         if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) &&
2588             ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2589                 if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) {
2590                         tp->t_ecn_recv_ce_pkt++;
2591                 } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) {
2592                         tcpstat.tcps_ecn_fallback_ce++;
2593                         tcp_heuristic_ecn_aggressive(tp);
2594                         tp->ecn_flags |= TE_CEHEURI_SET;
2595                         INP_INC_IFNET_STAT(inp,ecn_fallback_ce);
2596                 } else {
2597                         /* We tracked the first ECN_MIN_CE_PROBES segments, we
2598                          * now know that the path is good.
2599                          */
2600                         tp->ecn_flags |= TE_CEHEURI_SET;
2601                 }
2602         }
2603
2604         /*
2605          * Try to determine if we are receiving a packet after a long time.
2606          * Use our own approximation of idletime to roughly measure remote
2607          * end's idle time. Since slowstart is used after an idle period
2608          * we want to avoid doing LRO if the remote end is not up to date
2609          * on initial window support and starts with 1 or 2 packets as its IW.
2610          */
2611          if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) &&
2612                 ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) {
2613                 turnoff_lro = 1;
2614          }
2615
2616         /* Update rcvtime as a new segment was received on the connection */
2617         tp->t_rcvtime = tcp_now;
2618
2619         /*
2620          * Segment received on connection.
2621          * Reset idle time and keep-alive timer.
2622          */
2623         if (TCPS_HAVEESTABLISHED(tp->t_state))
2624                 tcp_keepalive_reset(tp);
2625
2626         /*
2627          * Process options if not in LISTEN state,
2628          * else do it below (after getting remote address).
2629          */
2630         if (tp->t_state != TCPS_LISTEN && optp) {
2631                 tcp_dooptions(tp, optp, optlen, th, &to);
2632 #if MPTCP
2633                 if (mptcp_input_preproc(tp, m, drop_hdrlen) != 0) {
2634                         tp->t_flags |= TF_ACKNOW;
2635                         (void) tcp_output(tp);
2636                         tcp_check_timer_state(tp);
2637                         tcp_unlock(so, 1, 0);
2638                         KERNEL_DEBUG(DBG_FNC_TCP_INPUT |
2639                             DBG_FUNC_END,0,0,0,0,0);
2640                         return;
2641                 }
2642 #endif /* MPTCP */
2643         }
2644         if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2645                 if (!(thflags & TH_ACK) ||
2646                     (SEQ_GT(th->th_ack, tp->iss) &&
2647                     SEQ_LEQ(th->th_ack, tp->snd_max)))
2648                         tcp_finalize_options(tp, &to, ifscope);
2649         }
2650
2651 #if TRAFFIC_MGT
2652         /*
2653          * Compute inter-packet arrival jitter. According to RFC 3550,
2654          * inter-packet arrival jitter is defined as the difference in
2655          * packet spacing at the receiver compared to the sender for a
2656          * pair of packets. When two packets of maximum segment size come
2657          * one after the other with consecutive sequence numbers, we
2658          * consider them as packets sent together at the sender and use
2659          * them as a pair to compute inter-packet arrival jitter. This
2660          * metric indicates the delay induced by the network components due
2661          * to queuing in edge/access routers.
2662          */
2663         if (tp->t_state == TCPS_ESTABLISHED &&
2664             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
2665             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2666             ((to.to_flags & TOF_TS) == 0 ||
2667             TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2668             th->th_seq == tp->rcv_nxt &&
2669             LIST_EMPTY(&tp->t_segq)) {
2670                 int seg_size = tlen;
2671                 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
2672                         TCP_INC_VAR(tp->iaj_pktcnt, nlropkts);
2673                 }
2674
2675                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2676                         seg_size = m->m_pkthdr.lro_pktlen;
2677                 }
2678                 if ( tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2679                         (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
2680                         /*
2681                          * State related to inter-arrival jitter is
2682                          * uninitialized or we are trying to find a good
2683                          * first packet to start computing the metric
2684                          */
2685                         update_iaj_state(tp, seg_size, 0);
2686                 } else {
2687                         if (seg_size == tp->iaj_size) {
2688                                 /*
2689                                  * Compute inter-arrival jitter taking
2690                                  * this packet as the second packet
2691                                  */
2692                                 if (pktf_sw_lro_pkt)
2693                                         compute_iaj(tp, nlropkts,
2694                                             m->m_pkthdr.lro_elapsed);
2695                                 else
2696                                         compute_iaj(tp, 1, 0);
2697                         }
2698                         if (seg_size  < tp->iaj_size) {
2699                                 /*
2700                                  * There is a smaller packet in the stream.
2701                                  * Sometimes the maximum size supported
2702                                  * on a path can change if there is a new
2703                                  * link with smaller MTU. The receiver will
2704                                  * not know about this change. If there
2705                                  * are too many packets smaller than
2706                                  * iaj_size, we try to learn the iaj_size
2707                                  * again.
2708                                  */
2709                                 TCP_INC_VAR(tp->iaj_small_pkt, nlropkts);
2710                                 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
2711                                         update_iaj_state(tp, seg_size, 1);
2712                                 } else {
2713                                         CLEAR_IAJ_STATE(tp);
2714                                 }
2715                         } else {
2716                                 update_iaj_state(tp, seg_size, 0);
2717                         }
2718                 }
2719         } else {
2720                 CLEAR_IAJ_STATE(tp);
2721         }
2722 #endif /* TRAFFIC_MGT */
2723
2724         /*
2725          * Header prediction: check for the two common cases
2726          * of a uni-directional data xfer.  If the packet has
2727          * no control flags, is in-sequence, the window didn't
2728          * change and we're not retransmitting, it's a
2729          * candidate.  If the length is zero and the ack moved
2730          * forward, we're the sender side of the xfer.  Just
2731          * free the data acked & wake any higher level process
2732          * that was blocked waiting for space.  If the length
2733          * is non-zero and the ack didn't move, we're the
2734          * receiver side.  If we're getting packets in-order
2735          * (the reassembly queue is empty), add the data to
2736          * the socket buffer and note that we need a delayed ack.
2737          * Make sure that the hidden state-flags are also off.
2738          * Since we check for TCPS_ESTABLISHED above, it can only
2739          * be TH_NEEDSYN.
2740          */
2741         if (tp->t_state == TCPS_ESTABLISHED &&
2742             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_CWR)) == TH_ACK &&
2743             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2744             ((to.to_flags & TOF_TS) == 0 ||
2745              TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2746             th->th_seq == tp->rcv_nxt &&
2747             tiwin && tiwin == tp->snd_wnd &&
2748             tp->snd_nxt == tp->snd_max) {
2749
2750                 /*
2751                  * If last ACK falls within this segment's sequence numbers,
2752                  * record the timestamp.
2753                  * NOTE that the test is modified according to the latest
2754                  * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2755                  */
2756                 if ((to.to_flags & TOF_TS) != 0 &&
2757                    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2758                         tp->ts_recent_age = tcp_now;
2759                         tp->ts_recent = to.to_tsval;
2760                 }
2761
2762                 if (tlen == 0) {
2763                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
2764                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
2765                             tp->snd_cwnd >= tp->snd_ssthresh &&
2766                             (!IN_FASTRECOVERY(tp) &&
2767                             ((!(SACK_ENABLED(tp)) &&
2768                             tp->t_dupacks < tp->t_rexmtthresh) ||
2769                             (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
2770                             TAILQ_EMPTY(&tp->snd_holes))))) {
2771                                 /*
2772                                  * this is a pure ack for outstanding data.
2773                                  */
2774                                 ++tcpstat.tcps_predack;
2775
2776                                 tcp_bad_rexmt_check(tp, th, &to),
2777
2778                                 /* Recalculate the RTT */
2779                                 tcp_compute_rtt(tp, &to, th);
2780
2781                                 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
2782                                 acked = BYTES_ACKED(th, tp);
2783                                 tcpstat.tcps_rcvackpack++;
2784                                 tcpstat.tcps_rcvackbyte += acked;
2785
2786                                 /*
2787                                  * Handle an ack that is in sequence during
2788                                  * congestion avoidance phase. The
2789                                  * calculations in this function
2790                                  * assume that snd_una is not updated yet.
2791                                  */
2792                                 if (CC_ALGO(tp)->congestion_avd != NULL)
2793                                         CC_ALGO(tp)->congestion_avd(tp, th);
2794                                 tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
2795                                 sbdrop(&so->so_snd, acked);
2796                                 if (so->so_flags & SOF_ENABLE_MSGS) {
2797                                         VERIFY(acked <= so->so_msg_state->msg_serial_bytes);
2798                                         so->so_msg_state->msg_serial_bytes -= acked;
2799                                 }
2800                                 tcp_sbsnd_trim(&so->so_snd);
2801
2802                                 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2803                                     SEQ_LEQ(th->th_ack, tp->snd_recover))
2804                                         tp->snd_recover = th->th_ack - 1;
2805                                 tp->snd_una = th->th_ack;
2806
2807                                 /*
2808                                  * pull snd_wl2 up to prevent seq wrap relative
2809                                  * to th_ack.
2810                                  */
2811                                 tp->snd_wl2 = th->th_ack;
2812
2813                                 if (tp->t_dupacks > 0) {
2814                                         tp->t_dupacks = 0;
2815                                         tp->t_rexmtthresh = tcprexmtthresh;
2816                                 }
2817
2818                                 m_freem(m);
2819
2820                                 /*
2821                                  * If all outstanding data are acked, stop
2822                                  * retransmit timer, otherwise restart timer
2823                                  * using current (possibly backed-off) value.
2824                                  * If process is waiting for space,
2825                                  * wakeup/selwakeup/signal.  If data
2826                                  * are ready to send, let tcp_output
2827                                  * decide between more output or persist.
2828                                  */
2829                                 if (tp->snd_una == tp->snd_max) {
2830                                         tp->t_timer[TCPT_REXMT] = 0;
2831                                         tp->t_timer[TCPT_PTO] = 0;
2832                                 } else if (tp->t_timer[TCPT_PERSIST] == 0) {
2833                                         tp->t_timer[TCPT_REXMT] =
2834                                             OFFSET_FROM_START(tp,
2835                                             tp->t_rxtcur);
2836                                 }
2837                                 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
2838                                     !TCP_DSACK_SEQ_IN_WINDOW(tp,
2839                                     tp->t_dsack_lastuna, tp->snd_una))
2840                                         tcp_rxtseg_clean(tp);
2841
2842                                 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2843                                         tp->t_bwmeas != NULL)
2844                                         tcp_bwmeas_check(tp);
2845                                 sowwakeup(so); /* has to be done with socket lock held */
2846                                 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
2847                                         (void) tcp_output(tp);
2848                                 }
2849
2850                                 tcp_tfo_rcv_ack(tp, th);
2851
2852                                 tcp_check_timer_state(tp);
2853                                 tcp_unlock(so, 1, 0);
2854                                 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2855                                 return;
2856                         }
2857                 } else if (th->th_ack == tp->snd_una &&
2858                     LIST_EMPTY(&tp->t_segq) &&
2859                     tlen <= tcp_sbspace(tp)) {
2860                         /*
2861                          * this is a pure, in-sequence data packet
2862                          * with nothing on the reassembly queue and
2863                          * we have enough buffer space to take it.
2864                          */
2865
2866                         /*
2867                          * If this is a connection in steady state, start
2868                          * coalescing packets belonging to this flow.
2869                          */
2870                         if (turnoff_lro) {
2871                                 tcp_lro_remove_state(tp->t_inpcb->inp_laddr,
2872                                         tp->t_inpcb->inp_faddr,
2873                                         tp->t_inpcb->inp_lport,
2874                                         tp->t_inpcb->inp_fport);
2875                                 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
2876                                 tp->t_idleat = tp->rcv_nxt;
2877                         } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 &&
2878                             (so->so_flags & SOF_USELRO) &&
2879                             !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) &&
2880                             (m->m_pkthdr.rcvif->if_type != IFT_LOOP) &&
2881                             ((th->th_seq - tp->irs) >
2882                             (tp->t_maxseg << lro_start)) &&
2883                             ((tp->t_idleat == 0) || ((th->th_seq -
2884                              tp->t_idleat) > (tp->t_maxseg << lro_start)))) {
2885                                 tp->t_flagsext |= TF_LRO_OFFLOADED;
2886                                 tcp_start_coalescing(ip, th, tlen);
2887                                 tp->t_idleat = 0;
2888                         }
2889
2890                         /* Clean receiver SACK report if present */
2891                         if (SACK_ENABLED(tp) && tp->rcv_numsacks)
2892                                 tcp_clean_sackreport(tp);
2893                         ++tcpstat.tcps_preddat;
2894                         tp->rcv_nxt += tlen;
2895                         /*
2896                          * Pull snd_wl1 up to prevent seq wrap relative to
2897                          * th_seq.
2898                          */
2899                         tp->snd_wl1 = th->th_seq;
2900                         /*
2901                          * Pull rcv_up up to prevent seq wrap relative to
2902                          * rcv_nxt.
2903                          */
2904                         tp->rcv_up = tp->rcv_nxt;
2905                         TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
2906                         tcpstat.tcps_rcvbyte += tlen;
2907                         if (nstat_collect) {
2908                                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2909                                         INP_ADD_STAT(inp, cell, wifi, wired,
2910                                             rxpackets, m->m_pkthdr.lro_npkts);
2911                                 } else {
2912                                         INP_ADD_STAT(inp, cell, wifi, wired,
2913                                             rxpackets, 1);
2914                                 }
2915                                 INP_ADD_STAT(inp, cell, wifi, wired,rxbytes,
2916                                     tlen);
2917                         }
2918
2919                         /*
2920                          * Calculate the RTT on the receiver only if the
2921                          * connection is in streaming mode and the last
2922                          * packet was not an end-of-write
2923                          */
2924                         if ((tp->t_flags & TF_STRETCHACK) &&
2925                                 !(tp->t_flagsext & TF_STREAMEOW))
2926                                 tcp_compute_rtt(tp, &to, th);
2927
2928                         tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
2929
2930                         /*
2931                          * Add data to socket buffer.
2932                          */
2933                         so_recv_data_stat(so, m, 0);
2934                         m_adj(m, drop_hdrlen);  /* delayed header drop */
2935
2936                         /*
2937                          * If message delivery (SOF_ENABLE_MSGS) is enabled on
2938                          * this socket, deliver the packet received as an
2939                          * in-order message with sequence number attached to it.
2940                          */
2941                         if (sbappendstream_rcvdemux(so, m,
2942                             th->th_seq - (tp->irs + 1), 0)) {
2943                                 sorwakeup(so);
2944                         }
2945 #if INET6
2946                         if (isipv6) {
2947                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2948                                         (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2949                                         th->th_seq, th->th_ack, th->th_win);
2950                         }
2951                         else
2952 #endif
2953                         {
2954                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2955                                         (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2956                                         th->th_seq, th->th_ack, th->th_win);
2957                         }
2958                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2959                         if (DELAY_ACK(tp, th))  {
2960                                 if ((tp->t_flags & TF_DELACK) == 0) {
2961                                         tp->t_flags |= TF_DELACK;
2962                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2963                                 }
2964                         } else {
2965                                 tp->t_flags |= TF_ACKNOW;
2966                                 tcp_output(tp);
2967                         }
2968
2969                         tcp_adaptive_rwtimo_check(tp, tlen);
2970
2971                         if (tlen > 0)
2972                                 tcp_tfo_rcv_data(tp);
2973
2974                         tcp_check_timer_state(tp);
2975                         tcp_unlock(so, 1, 0);
2976                         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2977                         return;
2978                 }
2979         }
2980
2981         /*
2982          * Calculate amount of space in receive window,
2983          * and then do TCP input processing.
2984          * Receive window is amount of space in rcv queue,
2985          * but not less than advertised window.
2986          */
2987         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2988             LCK_MTX_ASSERT_OWNED);
2989         win = tcp_sbspace(tp);
2990         if (win < 0)
2991                 win = 0;
2992         else {  /* clip rcv window to 4K for modems */
2993                 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2994                         win = min(win, slowlink_wsize);
2995         }
2996         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2997 #if MPTCP
2998         /*
2999          * Ensure that the subflow receive window isn't greater
3000          * than the connection level receive window.
3001          */
3002         if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
3003             (mp_tp = tptomptp(tp))) {
3004                 MPT_LOCK(mp_tp);
3005                 if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
3006                         tp->rcv_wnd = mp_tp->mpt_rcvwnd;
3007                         tcpstat.tcps_mp_reducedwin++;
3008                 }
3009                 MPT_UNLOCK(mp_tp);
3010         }
3011 #endif /* MPTCP */
3012
3013         switch (tp->t_state) {
3014
3015         /*
3016          * Initialize tp->rcv_nxt, and tp->irs, select an initial
3017          * tp->iss, and send a segment:
3018          *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3019          * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
3020          * Fill in remote peer address fields if not previously specified.
3021          * Enter SYN_RECEIVED state, and process any other fields of this
3022          * segment in this state.
3023          */
3024         case TCPS_LISTEN: {
3025                 register struct sockaddr_in *sin;
3026 #if INET6
3027                 register struct sockaddr_in6 *sin6;
3028 #endif
3029
3030                 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3031                     LCK_MTX_ASSERT_OWNED);
3032 #if INET6
3033                 if (isipv6) {
3034                         MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
3035                                M_SONAME, M_NOWAIT);
3036                         if (sin6 == NULL)
3037                                 goto drop;
3038                         bzero(sin6, sizeof(*sin6));
3039                         sin6->sin6_family = AF_INET6;
3040                         sin6->sin6_len = sizeof(*sin6);
3041                         sin6->sin6_addr = ip6->ip6_src;
3042                         sin6->sin6_port = th->th_sport;
3043                         laddr6 = inp->in6p_laddr;
3044                         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
3045                                 inp->in6p_laddr = ip6->ip6_dst;
3046                         if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
3047                                            proc0)) {
3048                                 inp->in6p_laddr = laddr6;
3049                                 FREE(sin6, M_SONAME);
3050                                 goto drop;
3051                         }
3052                         FREE(sin6, M_SONAME);
3053                 } else
3054 #endif
3055             {
3056                         lck_mtx_assert(
3057                             &((struct inpcb *)so->so_pcb)->inpcb_mtx,
3058                             LCK_MTX_ASSERT_OWNED);
3059                         MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
3060                        M_NOWAIT);
3061                         if (sin == NULL)
3062                                 goto drop;
3063                         sin->sin_family = AF_INET;
3064                         sin->sin_len = sizeof(*sin);
3065                         sin->sin_addr = ip->ip_src;
3066                         sin->sin_port = th->th_sport;
3067                         bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
3068                         laddr = inp->inp_laddr;
3069                         if (inp->inp_laddr.s_addr == INADDR_ANY)
3070                                 inp->inp_laddr = ip->ip_dst;
3071                         if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0,
3072                             IFSCOPE_NONE, NULL)) {
3073                                 inp->inp_laddr = laddr;
3074                                 FREE(sin, M_SONAME);
3075                                 goto drop;
3076                         }
3077                         FREE(sin, M_SONAME);
3078                 }
3079
3080                 tcp_dooptions(tp, optp, optlen, th, &to);
3081                 tcp_finalize_options(tp, &to, ifscope);
3082
3083                 if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to))
3084                         isconnected = TRUE;
3085
3086                 if (iss)
3087                         tp->iss = iss;
3088                 else {
3089                         tp->iss = tcp_new_isn(tp);
3090                 }
3091                 tp->irs = th->th_seq;
3092                 tcp_sendseqinit(tp);
3093                 tcp_rcvseqinit(tp);
3094                 tp->snd_recover = tp->snd_una;
3095                 /*
3096                  * Initialization of the tcpcb for transaction;
3097                  *   set SND.WND = SEG.WND,
3098                  *   initialize CCsend and CCrecv.
3099                  */
3100                 tp->snd_wnd = tiwin;    /* initial send-window */
3101                 tp->t_flags |= TF_ACKNOW;
3102                 tp->t_unacksegs = 0;
3103                 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3104                         struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3105                 tp->t_state = TCPS_SYN_RECEIVED;
3106                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3107                         TCP_CONN_KEEPINIT(tp));
3108                 dropsocket = 0;         /* committed to socket */
3109
3110                 if (inp->inp_flowhash == 0)
3111                         inp->inp_flowhash = inp_calc_flowhash(inp);
3112 #if INET6
3113                 /* update flowinfo - RFC 6437 */
3114                 if (inp->inp_flow == 0 &&
3115                     inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
3116                         inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
3117                         inp->inp_flow |=
3118                             (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
3119                 }
3120 #endif /* INET6 */
3121
3122                 /* reset the incomp processing flag */
3123                 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
3124                 tcpstat.tcps_accepts++;
3125                 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
3126                         /* ECN-setup SYN */
3127                         tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
3128                 }
3129
3130 #if CONFIG_IFEF_NOWINDOWSCALE
3131                 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
3132                     (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
3133                         /* Window scaling is not enabled on this interface */
3134                         tp->t_flags &= ~TF_REQ_SCALE;
3135                 }
3136 #endif
3137                 goto trimthenstep6;
3138                 }
3139
3140         /*
3141          * If the state is SYN_RECEIVED and the seg contains an ACK,
3142          * but not for our SYN/ACK, send a RST.
3143          */
3144         case TCPS_SYN_RECEIVED:
3145                 if ((thflags & TH_ACK) &&
3146                     (SEQ_LEQ(th->th_ack, tp->snd_una) ||
3147                      SEQ_GT(th->th_ack, tp->snd_max))) {
3148                                 rstreason = BANDLIM_RST_OPENPORT;
3149                                 IF_TCP_STATINC(ifp, ooopacket);
3150                                 goto dropwithreset;
3151                 }
3152
3153                 /*
3154                  * In SYN_RECEIVED state, if we recv some SYNS with
3155                  * window scale and others without, window scaling should
3156                  * be disabled. Otherwise the window advertised will be
3157                  * lower if we assume scaling and the other end does not.
3158                  */
3159                 if ((thflags & TH_SYN) &&
3160                     (tp->irs == th->th_seq) &&
3161                     !(to.to_flags & TOF_SCALE))
3162                         tp->t_flags &= ~TF_RCVD_SCALE;
3163                 break;
3164
3165         /*
3166          * If the state is SYN_SENT:
3167          *      if seg contains an ACK, but not for our SYN, drop the input.
3168          *      if seg contains a RST, then drop the connection.
3169          *      if seg does not contain SYN, then drop it.
3170          * Otherwise this is an acceptable SYN segment
3171          *      initialize tp->rcv_nxt and tp->irs
3172          *      if seg contains ack then advance tp->snd_una
3173          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
3174          *      arrange for segment to be acked (eventually)
3175          *      continue processing rest of data/controls, beginning with URG
3176          */
3177         case TCPS_SYN_SENT:
3178                 if ((thflags & TH_ACK) &&
3179                     (SEQ_LEQ(th->th_ack, tp->iss) ||
3180                      SEQ_GT(th->th_ack, tp->snd_max))) {
3181                         rstreason = BANDLIM_UNLIMITED;
3182                         IF_TCP_STATINC(ifp, ooopacket);
3183                         goto dropwithreset;
3184                 }
3185                 if (thflags & TH_RST) {
3186                         if ((thflags & TH_ACK) != 0) {
3187 #if MPTCP
3188                                 if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
3189                                         SEQ_GT(th->th_ack, tp->iss+1)) {
3190                                         so->so_flags &= ~SOF_MPTCP_FASTJOIN;
3191                                         /* ignore the RST and retransmit SYN */
3192                                         goto drop;
3193                                 }
3194 #endif /* MPTCP */
3195                                 soevent(so,
3196                                     (SO_FILT_HINT_LOCKED |
3197                                     SO_FILT_HINT_CONNRESET));
3198                                 tp = tcp_drop(tp, ECONNREFUSED);
3199                                 postevent(so, 0, EV_RESET);
3200                         }
3201                         goto drop;
3202                 }
3203                 if ((thflags & TH_SYN) == 0)
3204                         goto drop;
3205                 tp->snd_wnd = th->th_win;       /* initial send window */
3206
3207                 tp->irs = th->th_seq;
3208                 tcp_rcvseqinit(tp);
3209                 if (thflags & TH_ACK) {
3210                         tcpstat.tcps_connects++;
3211
3212                         if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
3213                                 /* ECN-setup SYN-ACK */
3214                                 tp->ecn_flags |= TE_SETUPRECEIVED;
3215                                 if (TCP_ECN_ENABLED(tp)) {
3216                                         tcp_heuristic_reset_loss(tp, 0, 1);
3217                                         tcpstat.tcps_ecn_client_success++;
3218                                 }
3219                         } else {
3220                                 if (tp->ecn_flags & TE_SETUPSENT &&
3221                                     tp->t_rxtshift == 0) {
3222                                         tcp_heuristic_reset_loss(tp, 0, 1);
3223                                         tcpstat.tcps_ecn_not_supported++;
3224                                 }
3225                                 if (tp->ecn_flags & TE_SETUPSENT &&
3226                                     tp->t_rxtshift > 0)
3227                                         tcp_heuristic_inc_loss(tp, 0, 1);
3228
3229                                 /* non-ECN-setup SYN-ACK */
3230                                 tp->ecn_flags &= ~TE_SENDIPECT;
3231                         }
3232
3233 #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
3234                         /* XXXMAC: recursive lock: SOCK_LOCK(so); */
3235                         mac_socketpeer_label_associate_mbuf(m, so);
3236                         /* XXXMAC: SOCK_UNLOCK(so); */
3237 #endif
3238                         /* Do window scaling on this connection? */
3239                         if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3240                                 tp->snd_scale = tp->requested_s_scale;
3241                                 tp->rcv_scale = tp->request_r_scale;
3242                         }
3243
3244                         tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
3245                         tp->snd_una++;          /* SYN is acked */
3246                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3247                                 tp->snd_nxt = tp->snd_una;
3248
3249                         /*
3250                          * We have sent more in the SYN than what is being
3251                          * acked. (e.g., TFO)
3252                          * We should restart the sending from what the receiver
3253                          * has acknowledged immediately.
3254                          */
3255                         if (SEQ_GT(tp->snd_nxt, th->th_ack))
3256                                 tp->snd_nxt = th->th_ack;
3257
3258                         /*
3259                          * If there's data, delay ACK; if there's also a FIN
3260                          * ACKNOW will be turned on later.
3261                          */
3262                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
3263                         if (DELAY_ACK(tp, th) && tlen != 0 ) {
3264                                 if ((tp->t_flags & TF_DELACK) == 0) {
3265                                         tp->t_flags |= TF_DELACK;
3266                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3267                                 }
3268                         }
3269                         else {
3270                                 tp->t_flags |= TF_ACKNOW;
3271                         }
3272                         /*
3273                          * Received <SYN,ACK> in SYN_SENT[*] state.
3274                          * Transitions:
3275                          *      SYN_SENT  --> ESTABLISHED
3276                          *      SYN_SENT* --> FIN_WAIT_1
3277                          */
3278                         tp->t_starttime = tcp_now;
3279                         tcp_sbrcv_tstmp_check(tp);
3280                         if (tp->t_flags & TF_NEEDFIN) {
3281                                 DTRACE_TCP4(state__change, void, NULL,
3282                                     struct inpcb *, inp,
3283                                     struct tcpcb *, tp, int32_t,
3284                                     TCPS_FIN_WAIT_1);
3285                                 tp->t_state = TCPS_FIN_WAIT_1;
3286                                 tp->t_flags &= ~TF_NEEDFIN;
3287                                 thflags &= ~TH_SYN;
3288                         } else {
3289                                 DTRACE_TCP4(state__change, void, NULL,
3290                                     struct inpcb *, inp, struct tcpcb *,
3291                                     tp, int32_t, TCPS_ESTABLISHED);
3292                                 tp->t_state = TCPS_ESTABLISHED;
3293                                 tp->t_timer[TCPT_KEEP] =
3294                                     OFFSET_FROM_START(tp,
3295                                     TCP_CONN_KEEPIDLE(tp));
3296                                 if (nstat_collect)
3297                                         nstat_route_connect_success(
3298                                             tp->t_inpcb->inp_route.ro_rt);
3299                         }
3300 #if MPTCP
3301                         /*
3302                          * Do not send the connect notification for additional
3303                          * subflows until ACK for 3-way handshake arrives.
3304                          */
3305                         if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3306                             (tp->t_mpflags & TMPF_SENT_JOIN)) {
3307                                 isconnected = FALSE;
3308                                 /* Start data xmit if fastjoin */
3309                                 if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) {
3310                                         soevent(so, (SO_FILT_HINT_LOCKED |
3311                                             SO_FILT_HINT_MPFASTJ));
3312                                 }
3313                         } else
3314 #endif /* MPTCP */
3315                                 isconnected = TRUE;
3316
3317                         if (tp->t_tfo_flags & (TFO_F_COOKIE_REQ | TFO_F_COOKIE_SENT)) {
3318                                 tcp_tfo_synack(tp, &to);
3319
3320                                 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
3321                                     SEQ_LT(tp->snd_una, th->th_ack)) {
3322                                         tp->t_tfo_stats |= TFO_S_SYN_DATA_ACKED;
3323                                         tcpstat.tcps_tfo_syn_data_acked++;
3324
3325                                         if (!(tp->t_tfo_flags & TFO_F_NO_RCVPROBING))
3326                                                 tcp_tfo_rcv_probe(tp, tlen);
3327                                 }
3328                         }
3329                 } else {
3330                         /*
3331                          *  Received initial SYN in SYN-SENT[*] state => simul-
3332                          *  taneous open.  If segment contains CC option and there is
3333                          *  a cached CC, apply TAO test; if it succeeds, connection is
3334                          *  half-synchronized.  Otherwise, do 3-way handshake:
3335                          *        SYN-SENT -> SYN-RECEIVED
3336                          *        SYN-SENT* -> SYN-RECEIVED*
3337                          */
3338                         tp->t_flags |= TF_ACKNOW;
3339                         tp->t_timer[TCPT_REXMT] = 0;
3340                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3341                                 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3342                         tp->t_state = TCPS_SYN_RECEIVED;
3343
3344                         /*
3345                          * During simultaneous open, TFO should not be used.
3346                          * So, we disable it here, to prevent that data gets
3347                          * sent on the SYN/ACK.
3348                          */
3349                         tcp_disable_tfo(tp);
3350                 }
3351
3352 trimthenstep6:
3353                 /*
3354                  * Advance th->th_seq to correspond to first data byte.
3355                  * If data, trim to stay within window,
3356                  * dropping FIN if necessary.
3357                  */
3358                 th->th_seq++;
3359                 if (tlen > tp->rcv_wnd) {
3360                         todrop = tlen - tp->rcv_wnd;
3361                         m_adj(m, -todrop);
3362                         tlen = tp->rcv_wnd;
3363                         thflags &= ~TH_FIN;
3364                         tcpstat.tcps_rcvpackafterwin++;
3365                         tcpstat.tcps_rcvbyteafterwin += todrop;
3366                 }
3367                 tp->snd_wl1 = th->th_seq - 1;
3368                 tp->rcv_up = th->th_seq;
3369                 /*
3370                  *  Client side of transaction: already sent SYN and data.
3371                  *  If the remote host used T/TCP to validate the SYN,
3372                  *  our data will be ACK'd; if so, enter normal data segment
3373                  *  processing in the middle of step 5, ack processing.
3374                  *  Otherwise, goto step 6.
3375                  */
3376                 if (thflags & TH_ACK)
3377                         goto process_ACK;
3378                 goto step6;
3379         /*
3380          * If the state is LAST_ACK or CLOSING or TIME_WAIT:
3381          *      do normal processing.
3382          *
3383          * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
3384          */
3385         case TCPS_LAST_ACK:
3386         case TCPS_CLOSING:
3387         case TCPS_TIME_WAIT:
3388                 break;  /* continue normal processing */
3389
3390         /* Received a SYN while connection is already established.
3391          * This is a "half open connection and other anomalies" described
3392          * in RFC793 page 34, send an ACK so the remote resets the connection
3393          * or recovers by adjusting its sequence numbering
3394          */
3395         case TCPS_ESTABLISHED:
3396                 if (thflags & TH_SYN)
3397                         goto dropafterack;
3398                 break;
3399         }
3400
3401         /*
3402          * States other than LISTEN or SYN_SENT.
3403          * First check the RST flag and sequence number since reset segments
3404          * are exempt from the timestamp and connection count tests.  This
3405          * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
3406          * below which allowed reset segments in half the sequence space
3407          * to fall through and be processed (which gives forged reset
3408          * segments with a random sequence number a 50 percent chance of
3409          * killing a connection).
3410          * Then check timestamp, if present.
3411          * Then check the connection count, if present.
3412          * Then check that at least some bytes of segment are within
3413          * receive window.  If segment begins before rcv_nxt,
3414          * drop leading data (and SYN); if nothing left, just ack.
3415          *
3416          *
3417          * If the RST bit is set, check the sequence number to see
3418          * if this is a valid reset segment.
3419          * RFC 793 page 37:
3420          *   In all states except SYN-SENT, all reset (RST) segments
3421          *   are validated by checking their SEQ-fields.  A reset is
3422          *   valid if its sequence number is in the window.
3423          * Note: this does not take into account delayed ACKs, so
3424          *   we should test against last_ack_sent instead of rcv_nxt.
3425          *   The sequence number in the reset segment is normally an
3426          *   echo of our outgoing acknowledgement numbers, but some hosts
3427          *   send a reset with the sequence number at the rightmost edge
3428          *   of our receive window, and we have to handle this case.
3429          * Note 2: Paul Watson's paper "Slipping in the Window" has shown
3430          *   that brute force RST attacks are possible.  To combat this,
3431          *   we use a much stricter check while in the ESTABLISHED state,
3432          *   only accepting RSTs where the sequence number is equal to
3433          *   last_ack_sent.  In all other states (the states in which a
3434          *   RST is more likely), the more permissive check is used.
3435          * If we have multiple segments in flight, the initial reset
3436          * segment sequence numbers will be to the left of last_ack_sent,
3437          * but they will eventually catch up.
3438          * In any case, it never made sense to trim reset segments to
3439          * fit the receive window since RFC 1122 says:
3440          *   4.2.2.12  RST Segment: RFC-793 Section 3.4
3441          *
3442          *    A TCP SHOULD allow a received RST segment to include data.
3443          *
3444          *    DISCUSSION
3445          *         It has been suggested that a RST segment could contain
3446          *         ASCII text that encoded and explained the cause of the
3447          *         RST.  No standard has yet been established for such
3448          *         data.
3449          *
3450          * If the reset segment passes the sequence number test examine
3451          * the state:
3452          *    SYN_RECEIVED STATE:
3453          *      If passive open, return to LISTEN state.
3454          *      If active open, inform user that connection was refused.
3455          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
3456          *      Inform user that connection was reset, and close tcb.
3457          *    CLOSING, LAST_ACK STATES:
3458          *      Close the tcb.
3459          *    TIME_WAIT STATE:
3460          *      Drop the segment - see Stevens, vol. 2, p. 964 and
3461          *      RFC 1337.
3462          *
3463          *      Radar 4803931: Allows for the case where we ACKed the FIN but
3464          *                     there is already a RST in flight from the peer.
3465          *                     In that case, accept the RST for non-established
3466          *                     state if it's one off from last_ack_sent.
3467
3468          */
3469         if (thflags & TH_RST) {
3470                 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
3471                     SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
3472                     (tp->rcv_wnd == 0 &&
3473                     ((tp->last_ack_sent == th->th_seq) ||
3474                     ((tp->last_ack_sent -1) == th->th_seq)))) {
3475                         switch (tp->t_state) {
3476
3477                         case TCPS_SYN_RECEIVED:
3478                                 IF_TCP_STATINC(ifp, rstinsynrcv);
3479                                 so->so_error = ECONNREFUSED;
3480                                 goto close;
3481
3482                         case TCPS_ESTABLISHED:
3483                                 if (tp->last_ack_sent != th->th_seq) {
3484                                         tcpstat.tcps_badrst++;
3485                                         goto drop;
3486                                 }
3487                         case TCPS_FIN_WAIT_1:
3488                         case TCPS_CLOSE_WAIT:
3489                                 /*
3490                                   Drop through ...
3491                                 */
3492                         case TCPS_FIN_WAIT_2:
3493                                 so->so_error = ECONNRESET;
3494                         close:
3495                                 postevent(so, 0, EV_RESET);
3496                                 soevent(so,
3497                                     (SO_FILT_HINT_LOCKED |
3498                                     SO_FILT_HINT_CONNRESET));
3499
3500                                 tcpstat.tcps_drops++;
3501                                 tp = tcp_close(tp);
3502                                 break;
3503
3504                         case TCPS_CLOSING:
3505                         case TCPS_LAST_ACK:
3506                                 tp = tcp_close(tp);
3507                                 break;
3508
3509                         case TCPS_TIME_WAIT:
3510                                 break;
3511                         }
3512                 }
3513                 goto drop;
3514         }
3515
3516         /*
3517          * RFC 1323 PAWS: If we have a timestamp reply on this segment
3518          * and it's less than ts_recent, drop it.
3519          */
3520         if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
3521             TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3522
3523                 /* Check to see if ts_recent is over 24 days old.  */
3524                 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
3525                         /*
3526                          * Invalidate ts_recent.  If this segment updates
3527                          * ts_recent, the age will be reset later and ts_recent
3528                          * will get a valid value.  If it does not, setting
3529                          * ts_recent to zero will at least satisfy the
3530                          * requirement that zero be placed in the timestamp
3531                          * echo reply when ts_recent isn't valid.  The
3532                          * age isn't reset until we get a valid ts_recent
3533                          * because we don't want out-of-order segments to be
3534                          * dropped when ts_recent is old.
3535                          */
3536                         tp->ts_recent = 0;
3537                 } else {
3538                         tcpstat.tcps_rcvduppack++;
3539                         tcpstat.tcps_rcvdupbyte += tlen;
3540                         tp->t_pawsdrop++;
3541                         tcpstat.tcps_pawsdrop++;
3542
3543                         /*
3544                          * PAWS-drop when ECN is being used? That indicates
3545                          * that ECT-marked packets take a different path, with
3546                          * different congestion-characteristics.
3547                          *
3548                          * Only fallback when we did send less than 2GB as PAWS
3549                          * really has no reason to kick in earlier.
3550                          */
3551                         if (TCP_ECN_ENABLED(tp) &&
3552                             inp->inp_stat->rxbytes < 2147483648) {
3553                                 INP_INC_IFNET_STAT(inp, ecn_fallback_reorder);
3554                                 tcpstat.tcps_ecn_fallback_reorder++;
3555                                 tcp_heuristic_ecn_aggressive(tp);
3556                         }
3557
3558                         if (nstat_collect) {
3559                                 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
3560                                         1, tlen, NSTAT_RX_FLAG_DUPLICATE);
3561                                 INP_ADD_STAT(inp, cell, wifi, wired,
3562                                     rxpackets, 1);
3563                                 INP_ADD_STAT(inp, cell, wifi, wired,
3564                                     rxbytes, tlen);
3565                                 tp->t_stat.rxduplicatebytes += tlen;
3566                         }
3567                         if (tlen > 0)
3568                                 goto dropafterack;
3569                         goto drop;
3570                 }
3571         }
3572
3573         /*
3574          * In the SYN-RECEIVED state, validate that the packet belongs to
3575          * this connection before trimming the data to fit the receive
3576          * window.  Check the sequence number versus IRS since we know
3577          * the sequence numbers haven't wrapped.  This is a partial fix
3578          * for the "LAND" DoS attack.
3579          */
3580         if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
3581                 rstreason = BANDLIM_RST_OPENPORT;
3582                 IF_TCP_STATINC(ifp, dospacket);
3583                 goto dropwithreset;
3584         }
3585
3586         todrop = tp->rcv_nxt - th->th_seq;
3587         if (todrop > 0) {
3588                 if (thflags & TH_SYN) {
3589                         thflags &= ~TH_SYN;
3590                         th->th_seq++;
3591                         if (th->th_urp > 1)
3592                                 th->th_urp--;
3593                         else
3594                                 thflags &= ~TH_URG;
3595                         todrop--;
3596                 }
3597                 /*
3598                  * Following if statement from Stevens, vol. 2, p. 960.
3599                  */
3600                 if (todrop > tlen
3601                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
3602                         /*
3603                          * Any valid FIN must be to the left of the window.
3604                          * At this point the FIN must be a duplicate or out
3605                          * of sequence; drop it.
3606                          */
3607                         thflags &= ~TH_FIN;
3608
3609                         /*
3610                          * Send an ACK to resynchronize and drop any data.
3611                          * But keep on processing for RST or ACK.
3612                          */
3613                         tp->t_flags |= TF_ACKNOW;
3614                         if (todrop == 1) {
3615                                 /* This could be a keepalive */
3616                                 soevent(so, SO_FILT_HINT_LOCKED |
3617                                         SO_FILT_HINT_KEEPALIVE);
3618                         }
3619                         todrop = tlen;
3620                         tcpstat.tcps_rcvduppack++;
3621                         tcpstat.tcps_rcvdupbyte += todrop;
3622                 } else {
3623                         tcpstat.tcps_rcvpartduppack++;
3624                         tcpstat.tcps_rcvpartdupbyte += todrop;
3625                 }
3626
3627                 if (TCP_DSACK_ENABLED(tp) && todrop > 1) {
3628                         /*
3629                          * Note the duplicate data sequence space so that
3630                          * it can be reported in DSACK option.
3631                          */
3632                         tp->t_dsack_lseq = th->th_seq;
3633                         tp->t_dsack_rseq = th->th_seq + todrop;
3634                         tp->t_flags |= TF_ACKNOW;
3635                 }
3636                 if (nstat_collect) {
3637                         nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
3638                                 todrop, NSTAT_RX_FLAG_DUPLICATE);
3639                         INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
3640                         INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
3641                         tp->t_stat.rxduplicatebytes += todrop;
3642                 }
3643                 drop_hdrlen += todrop;  /* drop from the top afterwards */
3644                 th->th_seq += todrop;
3645                 tlen -= todrop;
3646                 if (th->th_urp > todrop)
3647                         th->th_urp -= todrop;
3648                 else {
3649                         thflags &= ~TH_URG;
3650                         th->th_urp = 0;
3651                 }
3652         }
3653
3654         /*
3655          * If new data are received on a connection after the user
3656          * processes are gone, then RST the other end.
3657          * Send also a RST when we received a data segment after we've
3658          * sent our FIN when the socket is defunct.
3659          * Note that an MPTCP subflow socket would have SS_NOFDREF set
3660          * by default so check to make sure that we test for SOF_MP_SUBFLOW
3661          * socket flag (which would be cleared when the socket is closed.)
3662          */
3663         if (!(so->so_flags & SOF_MP_SUBFLOW) && tlen &&
3664             (((so->so_state & SS_NOFDREF) &&
3665             tp->t_state > TCPS_CLOSE_WAIT) ||
3666             ((so->so_flags & SOF_DEFUNCT) &&
3667             tp->t_state > TCPS_FIN_WAIT_1))) {
3668                 tp = tcp_close(tp);
3669                 tcpstat.tcps_rcvafterclose++;
3670                 rstreason = BANDLIM_UNLIMITED;
3671                 IF_TCP_STATINC(ifp, cleanup);
3672                 goto dropwithreset;
3673         }
3674
3675         /*
3676          * If segment ends after window, drop trailing data
3677          * (and PUSH and FIN); if nothing left, just ACK.
3678          */
3679         todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
3680         if (todrop > 0) {
3681                 tcpstat.tcps_rcvpackafterwin++;
3682                 if (todrop >= tlen) {
3683                         tcpstat.tcps_rcvbyteafterwin += tlen;
3684                         /*
3685                          * If a new connection request is received
3686                          * while in TIME_WAIT, drop the old connection
3687                          * and start over if the sequence numbers
3688                          * are above the previous ones.
3689                          */
3690                         if (thflags & TH_SYN &&
3691                             tp->t_state == TCPS_TIME_WAIT &&
3692                             SEQ_GT(th->th_seq, tp->rcv_nxt)) {
3693                                 iss = tcp_new_isn(tp);
3694                                 tp = tcp_close(tp);
3695                                 tcp_unlock(so, 1, 0);
3696                                 goto findpcb;
3697                         }
3698                         /*
3699                          * If window is closed can only take segments at
3700                          * window edge, and have to drop data and PUSH from
3701                          * incoming segments.  Continue processing, but
3702                          * remember to ack.  Otherwise, drop segment
3703                          * and ack.
3704                          */
3705                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
3706                                 tp->t_flags |= TF_ACKNOW;
3707                                 tcpstat.tcps_rcvwinprobe++;
3708                         } else
3709                                 goto dropafterack;
3710                 } else
3711                         tcpstat.tcps_rcvbyteafterwin += todrop;
3712                 m_adj(m, -todrop);
3713                 tlen -= todrop;
3714                 thflags &= ~(TH_PUSH|TH_FIN);
3715         }
3716
3717         /*
3718          * If last ACK falls within this segment's sequence numbers,
3719          * record its timestamp.
3720          * NOTE:
3721          * 1) That the test incorporates suggestions from the latest
3722          *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
3723          * 2) That updating only on newer timestamps interferes with
3724          *    our earlier PAWS tests, so this check should be solely
3725          *    predicated on the sequence space of this segment.
3726          * 3) That we modify the segment boundary check to be
3727          *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
3728          *    instead of RFC1323's
3729          *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
3730          *    This modified check allows us to overcome RFC1323's
3731          *    limitations as described in Stevens TCP/IP Illustrated
3732          *    Vol. 2 p.869. In such cases, we can still calculate the
3733          *    RTT correctly when RCV.NXT == Last.ACK.Sent.
3734          */
3735         if ((to.to_flags & TOF_TS) != 0 &&
3736             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
3737             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
3738                 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
3739                 tp->ts_recent_age = tcp_now;
3740                 tp->ts_recent = to.to_tsval;
3741         }
3742
3743         /*
3744          * If a SYN is in the window, then this is an
3745          * error and we send an RST and drop the connection.
3746          */
3747         if (thflags & TH_SYN) {
3748                 tp = tcp_drop(tp, ECONNRESET);
3749                 rstreason = BANDLIM_UNLIMITED;
3750                 postevent(so, 0, EV_RESET);
3751                 IF_TCP_STATINC(ifp, synwindow);
3752                 goto dropwithreset;
3753         }
3754
3755         /*
3756          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
3757          * flag is on (half-synchronized state), then queue data for
3758          * later processing; else drop segment and return.
3759          */
3760         if ((thflags & TH_ACK) == 0) {
3761                 if (tp->t_state == TCPS_SYN_RECEIVED ||
3762                     (tp->t_flags & TF_NEEDSYN)) {
3763                         if ((tfo_enabled(tp))) {
3764                                 /*
3765                                  * So, we received a valid segment while in
3766                                  * SYN-RECEIVED (TF_NEEDSYN is actually never
3767                                  * set, so this is dead code).
3768                                  * As this cannot be an RST (see that if a bit
3769                                  * higher), and it does not have the ACK-flag
3770                                  * set, we want to retransmit the SYN/ACK.
3771                                  * Thus, we have to reset snd_nxt to snd_una to
3772                                  * trigger the going back to sending of the
3773                                  * SYN/ACK. This is more consistent with the
3774                                  * behavior of tcp_output(), which expects
3775                                  * to send the segment that is pointed to by
3776                                  * snd_nxt.
3777                                  */
3778                                 tp->snd_nxt = tp->snd_una;
3779
3780                                 /*
3781                                  * We need to make absolutely sure that we are
3782                                  * going to reply upon a duplicate SYN-segment.
3783                                  */
3784                                 if (th->th_flags & TH_SYN)
3785                                         needoutput = 1;
3786                         }
3787
3788                         goto step6;
3789                 } else if (tp->t_flags & TF_ACKNOW)
3790                         goto dropafterack;
3791                 else
3792                         goto drop;
3793         }
3794
3795         /*
3796          * Ack processing.
3797          */
3798
3799         switch (tp->t_state) {
3800
3801         /*
3802          * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
3803          * ESTABLISHED state and continue processing.
3804          * The ACK was checked above.
3805          */
3806         case TCPS_SYN_RECEIVED:
3807
3808                 tcpstat.tcps_connects++;
3809
3810                 /* Do window scaling? */
3811                 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3812                         tp->snd_scale = tp->requested_s_scale;
3813                         tp->rcv_scale = tp->request_r_scale;
3814                         tp->snd_wnd = th->th_win << tp->snd_scale;
3815                         tiwin = tp->snd_wnd;
3816                 }
3817                 /*
3818                  * Make transitions:
3819                  *      SYN-RECEIVED  -> ESTABLISHED
3820                  *      SYN-RECEIVED* -> FIN-WAIT-1
3821                  */
3822                 tp->t_starttime = tcp_now;
3823                 tcp_sbrcv_tstmp_check(tp);
3824                 if (tp->t_flags & TF_NEEDFIN) {
3825                         DTRACE_TCP4(state__change, void, NULL,
3826                             struct inpcb *, inp,
3827                             struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
3828                         tp->t_state = TCPS_FIN_WAIT_1;
3829                         tp->t_flags &= ~TF_NEEDFIN;
3830                 } else {
3831                         DTRACE_TCP4(state__change, void, NULL,
3832                             struct inpcb *, inp,
3833                             struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
3834                         tp->t_state = TCPS_ESTABLISHED;
3835                         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3836                                 TCP_CONN_KEEPIDLE(tp));
3837                         if (nstat_collect)
3838                                 nstat_route_connect_success(
3839                                     tp->t_inpcb->inp_route.ro_rt);
3840                 }
3841                 /*
3842                  * If segment contains data or ACK, will call tcp_reass()
3843                  * later; if not, do so now to pass queued data to user.
3844                  */
3845                 if (tlen == 0 && (thflags & TH_FIN) == 0)
3846                         (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
3847                             NULL, ifp);
3848                 tp->snd_wl1 = th->th_seq - 1;
3849
3850 #if MPTCP
3851                 /*
3852                  * Do not send the connect notification for additional subflows
3853                  * until ACK for 3-way handshake arrives.
3854                  */
3855                 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3856                     (tp->t_mpflags & TMPF_SENT_JOIN)) {
3857                         isconnected = FALSE;
3858                 } else
3859 #endif /* MPTCP */
3860                         isconnected = TRUE;
3861                 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
3862                         /* Done this when receiving the SYN */
3863                         isconnected = FALSE;
3864
3865                         OSDecrementAtomic(&tcp_tfo_halfcnt);
3866
3867                         /* Panic if something has gone terribly wrong. */
3868                         VERIFY(tcp_tfo_halfcnt >= 0);
3869
3870                         tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
3871                 }
3872
3873                 /*
3874                  * In case there is data in the send-queue (e.g., TFO is being
3875                  * used, or connectx+data has been done), then if we would
3876                  * "FALLTHROUGH", we would handle this ACK as if data has been
3877                  * acknowledged. But, we have to prevent this. And this
3878                  * can be prevented by increasing snd_una by 1, so that the
3879                  * SYN is not considered as data (snd_una++ is actually also
3880                  * done in SYN_SENT-state as part of the regular TCP stack).
3881                  *
3882                  * In case there is data on this ack as well, the data will be
3883                  * handled by the label "dodata" right after step6.
3884                  */
3885                 if (so->so_snd.sb_cc) {
3886                         tp->snd_una++;  /* SYN is acked */
3887                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3888                                 tp->snd_nxt = tp->snd_una;
3889
3890                         /*
3891                          * No duplicate-ACK handling is needed. So, we
3892                          * directly advance to processing the ACK (aka,
3893                          * updating the RTT estimation,...)
3894                          *
3895                          * But, we first need to handle eventual SACKs,
3896                          * because TFO will start sending data with the
3897                          * SYN/ACK, so it might be that the client
3898                          * includes a SACK with its ACK.
3899                          */
3900                         if (SACK_ENABLED(tp) &&
3901                             (to.to_nsacks > 0 ||
3902                              !TAILQ_EMPTY(&tp->snd_holes)))
3903                                 tcp_sack_doack(tp, &to, th,
3904                                     &sack_bytes_acked);
3905
3906                         goto process_ACK;
3907                 }
3908
3909                 /* FALLTHROUGH */
3910
3911         /*
3912          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
3913          * ACKs.  If the ack is in the range
3914          *      tp->snd_una < th->th_ack <= tp->snd_max
3915          * then advance tp->snd_una to th->th_ack and drop
3916          * data from the retransmission queue.  If this ACK reflects
3917          * more up to date window information we update our window information.
3918          */
3919         case TCPS_ESTABLISHED:
3920         case TCPS_FIN_WAIT_1:
3921         case TCPS_FIN_WAIT_2:
3922         case TCPS_CLOSE_WAIT:
3923         case TCPS_CLOSING:
3924         case TCPS_LAST_ACK:
3925         case TCPS_TIME_WAIT:
3926                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
3927                         tcpstat.tcps_rcvacktoomuch++;
3928                         goto dropafterack;
3929                 }
3930                 if (SACK_ENABLED(tp) && to.to_nsacks > 0) {
3931                         recvd_dsack = tcp_sack_process_dsack(tp, &to, th);
3932                         /*
3933                          * If DSACK is received and this packet has no
3934                          * other SACK information, it can be dropped.
3935                          * We do not want to treat it as a duplicate ack.
3936                          */
3937                         if (recvd_dsack &&
3938                             SEQ_LEQ(th->th_ack, tp->snd_una) &&
3939                             to.to_nsacks == 0) {
3940                                 tcp_bad_rexmt_check(tp, th, &to);
3941                                 goto drop;
3942                         }
3943                 }
3944
3945                 if (SACK_ENABLED(tp) &&
3946                     (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
3947                         tcp_sack_doack(tp, &to, th, &sack_bytes_acked);
3948
3949 #if MPTCP
3950                 if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) {
3951                         if (tp->t_mpflags & TMPF_PREESTABLISHED) {
3952                                 /* MP TCP establishment succeeded */
3953                                 tp->t_mpuna = 0;
3954                                 if (tp->t_mpflags & TMPF_JOINED_FLOW) {
3955                                         if (tp->t_mpflags & TMPF_SENT_JOIN) {
3956                                                 tp->t_mpflags &=
3957                                                     ~TMPF_PREESTABLISHED;
3958                                                 tp->t_mpflags |=
3959                                                     TMPF_MPTCP_TRUE;
3960                                                 so->so_flags |= SOF_MPTCP_TRUE;
3961                                                 mptcplog((LOG_DEBUG, "MPTCP "
3962                                                     "Sockets: %s \n",__func__),
3963                                                     MPTCP_SOCKET_DBG,
3964                                                     MPTCP_LOGLVL_LOG);
3965
3966                                                 tp->t_timer[TCPT_JACK_RXMT] = 0;
3967                                                 tp->t_mprxtshift = 0;
3968                                                 isconnected = TRUE;
3969                                         } else {
3970                                                 isconnected = FALSE;
3971                                         }
3972                                 } else {
3973                                         isconnected = TRUE;
3974                                         tp->t_mpflags &= ~TMPF_SENT_KEYS;
3975                                 }
3976                         }
3977                 }
3978 #endif /* MPTCP */
3979
3980                 tcp_tfo_rcv_ack(tp, th);
3981
3982                 /*
3983                  * If we have outstanding data (other than
3984                  * a window probe), this is a completely
3985                  * duplicate ack (ie, window info didn't
3986                  * change) and the ack is the biggest we've seen.
3987                  */
3988                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
3989                         if (tlen == 0 && tiwin == tp->snd_wnd) {
3990                                 /*
3991                                  * If both ends send FIN at the same time,
3992                                  * then the ack will be a duplicate ack
3993                                  * but we have to process the FIN. Check
3994                                  * for this condition and process the FIN
3995                                  * instead of the dupack
3996                                  */
3997                                 if ((thflags & TH_FIN) &&
3998                                     (tp->t_flags & TF_SENTFIN) &&
3999                                     !TCPS_HAVERCVDFIN(tp->t_state) &&
4000                                     (th->th_ack + 1) == tp->snd_max)
4001                                         break;
4002 process_dupack:
4003 #if MPTCP
4004                                 /*
4005                                  * MPTCP options that are ignored must
4006                                  * not be treated as duplicate ACKs.
4007                                  */
4008                                 if (to.to_flags & TOF_MPTCP) {
4009                                         goto drop;
4010                                 }
4011
4012                                 if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
4013                                         mptcplog((LOG_DEBUG, "MPTCP "
4014                                             "Sockets: bypass ack recovery\n"),
4015                                             MPTCP_SOCKET_DBG,
4016                                             MPTCP_LOGLVL_VERBOSE);
4017                                         break;
4018                                 }
4019 #endif /* MPTCP */
4020                                 /*
4021                                  * If a duplicate acknowledgement was seen
4022                                  * after ECN, it indicates packet loss in
4023                                  * addition to ECN. Reset INRECOVERY flag
4024                                  * so that we can process partial acks
4025                                  * correctly
4026                                  */
4027                                 if (tp->ecn_flags & TE_INRECOVERY)
4028                                         tp->ecn_flags &= ~TE_INRECOVERY;
4029
4030                                 tcpstat.tcps_rcvdupack++;
4031                                 ++tp->t_dupacks;
4032
4033                                 /*
4034                                  * Check if we need to reset the limit on
4035                                  * early retransmit
4036                                  */
4037                                 if (tp->t_early_rexmt_count > 0 &&
4038                                     TSTMP_GEQ(tcp_now,
4039                                     (tp->t_early_rexmt_win +
4040                                     TCP_EARLY_REXMT_WIN)))
4041                                         tp->t_early_rexmt_count = 0;
4042
4043                                 /*
4044                                  * Is early retransmit needed? We check for
4045                                  * this when the connection is waiting for
4046                                  * duplicate acks to enter fast recovery.
4047                                  */
4048                                 if (!IN_FASTRECOVERY(tp))
4049                                         tcp_early_rexmt_check(tp, th);
4050
4051                                 /*
4052                                  * If we've seen exactly rexmt threshold
4053                                  * of duplicate acks, assume a packet
4054                                  * has been dropped and retransmit it.
4055                                  * Kludge snd_nxt & the congestion
4056                                  * window so we send only this one
4057                                  * packet.
4058                                  *
4059                                  * We know we're losing at the current
4060                                  * window size so do congestion avoidance
4061                                  * (set ssthresh to half the current window
4062                                  * and pull our congestion window back to
4063                                  * the new ssthresh).
4064                                  *
4065                                  * Dup acks mean that packets have left the
4066                                  * network (they're now cached at the receiver)
4067                                  * so bump cwnd by the amount in the receiver
4068                                  * to keep a constant cwnd packets in the
4069                                  * network.
4070                                  */
4071                                 if (tp->t_timer[TCPT_REXMT] == 0 ||
4072                                     (th->th_ack != tp->snd_una
4073                                     && sack_bytes_acked == 0)) {
4074                                         tp->t_dupacks = 0;
4075                                         tp->t_rexmtthresh = tcprexmtthresh;
4076                                 } else if (tp->t_dupacks > tp->t_rexmtthresh ||
4077                                         IN_FASTRECOVERY(tp)) {
4078
4079                                         /*
4080                                          * If this connection was seeing packet
4081                                          * reordering, then recovery might be
4082                                          * delayed to disambiguate between
4083                                          * reordering and loss
4084                                          */
4085                                         if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
4086                                             (tp->t_flagsext &
4087                                             (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) ==
4088                                             (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4089                                                 /*
4090                                                  * Since the SACK information is already
4091                                                  * updated, this ACK will be dropped
4092                                                  */
4093                                                 break;
4094                                         }
4095
4096                                         if (SACK_ENABLED(tp)
4097                                             && IN_FASTRECOVERY(tp)) {
4098                                                 int awnd;
4099
4100                                                 /*
4101                                                  * Compute the amount of data in flight first.
4102                                                  * We can inject new data into the pipe iff
4103                                                  * we have less than 1/2 the original window's
4104                                                  * worth of data in flight.
4105                                                  */
4106                                                 awnd = (tp->snd_nxt - tp->snd_fack) +
4107                                                         tp->sackhint.sack_bytes_rexmit;
4108                                                 if (awnd < tp->snd_ssthresh) {
4109                                                         tp->snd_cwnd += tp->t_maxseg;
4110                                                         if (tp->snd_cwnd > tp->snd_ssthresh)
4111                                                                 tp->snd_cwnd = tp->snd_ssthresh;
4112                                                 }
4113                                         } else
4114                                                 tp->snd_cwnd += tp->t_maxseg;
4115
4116                                         tcp_ccdbg_trace(tp, th, TCP_CC_IN_FASTRECOVERY);
4117
4118                                         (void) tcp_output(tp);
4119                                         goto drop;
4120                                 } else if (tp->t_dupacks == tp->t_rexmtthresh) {
4121                                         tcp_seq onxt = tp->snd_nxt;
4122
4123                                         /*
4124                                          * If we're doing sack, check to
4125                                          * see if we're already in sack
4126                                          * recovery. If we're not doing sack,
4127                                          * check to see if we're in newreno
4128                                          * recovery.
4129                                          */
4130                                         if (SACK_ENABLED(tp)) {
4131                                                 if (IN_FASTRECOVERY(tp)) {
4132                                                         tp->t_dupacks = 0;
4133                                                         break;
4134                                                 } else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
4135                                                         break;
4136                                                 }
4137                                         } else {
4138                                                 if (SEQ_LEQ(th->th_ack,
4139                                                     tp->snd_recover)) {
4140                                                         tp->t_dupacks = 0;
4141                                                         break;
4142                                                 }
4143                                         }
4144                                         if (tp->t_flags & TF_SENTFIN)
4145                                                 tp->snd_recover = tp->snd_max - 1;
4146                                         else
4147                                                 tp->snd_recover = tp->snd_max;
4148                                         tp->t_timer[TCPT_PTO] = 0;
4149                                         tp->t_rtttime = 0;
4150
4151                                         /*
4152                                          * If the connection has seen pkt
4153                                          * reordering, delay recovery until
4154                                          * it is clear that the packet
4155                                          * was lost.
4156                                          */
4157                                         if (SACK_ENABLED(tp) &&
4158                                             (tp->t_flagsext &
4159                                             (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4160                                             == TF_PKTS_REORDERED &&
4161                                             !IN_FASTRECOVERY(tp) &&
4162                                             tp->t_reorderwin > 0 &&
4163                                             (tp->t_state == TCPS_ESTABLISHED ||
4164                                             tp->t_state == TCPS_FIN_WAIT_1)) {
4165                                                 tp->t_timer[TCPT_DELAYFR] =
4166                                                     OFFSET_FROM_START(tp,
4167                                                     tp->t_reorderwin);
4168                                                 tp->t_flagsext |= TF_DELAY_RECOVERY;
4169                                                 tcpstat.tcps_delay_recovery++;
4170                                                 tcp_ccdbg_trace(tp, th,
4171                                                     TCP_CC_DELAY_FASTRECOVERY);
4172                                                 break;
4173                                         }
4174
4175                                         tcp_rexmt_save_state(tp);
4176                                         /*
4177                                          * If the current tcp cc module has
4178                                          * defined a hook for tasks to run
4179                                          * before entering FR, call it
4180                                          */
4181                                         if (CC_ALGO(tp)->pre_fr != NULL)
4182                                                 CC_ALGO(tp)->pre_fr(tp);
4183                                         ENTER_FASTRECOVERY(tp);
4184                                         tp->t_timer[TCPT_REXMT] = 0;
4185                                         if (TCP_ECN_ENABLED(tp))
4186                                                 tp->ecn_flags |= TE_SENDCWR;
4187
4188                                         if (SACK_ENABLED(tp)) {
4189                                                 tcpstat.tcps_sack_recovery_episode++;
4190                                                 tp->t_sack_recovery_episode++;
4191                                                 tp->sack_newdata = tp->snd_nxt;
4192                                                 tp->snd_cwnd = tp->t_maxseg;
4193                                                 tp->t_flagsext &=
4194                                                     ~TF_CWND_NONVALIDATED;
4195                                                 tcp_ccdbg_trace(tp, th,
4196                                                     TCP_CC_ENTER_FASTRECOVERY);
4197                                                 (void) tcp_output(tp);
4198                                                 goto drop;
4199                                         }
4200                                         tp->snd_nxt = th->th_ack;
4201                                         tp->snd_cwnd = tp->t_maxseg;
4202                                         (void) tcp_output(tp);
4203                                         if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
4204                                                 tcp_cc_adjust_nonvalidated_cwnd(tp);
4205                                         } else {
4206                                                 tp->snd_cwnd = tp->snd_ssthresh +
4207                                                      tp->t_maxseg * tp->t_dupacks;
4208                                         }
4209                                         if (SEQ_GT(onxt, tp->snd_nxt))
4210                                                 tp->snd_nxt = onxt;
4211                                         tcp_ccdbg_trace(tp, th,
4212                                             TCP_CC_ENTER_FASTRECOVERY);
4213                                         goto drop;
4214                                 } else if (limited_txmt &&
4215                                         ALLOW_LIMITED_TRANSMIT(tp) &&
4216                                         (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
4217                                         (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
4218                                         u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
4219
4220                                         /* Use Limited Transmit algorithm on the first two
4221                                          * duplicate acks when there is new data to transmit
4222                                          */
4223                                         tp->snd_cwnd += incr;
4224                                         tcpstat.tcps_limited_txt++;
4225                                         (void) tcp_output(tp);
4226
4227                                         tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
4228
4229                                         /* Reset snd_cwnd back to normal */
4230                                         tp->snd_cwnd -= incr;
4231                                 }
4232                         } else {
4233                                 tp->t_dupacks = 0;
4234                                 tp->t_rexmtthresh = tcprexmtthresh;
4235                         }
4236                         break;
4237                 }
4238                 /*
4239                  * If the congestion window was inflated to account
4240                  * for the other side's cached packets, retract it.
4241                  */
4242                 if (IN_FASTRECOVERY(tp)) {
4243                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4244                                 /*
4245                                  * If we received an ECE and entered
4246                                  * recovery, the subsequent ACKs should
4247                                  * not be treated as partial acks.
4248                                  */
4249                                 if (tp->ecn_flags & TE_INRECOVERY)
4250                                         goto process_ACK;
4251
4252                                 if (SACK_ENABLED(tp))
4253                                         tcp_sack_partialack(tp, th);
4254                                 else
4255                                         tcp_newreno_partial_ack(tp, th);
4256                                 tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
4257                         } else {
4258                                 EXIT_FASTRECOVERY(tp);
4259                                 if (CC_ALGO(tp)->post_fr != NULL)
4260                                         CC_ALGO(tp)->post_fr(tp, th);
4261                                 tp->t_pipeack = 0;
4262                                 tcp_clear_pipeack_state(tp);
4263                                 tcp_ccdbg_trace(tp, th,
4264                                     TCP_CC_EXIT_FASTRECOVERY);
4265                         }
4266                 } else if ((tp->t_flagsext &
4267                         (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4268                         == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4269                         /*
4270                          * If the ack acknowledges up to snd_recover or if
4271                          * it acknowledges all the snd holes, exit
4272                          * recovery and cancel the timer. Otherwise,
4273                          * this is a partial ack. Wait for recovery timer
4274                          * to enter recovery. The snd_holes have already
4275                          * been updated.
4276                          */
4277                         if (SEQ_GEQ(th->th_ack, tp->snd_recover) ||
4278                             TAILQ_EMPTY(&tp->snd_holes)) {
4279                                 tp->t_timer[TCPT_DELAYFR] = 0;
4280                                 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
4281                                 EXIT_FASTRECOVERY(tp);
4282                                 tcp_ccdbg_trace(tp, th,
4283                                     TCP_CC_EXIT_FASTRECOVERY);
4284                         }
4285                 } else {
4286                         /*
4287                          * We were not in fast recovery. Reset the
4288                          * duplicate ack counter.
4289                          */
4290                         tp->t_dupacks = 0;
4291                         tp->t_rexmtthresh = tcprexmtthresh;
4292                 }
4293
4294
4295                 /*
4296                  * If we reach this point, ACK is not a duplicate,
4297                  *     i.e., it ACKs something we sent.
4298                  */
4299                 if (tp->t_flags & TF_NEEDSYN) {
4300                         /*
4301                          * T/TCP: Connection was half-synchronized, and our
4302                          * SYN has been ACK'd (so connection is now fully
4303                          * synchronized).  Go to non-starred state,
4304                          * increment snd_una for ACK of SYN, and check if
4305                          * we can do window scaling.
4306                          */
4307                         tp->t_flags &= ~TF_NEEDSYN;
4308                         tp->snd_una++;
4309                         /* Do window scaling? */
4310                         if (TCP_WINDOW_SCALE_ENABLED(tp)) {
4311                                 tp->snd_scale = tp->requested_s_scale;
4312                                 tp->rcv_scale = tp->request_r_scale;
4313                         }
4314                 }
4315
4316 process_ACK:
4317                 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
4318                 acked = BYTES_ACKED(th, tp);
4319                 tcpstat.tcps_rcvackpack++;
4320                 tcpstat.tcps_rcvackbyte += acked;
4321
4322                 /*
4323                  * If the last packet was a retransmit, make sure
4324                  * it was not spurious.
4325                  *
4326                  * This will also take care of congestion window
4327                  * adjustment if a last packet was recovered due to a
4328                  * tail loss probe.
4329                  */
4330                 tcp_bad_rexmt_check(tp, th, &to);
4331
4332                 /* Recalculate the RTT */
4333                 tcp_compute_rtt(tp, &to, th);
4334
4335                 /*
4336                  * If all outstanding data is acked, stop retransmit
4337                  * timer and remember to restart (more output or persist).
4338                  * If there is more data to be acked, restart retransmit
4339                  * timer, using current (possibly backed-off) value.
4340                  */
4341                 if (th->th_ack == tp->snd_max) {
4342                         tp->t_timer[TCPT_REXMT] = 0;
4343                         tp->t_timer[TCPT_PTO] = 0;
4344                         needoutput = 1;
4345                 } else if (tp->t_timer[TCPT_PERSIST] == 0)
4346                         tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
4347                             tp->t_rxtcur);
4348
4349                 /*
4350                  * If no data (only SYN) was ACK'd, skip rest of ACK
4351                  * processing.
4352                  */
4353                 if (acked == 0)
4354                         goto step6;
4355
4356                 /*
4357                  * When outgoing data has been acked (except the SYN+data), we
4358                  * mark this connection as "sending good" for TFO.
4359                  */
4360                 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
4361                     !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
4362                     !(th->th_flags & TH_SYN))
4363                         tcp_heuristic_tfo_snd_good(tp);
4364
4365                 /*
4366                  * If TH_ECE is received, make sure that ECN is enabled
4367                  * on that connection and we have sent ECT on data packets.
4368                  */
4369                 if ((thflags & TH_ECE) != 0 && TCP_ECN_ENABLED(tp) &&
4370                     (tp->ecn_flags & TE_SENDIPECT)) {
4371                         /*
4372                          * Reduce the congestion window if we haven't
4373                          * done so.
4374                          */
4375                         if (!IN_FASTRECOVERY(tp)) {
4376                                 tcp_reduce_congestion_window(tp);
4377                                 tp->ecn_flags |= (TE_INRECOVERY|TE_SENDCWR);
4378                                 /*
4379                                  * Also note that the connection received
4380                                  * ECE at least once
4381                                  */
4382                                 tp->ecn_flags |= TE_RECV_ECN_ECE;
4383                                 INP_INC_IFNET_STAT(inp, ecn_recv_ece);
4384                                 tcpstat.tcps_ecn_recv_ece++;
4385                                 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
4386                         }
4387                 }
4388
4389                 /*
4390                  * When new data is acked, open the congestion window.
4391                  * The specifics of how this is achieved are up to the
4392                  * congestion control algorithm in use for this connection.
4393                  *
4394                  * The calculations in this function assume that snd_una is
4395                  * not updated yet.
4396                  */
4397                 if (!IN_FASTRECOVERY(tp)) {
4398                         if (CC_ALGO(tp)->ack_rcvd != NULL)
4399                                 CC_ALGO(tp)->ack_rcvd(tp, th);
4400                         tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
4401                 }
4402                 if (acked > so->so_snd.sb_cc) {
4403                         tp->snd_wnd -= so->so_snd.sb_cc;
4404                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
4405                         if (so->so_flags & SOF_ENABLE_MSGS) {
4406                                 so->so_msg_state->msg_serial_bytes -=
4407                                         (int)so->so_snd.sb_cc;
4408                         }
4409                         ourfinisacked = 1;
4410                 } else {
4411                         sbdrop(&so->so_snd, acked);
4412                         if (so->so_flags & SOF_ENABLE_MSGS) {
4413                                 so->so_msg_state->msg_serial_bytes -=
4414                                         acked;
4415                         }
4416                         tcp_sbsnd_trim(&so->so_snd);
4417                         tp->snd_wnd -= acked;
4418                         ourfinisacked = 0;
4419                 }
4420                 /* detect una wraparound */
4421                 if ( !IN_FASTRECOVERY(tp) &&
4422                     SEQ_GT(tp->snd_una, tp->snd_recover) &&
4423                     SEQ_LEQ(th->th_ack, tp->snd_recover))
4424                         tp->snd_recover = th->th_ack - 1;
4425
4426                 if (IN_FASTRECOVERY(tp) &&
4427                     SEQ_GEQ(th->th_ack, tp->snd_recover))
4428                         EXIT_FASTRECOVERY(tp);
4429
4430                 tp->snd_una = th->th_ack;
4431                 if (SACK_ENABLED(tp)) {
4432                         if (SEQ_GT(tp->snd_una, tp->snd_recover))
4433                                 tp->snd_recover = tp->snd_una;
4434                 }
4435                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
4436                         tp->snd_nxt = tp->snd_una;
4437                 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
4438                     !TCP_DSACK_SEQ_IN_WINDOW(tp, tp->t_dsack_lastuna,
4439                     tp->snd_una))
4440                         tcp_rxtseg_clean(tp);
4441                 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
4442                         tp->t_bwmeas != NULL)
4443                         tcp_bwmeas_check(tp);
4444
4445                 /*
4446                  * sowwakeup must happen after snd_una, et al. are updated so that
4447                  * the sequence numbers are in sync with so_snd
4448                  */
4449                 sowwakeup(so);
4450
4451                 switch (tp->t_state) {
4452
4453                 /*
4454                  * In FIN_WAIT_1 STATE in addition to the processing
4455                  * for the ESTABLISHED state if our FIN is now acknowledged
4456                  * then enter FIN_WAIT_2.
4457                  */
4458                 case TCPS_FIN_WAIT_1:
4459                         if (ourfinisacked) {
4460                                 /*
4461                                  * If we can't receive any more
4462                                  * data, then closing user can proceed.
4463                                  * Starting the TCPT_2MSL timer is contrary to the
4464                                  * specification, but if we don't get a FIN
4465                                  * we'll hang forever.
4466                                  */
4467                                 if (so->so_state & SS_CANTRCVMORE) {
4468                                         tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
4469                                                 TCP_CONN_MAXIDLE(tp));
4470                                         isconnected = FALSE;
4471                                         isdisconnected = TRUE;
4472                                 }
4473                                 DTRACE_TCP4(state__change, void, NULL,
4474                                         struct inpcb *, inp,
4475                                         struct tcpcb *, tp,
4476                                         int32_t, TCPS_FIN_WAIT_2);
4477                                 tp->t_state = TCPS_FIN_WAIT_2;
4478                                 /* fall through and make sure we also recognize
4479                                  * data ACKed with the FIN
4480                                  */
4481                         }
4482                         tp->t_flags |= TF_ACKNOW;
4483                         break;
4484
4485                 /*
4486                  * In CLOSING STATE in addition to the processing for
4487                  * the ESTABLISHED state if the ACK acknowledges our FIN
4488                  * then enter the TIME-WAIT state, otherwise ignore
4489                  * the segment.
4490                  */
4491                 case TCPS_CLOSING:
4492                         if (ourfinisacked) {
4493                                 DTRACE_TCP4(state__change, void, NULL,
4494                                         struct inpcb *, inp,
4495                                         struct tcpcb *, tp,
4496                                         int32_t, TCPS_TIME_WAIT);
4497                                 tp->t_state = TCPS_TIME_WAIT;
4498                                 tcp_canceltimers(tp);
4499                                 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4500                                         tp->t_flags |= TF_CLOSING;
4501                                 } else {
4502                                         add_to_time_wait(tp, 2 * tcp_msl);
4503                                 }
4504                                 isconnected = FALSE;
4505                                 isdisconnected = TRUE;
4506                         }
4507                         tp->t_flags |= TF_ACKNOW;
4508                         break;
4509
4510                 /*
4511                  * In LAST_ACK, we may still be waiting for data to drain
4512                  * and/or to be acked, as well as for the ack of our FIN.
4513                  * If our FIN is now acknowledged, delete the TCB,
4514                  * enter the closed state and return.
4515                  */
4516                 case TCPS_LAST_ACK:
4517                         if (ourfinisacked) {
4518                                 tp = tcp_close(tp);
4519                                 goto drop;
4520                         }
4521                         break;
4522
4523                 /*
4524                  * In TIME_WAIT state the only thing that should arrive
4525                  * is a retransmission of the remote FIN.  Acknowledge
4526                  * it and restart the finack timer.
4527                  */
4528                 case TCPS_TIME_WAIT:
4529                         add_to_time_wait(tp, 2 * tcp_msl);
4530                         goto dropafterack;
4531                 }
4532
4533                 /*
4534                  * If there is a SACK option on the ACK and we
4535                  * haven't seen any duplicate acks before, count
4536                  * it as a duplicate ack even if the cumulative
4537                  * ack is advanced. If the receiver delayed an
4538                  * ack and detected loss afterwards, then the ack
4539                  * will advance cumulative ack and will also have
4540                  * a SACK option. So counting it as one duplicate
4541                  * ack is ok.
4542                  */
4543                 if (sack_ackadv == 1 &&
4544                     tp->t_state == TCPS_ESTABLISHED &&
4545                     SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
4546                     to.to_nsacks > 0 && tp->t_dupacks == 0 &&
4547                     SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
4548                     !(tp->t_flagsext & TF_PKTS_REORDERED)) {
4549                         tcpstat.tcps_sack_ackadv++;
4550                         goto process_dupack;
4551                 }
4552         }
4553
4554 step6:
4555         /*
4556          * Update window information.
4557          * Don't look at window if no ACK: TAC's send garbage on first SYN.
4558          */
4559         if ((thflags & TH_ACK) &&
4560             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4561             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4562              (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4563                 /* keep track of pure window updates */
4564                 if (tlen == 0 &&
4565                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4566                         tcpstat.tcps_rcvwinupd++;
4567                 tp->snd_wnd = tiwin;
4568                 tp->snd_wl1 = th->th_seq;
4569                 tp->snd_wl2 = th->th_ack;
4570                 if (tp->snd_wnd > tp->max_sndwnd)
4571                         tp->max_sndwnd = tp->snd_wnd;
4572                 needoutput = 1;
4573         }
4574
4575         /*
4576          * Process segments with URG.
4577          */
4578         if ((thflags & TH_URG) && th->th_urp &&
4579             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4580                 /*
4581                  * This is a kludge, but if we receive and accept
4582                  * random urgent pointers, we'll crash in
4583                  * soreceive.  It's hard to imagine someone
4584                  * actually wanting to send this much urgent data.
4585                  */
4586                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
4587                         th->th_urp = 0;                 /* XXX */
4588                         thflags &= ~TH_URG;             /* XXX */
4589                         goto dodata;                    /* XXX */
4590                 }
4591                 /*
4592                  * If this segment advances the known urgent pointer,
4593                  * then mark the data stream.  This should not happen
4594                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
4595                  * a FIN has been received from the remote side.
4596                  * In these states we ignore the URG.
4597                  *
4598                  * According to RFC961 (Assigned Protocols),
4599                  * the urgent pointer points to the last octet
4600                  * of urgent data.  We continue, however,
4601                  * to consider it to indicate the first octet
4602                  * of data past the urgent section as the original
4603                  * spec states (in one of two places).
4604                  */
4605                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
4606                         tp->rcv_up = th->th_seq + th->th_urp;
4607                         so->so_oobmark = so->so_rcv.sb_cc +
4608                             (tp->rcv_up - tp->rcv_nxt) - 1;
4609                         if (so->so_oobmark == 0) {
4610                                 so->so_state |= SS_RCVATMARK;
4611                                 postevent(so, 0, EV_OOB);
4612                         }
4613                         sohasoutofband(so);
4614                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4615                 }
4616                 /*
4617                  * Remove out of band data so it doesn't get presented to the user.
4618                  * This can happen independent of advancing the URG pointer,
4619                  * but if two URG's are pending at once, some out-of-band
4620                  * data may creep in... ick.
4621                  */
4622                 if (th->th_urp <= (u_int32_t)tlen
4623 #if SO_OOBINLINE
4624                      && (so->so_options & SO_OOBINLINE) == 0
4625 #endif
4626                      )
4627                         tcp_pulloutofband(so, th, m,
4628                                 drop_hdrlen);   /* hdr drop is delayed */
4629         } else {
4630                 /*
4631                  * If no out of band data is expected,
4632                  * pull receive urgent pointer along
4633                  * with the receive window.
4634                  */
4635                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4636                         tp->rcv_up = tp->rcv_nxt;
4637         }
4638 dodata:
4639
4640         /* Set socket's connect or disconnect state correctly before doing data.
4641          * The following might unlock the socket if there is an upcall or a socket
4642          * filter.
4643          */
4644         if (isconnected) {
4645                 soisconnected(so);
4646         } else if (isdisconnected) {
4647                 soisdisconnected(so);
4648         }
4649
4650         /* Let's check the state of pcb just to make sure that it did not get closed
4651          * when we unlocked above
4652          */
4653         if (inp->inp_state == INPCB_STATE_DEAD) {
4654                 /* Just drop the packet that we are processing and return */
4655                 goto drop;
4656         }
4657
4658         /*
4659          * Process the segment text, merging it into the TCP sequencing queue,
4660          * and arranging for acknowledgment of receipt if necessary.
4661          * This process logically involves adjusting tp->rcv_wnd as data
4662          * is presented to the user (this happens in tcp_usrreq.c,
4663          * case PRU_RCVD).  If a FIN has already been received on this
4664          * connection then we just ignore the text.
4665          *
4666          * If we are in SYN-received state and got a valid TFO cookie, we want
4667          * to process the data.
4668          */
4669         if ((tlen || (thflags & TH_FIN)) &&
4670             TCPS_HAVERCVDFIN(tp->t_state) == 0 &&
4671             (TCPS_HAVEESTABLISHED(tp->t_state) ||
4672              (tp->t_state == TCPS_SYN_RECEIVED &&
4673              (tp->t_tfo_flags & TFO_F_COOKIE_VALID)))) {
4674                 tcp_seq save_start = th->th_seq;
4675                 tcp_seq save_end = th->th_seq + tlen;
4676                 m_adj(m, drop_hdrlen);  /* delayed header drop */
4677                 /*
4678                  * Insert segment which includes th into TCP reassembly queue
4679                  * with control block tp.  Set thflags to whether reassembly now
4680                  * includes a segment with FIN.  This handles the common case
4681                  * inline (segment is the next to be received on an established
4682                  * connection, and the queue is empty), avoiding linkage into
4683                  * and removal from the queue and repetition of various
4684                  * conversions.
4685                  * Set DELACK for segments received in order, but ack
4686                  * immediately when segments are out of order (so
4687                  * fast retransmit can work).
4688                  */
4689                 if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
4690                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4691                         /*
4692                          * Calculate the RTT on the receiver only if the
4693                          * connection is in streaming mode and the last
4694                          * packet was not an end-of-write
4695                          */
4696                         if ((tp->t_flags & TF_STRETCHACK) &&
4697                                 !(tp->t_flagsext & TF_STREAMEOW))
4698                                 tcp_compute_rtt(tp, &to, th);
4699
4700                         if (DELAY_ACK(tp, th) &&
4701                                 ((tp->t_flags & TF_ACKNOW) == 0) ) {
4702                                 if ((tp->t_flags & TF_DELACK) == 0) {
4703                                         tp->t_flags |= TF_DELACK;
4704                                         tp->t_timer[TCPT_DELACK] =
4705                                                 OFFSET_FROM_START(tp, tcp_delack);
4706                                 }
4707                         }
4708                         else {
4709                                 tp->t_flags |= TF_ACKNOW;
4710                         }
4711                         tp->rcv_nxt += tlen;
4712                         thflags = th->th_flags & TH_FIN;
4713                         TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
4714                         tcpstat.tcps_rcvbyte += tlen;
4715                         if (nstat_collect) {
4716                                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
4717                                         INP_ADD_STAT(inp, cell, wifi, wired,
4718                                             rxpackets, m->m_pkthdr.lro_npkts);
4719                                 } else {
4720                                         INP_ADD_STAT(inp, cell, wifi, wired,
4721                                             rxpackets, 1);
4722                                 }
4723                                 INP_ADD_STAT(inp, cell, wifi, wired,
4724                                     rxbytes, tlen);
4725                         }
4726                         tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
4727                         so_recv_data_stat(so, m, drop_hdrlen);
4728
4729                         if (sbappendstream_rcvdemux(so, m,
4730                             th->th_seq - (tp->irs + 1), 0)) {
4731                                 sorwakeup(so);
4732                         }
4733                 } else {
4734                         thflags = tcp_reass(tp, th, &tlen, m, ifp);
4735                         tp->t_flags |= TF_ACKNOW;
4736                 }
4737
4738                 if (tlen > 0 && SACK_ENABLED(tp))
4739                         tcp_update_sack_list(tp, save_start, save_end);
4740
4741                 tcp_adaptive_rwtimo_check(tp, tlen);
4742
4743                 if (tlen > 0)
4744                         tcp_tfo_rcv_data(tp);
4745
4746                 if (tp->t_flags & TF_DELACK)
4747                 {
4748 #if INET6
4749                         if (isipv6) {
4750                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4751                                         (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
4752                                         th->th_seq, th->th_ack, th->th_win);
4753                         }
4754                         else
4755 #endif
4756                         {
4757                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4758                                         (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
4759                                         th->th_seq, th->th_ack, th->th_win);
4760                         }
4761
4762                 }
4763         } else {
4764                 m_freem(m);
4765                 thflags &= ~TH_FIN;
4766         }
4767
4768         /*
4769          * If FIN is received ACK the FIN and let the user know
4770          * that the connection is closing.
4771          */
4772         if (thflags & TH_FIN) {
4773                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4774                         socantrcvmore(so);
4775                         postevent(so, 0, EV_FIN);
4776                         /*
4777                          * If connection is half-synchronized
4778                          * (ie NEEDSYN flag on) then delay ACK,
4779                          * so it may be piggybacked when SYN is sent.
4780                          * Otherwise, since we received a FIN then no
4781                          * more input can be expected, send ACK now.
4782                          */
4783                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4784                         if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
4785                                 if ((tp->t_flags & TF_DELACK) == 0) {
4786                                         tp->t_flags |= TF_DELACK;
4787                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
4788                                 }
4789                         } else {
4790                                 tp->t_flags |= TF_ACKNOW;
4791                         }
4792                         tp->rcv_nxt++;
4793                 }
4794                 switch (tp->t_state) {
4795
4796                 /*
4797                  * In SYN_RECEIVED and ESTABLISHED STATES
4798                  * enter the CLOSE_WAIT state.
4799                  */
4800                 case TCPS_SYN_RECEIVED:
4801                         tp->t_starttime = tcp_now;
4802                 case TCPS_ESTABLISHED:
4803                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4804                                 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
4805                         tp->t_state = TCPS_CLOSE_WAIT;
4806                         break;
4807
4808                 /*
4809                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
4810                  * enter the CLOSING state.
4811                  */
4812                 case TCPS_FIN_WAIT_1:
4813                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4814                                 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
4815                         tp->t_state = TCPS_CLOSING;
4816                         break;
4817
4818                 /*
4819                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
4820                  * starting the time-wait timer, turning off the other
4821                  * standard timers.
4822                  */
4823                 case TCPS_FIN_WAIT_2:
4824                         DTRACE_TCP4(state__change, void, NULL,
4825                                 struct inpcb *, inp,
4826                                 struct tcpcb *, tp,
4827                                 int32_t, TCPS_TIME_WAIT);
4828                         tp->t_state = TCPS_TIME_WAIT;
4829                         tcp_canceltimers(tp);
4830                         tp->t_flags |= TF_ACKNOW;
4831                         if (tp->t_flagsext & TF_NOTIMEWAIT) {
4832                                 tp->t_flags |= TF_CLOSING;
4833                         } else {
4834                                 add_to_time_wait(tp, 2 * tcp_msl);
4835                         }
4836                         soisdisconnected(so);
4837                         break;
4838
4839                 /*
4840                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
4841                  */
4842                 case TCPS_TIME_WAIT:
4843                         add_to_time_wait(tp, 2 * tcp_msl);
4844                         break;
4845                 }
4846         }
4847 #if TCPDEBUG
4848         if (so->so_options & SO_DEBUG)
4849                 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
4850                           &tcp_savetcp, 0);
4851 #endif
4852
4853         /*
4854          * Return any desired output.
4855          */
4856         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
4857                 (void) tcp_output(tp);
4858         }
4859
4860         tcp_check_timer_state(tp);
4861
4862
4863         tcp_unlock(so, 1, 0);
4864         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4865         return;
4866
4867 dropafterack:
4868         /*
4869          * Generate an ACK dropping incoming segment if it occupies
4870          * sequence space, where the ACK reflects our state.
4871          *
4872          * We can now skip the test for the RST flag since all
4873          * paths to this code happen after packets containing
4874          * RST have been dropped.
4875          *
4876          * In the SYN-RECEIVED state, don't send an ACK unless the
4877          * segment we received passes the SYN-RECEIVED ACK test.
4878          * If it fails send a RST.  This breaks the loop in the
4879          * "LAND" DoS attack, and also prevents an ACK storm
4880          * between two listening ports that have been sent forged
4881          * SYN segments, each with the source address of the other.
4882          */
4883         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
4884             (SEQ_GT(tp->snd_una, th->th_ack) ||
4885              SEQ_GT(th->th_ack, tp->snd_max)) ) {
4886                 rstreason = BANDLIM_RST_OPENPORT;
4887                 IF_TCP_STATINC(ifp, dospacket);
4888                 goto dropwithreset;
4889         }
4890 #if TCPDEBUG
4891         if (so->so_options & SO_DEBUG)
4892                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4893                           &tcp_savetcp, 0);
4894 #endif
4895         m_freem(m);
4896         tp->t_flags |= TF_ACKNOW;
4897         (void) tcp_output(tp);
4898
4899         /* Don't need to check timer state as we should have done it during tcp_output */
4900         tcp_unlock(so, 1, 0);
4901         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4902         return;
4903 dropwithresetnosock:
4904         nosock = 1;
4905 dropwithreset:
4906         /*
4907          * Generate a RST, dropping incoming segment.
4908          * Make ACK acceptable to originator of segment.
4909          * Don't bother to respond if destination was broadcast/multicast.
4910          */
4911         if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
4912                 goto drop;
4913 #if INET6
4914         if (isipv6) {
4915                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
4916                     IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
4917                         goto drop;
4918         } else
4919 #endif /* INET6 */
4920         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
4921             IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
4922             ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
4923             in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
4924                 goto drop;
4925         /* IPv6 anycast check is done at tcp6_input() */
4926
4927         /*
4928          * Perform bandwidth limiting.
4929          */
4930 #if ICMP_BANDLIM
4931         if (badport_bandlim(rstreason) < 0)
4932                 goto drop;
4933 #endif
4934
4935 #if TCPDEBUG
4936         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4937                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4938                           &tcp_savetcp, 0);
4939 #endif
4940         bzero(&tra, sizeof(tra));
4941         tra.ifscope = ifscope;
4942         tra.awdl_unrestricted = 1;
4943         if (thflags & TH_ACK)
4944                 /* mtod() below is safe as long as hdr dropping is delayed */
4945                 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
4946                     TH_RST, &tra);
4947         else {
4948                 if (thflags & TH_SYN)
4949                         tlen++;
4950                 /* mtod() below is safe as long as hdr dropping is delayed */
4951                 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
4952                     (tcp_seq)0, TH_RST|TH_ACK, &tra);
4953         }
4954         /* destroy temporarily created socket */
4955         if (dropsocket) {
4956                 (void) soabort(so);
4957                 tcp_unlock(so, 1, 0);
4958         } else if ((inp != NULL) && (nosock == 0)) {
4959                 tcp_unlock(so, 1, 0);
4960         }
4961         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4962         return;
4963 dropnosock:
4964         nosock = 1;
4965 drop:
4966         /*
4967          * Drop space held by incoming segment and return.
4968          */
4969 #if TCPDEBUG
4970         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4971                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4972                           &tcp_savetcp, 0);
4973 #endif
4974         m_freem(m);
4975         /* destroy temporarily created socket */
4976         if (dropsocket) {
4977                 (void) soabort(so);
4978                 tcp_unlock(so, 1, 0);
4979         }
4980         else if (nosock == 0) {
4981                 tcp_unlock(so, 1, 0);
4982         }
4983         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4984         return;
4985 }
4986
4987 /*
4988  * Parse TCP options and place in tcpopt.
      *
      * tp  - control block; its t_state gates the Fast Open options and its
      *       t_flags may gain TF_REQ_TSTMP when a timestamp option is seen.
      * cp  - first byte of the option area.
      * cnt - number of option bytes available at cp.
      * th  - TCP header of the segment; only th_flags is read here.
      * to  - output; to_flags and the per-option fields are filled in.
      *
      * Parsing stops at TCPOPT_EOL, or at the first option whose length
      * byte is missing, smaller than 2, or larger than the bytes left.
      * For the option kinds handled in this function, an option that is
      * well framed but fails its own checks is skipped and parsing
      * continues with the next option.
4989  */
4990 static void
4991 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
4992     struct tcpopt *to)
4993 {
4994         u_short mss = 0;
4995         int opt, optlen;
4996
             /* One option per pass; the step skips the optlen bytes just examined. */
4997         for (; cnt > 0; cnt -= optlen, cp += optlen) {
4998                 opt = cp[0];
4999                 if (opt == TCPOPT_EOL)
5000                         break;
5001                 if (opt == TCPOPT_NOP)
5002                         optlen = 1;
5003                 else {
                             /*
                              * Any other kind carries a length byte at cp[1].
                              * Give up on the whole option area if that byte
                              * is absent, is below the two-byte kind/length
                              * header, or exceeds the bytes that remain.
                              */
5004                         if (cnt < 2)
5005                                 break;
5006                         optlen = cp[1];
5007                         if (optlen < 2 || optlen > cnt)
5008                                 break;
5009                 }
                     /*
                      * Inside the switch, "continue" abandons the current
                      * option and resumes the for loop with the next one.
                      */
5010                 switch (opt) {
5011
                     /* Option kinds not listed below are ignored. */
5012                 default:
5013                         continue;
5014
                     /* Maximum segment size: accepted only on SYN segments. */
5015                 case TCPOPT_MAXSEG:
5016                         if (optlen != TCPOLEN_MAXSEG)
5017                                 continue;
5018                         if (!(th->th_flags & TH_SYN))
5019                                 continue;
                             /* Copy the value out bytewise, then convert with NTOHS. */
5020                         bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
5021                         NTOHS(mss);
5022                         to->to_mss = mss;
5023                         to->to_flags |= TOF_MSS;
5024                         break;
5025
                     /* Window scale: accepted only on SYN segments. */
5026                 case TCPOPT_WINDOW:
5027                         if (optlen != TCPOLEN_WINDOW)
5028                                 continue;
5029                         if (!(th->th_flags & TH_SYN))
5030                                 continue;
5031                         to->to_flags |= TOF_SCALE;
                             /* The requested shift is capped at TCP_MAX_WINSHIFT. */
5032                         to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
5033                         break;
5034
                     /*
                      * Timestamps: the first value is read from offset 2 of
                      * the option into to_tsval and the second from offset 6
                      * into to_tsecr; each is converted with NTOHL.
                      */
5035                 case TCPOPT_TIMESTAMP:
5036                         if (optlen != TCPOLEN_TIMESTAMP)
5037                                 continue;
5038                         to->to_flags |= TOF_TS;
5039                         bcopy((char *)cp + 2,
5040                             (char *)&to->to_tsval, sizeof(to->to_tsval));
5041                         NTOHL(to->to_tsval);
5042                         bcopy((char *)cp + 6,
5043                             (char *)&to->to_tsecr, sizeof(to->to_tsecr));
5044                         NTOHL(to->to_tsecr);
5045                         /* Re-enable sending Timestamps if we received them */
5046                         if (!(tp->t_flags & TF_REQ_TSTMP) &&
5047                             tcp_do_rfc1323 == 1)
5048                                 tp->t_flags |= TF_REQ_TSTMP;
5049                         break;
                     /*
                      * SACK permitted: ignored when tcp_do_sack is off or the
                      * length is wrong; TOF_SACK is recorded only on a SYN.
                      */
5050                 case TCPOPT_SACK_PERMITTED:
5051                         if (!tcp_do_sack ||
5052                             optlen != TCPOLEN_SACK_PERMITTED)
5053                                 continue;
5054                         if (th->th_flags & TH_SYN)
5055                                 to->to_flags |= TOF_SACK;
5056                         break;
                     /*
                      * SACK blocks: the payload after the two header bytes
                      * must be a non-empty multiple of TCPOLEN_SACK.
                      */
5057                 case TCPOPT_SACK:
5058                         if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
5059                                 continue;
5060                         to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
                             /* to_sacks points into the option bytes; nothing is copied. */
5061                         to->to_sacks = cp + 2;
                             /* The counter is bumped once per SACK option seen here. */
5062                         tcpstat.tcps_sack_rcv_blocks++;
5063
5064                         break;
                     /*
                      * Fast Open: an option of exactly TCPOLEN_FASTOPEN_REQ
                      * bytes is recorded as TOF_TFOREQ; a longer one is
                      * recorded as TOF_TFO when the extra length lies within
                      * TFO_COOKIE_LEN_MIN..TFO_COOKIE_LEN_MAX.
                      */
5065                 case TCPOPT_FASTOPEN:
5066                         if (optlen == TCPOLEN_FASTOPEN_REQ) {
                                     /* Accepted only while tp is in TCPS_LISTEN. */
5067                                 if (tp->t_state != TCPS_LISTEN)
5068                                         continue;
5069
5070                                 to->to_flags |= TOF_TFOREQ;
5071                         } else {
5072                                 if (optlen < TCPOLEN_FASTOPEN_REQ ||
5073                                     (optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX ||
5074                                     (optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN)
5075                                         continue;
                                     /* Accepted only in TCPS_LISTEN or TCPS_SYN_SENT. */
5076                                 if (tp->t_state != TCPS_LISTEN &&
5077                                     tp->t_state != TCPS_SYN_SENT)
5078                                         continue;
5079
5080                                 to->to_flags |= TOF_TFO;
                                     /*
                                      * to_tfo is set to the address of the option's
                                      * length byte (cp + 1), not to cp + 2.
                                      * NOTE(review): presumably consumers read the
                                      * length through this pointer -- confirm.
                                      */
5081                                 to->to_tfo = cp + 1;
5082                         }
5083
5084                         break;
5085 #if MPTCP
                     /* Multipath TCP options are handed to tcp_do_mptcp_options(). */
5086                 case TCPOPT_MULTIPATH:
5087                         tcp_do_mptcp_options(tp, cp, th, to, optlen);
5088                         break;
5089 #endif /* MPTCP */
5090                 }
5091         }
5092 }
5093
     /*
      * Apply the options recorded in *to to the connection state in tp.
      *
      * tp      - control block to update.
      * to      - parsed options; to_flags says which fields are valid.
      * ifscope - passed through unchanged to tcp_mss().
      */
5094 static void
5095 tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope)
5096 {
             /* Timestamp seen: remember the value and the time it was recorded. */
5097         if (to->to_flags & TOF_TS) {
5098                 tp->t_flags |= TF_RCVD_TSTMP;
5099                 tp->ts_recent = to->to_tsval;
5100                 tp->ts_recent_age = tcp_now;
5101
5102         }
             /* MSS option seen: let tcp_mss() act on the offered value. */
5103         if (to->to_flags & TOF_MSS)
5104                 tcp_mss(tp, to->to_mss, ifscope);
             /*
              * If SACK is enabled on tp but TOF_SACK was not recorded, clear
              * TF_SACK_ENABLE; otherwise mark the connection TF_SACK_PERMIT.
              */
5105         if (SACK_ENABLED(tp)) {
5106                 if (!(to->to_flags & TOF_SACK))
5107                         tp->t_flagsext &= ~(TF_SACK_ENABLE);
5108                 else
5109                         tp->t_flags |= TF_SACK_PERMIT;
5110         }
             /* Window scale seen: record the requested send-side shift. */
5111         if (to->to_flags & TOF_SCALE) {
5112                 tp->t_flags |= TF_RCVD_SCALE;
5113                 tp->requested_s_scale = to->to_requested_s_scale;
5114
5115                 /* Re-enable window scaling, if the option is received */
5116                 if (tp->request_r_scale > 0)
5117                         tp->t_flags |= TF_REQ_SCALE;
5118         }
5119 }
5120
5121 /*
5122  * Pull out of band byte out of a segment so
5123  * it doesn't appear in the user's data queue.
5124  * It is still reflected in the segment length for
5125  * sequencing purposes.
      *
      * so  - socket; used to reach the tcpcb that receives the byte.
      * th  - TCP header; th_urp locates the byte within the segment data.
      * m   - mbuf chain holding the segment, headers still in place.
      * off - number of header bytes at the front of m that have not been
      *       dropped yet.
      *
      * The byte is saved in tp->t_iobc and TCPOOB_HAVEDATA is set.  If the
      * chain ends before the computed offset is reached, the function
      * panics.
5126  */
5127 static void
5128 tcp_pulloutofband(so, th, m, off)
5129         struct socket *so;
5130         struct tcphdr *th;
5131         register struct mbuf *m;
5132         int off;                /* header length whose drop is delayed */
5133 {
             /* Offset, from the start of the chain, of the byte to remove. */
5134         int cnt = off + th->th_urp - 1;
5135
5136         while (cnt >= 0) {
                     /* The wanted byte lies inside this mbuf. */
5137                 if (m->m_len > cnt) {
5138                         char *cp = mtod(m, caddr_t) + cnt;
5139                         struct tcpcb *tp = sototcpcb(so);
5140
5141                         tp->t_iobc = *cp;
5142                         tp->t_oobflags |= TCPOOB_HAVEDATA;
                             /* Close the gap: slide the rest of this mbuf down by one. */
5143                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
5144                         m->m_len--;
                             /*
                              * The packet-header length is adjusted only when
                              * the mbuf holding the byte itself has M_PKTHDR.
                              * NOTE(review): if the byte sits in a later mbuf
                              * of the chain the head's m_pkthdr.len is left
                              * unchanged -- confirm this is acceptable.
                              */
5145                         if (m->m_flags & M_PKTHDR)
5146                                 m->m_pkthdr.len--;
5147                         return;
5148                 }
                     /* Not in this mbuf: skip its bytes and move to the next one. */
5149                 cnt -= m->m_len;
5150                 m = m->m_next;
5151                 if (m == 0)
5152                         break;
5153         }
5154         panic("tcp_pulloutofband");
5155 }
5156
     /*
      * Return the smallest non-zero entry of tp->rtt_hist[], scanning
      * indices 0 .. N_RTT_BASE-1.  Zero entries are skipped; if every
      * entry is zero the result is 0.
      */
5157 uint32_t
5158 get_base_rtt(struct tcpcb *tp)
5159 {
5160         uint32_t base_rtt = 0, i;
5161         for (i = 0; i < N_RTT_BASE; ++i) {
                     /* base_rtt == 0 means no candidate has been taken yet. */
5162                 if (tp->rtt_hist[i] != 0 &&
5163                         (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
5164                         base_rtt = tp->rtt_hist[i];
5165         }
5166         return base_rtt;
5167 }
5168
5169 /* Each value of RTT base represents the minimum RTT seen in a minute.
5170  * We keep up to N_RTT_BASE minutes worth of history.
      *
      * tp  - control block whose rtt_hist[] and rtt_count are updated.
      * rtt - the new RTT sample.
      *
      * Slot 0 is the current slot.  It is closed once rtt_count reaches
      * rtt_samples_per_slot: the history is shifted up by one entry, the
      * oldest entry is discarded, and the new sample starts a fresh slot 0.
      * Until then the sample is folded into slot 0 with min().
5171  */
5172 void
5173 update_base_rtt(struct tcpcb *tp, uint32_t rtt)
5174 {
5175         int32_t i, qdelay;
5176         u_int32_t base_rtt;
5177
5178         if (++tp->rtt_count >= rtt_samples_per_slot) {
5179 #if TRAFFIC_MGT
5180                 /*
5181                  * If the recv side is being throttled, check if the
5182                  * current RTT is closer to the base RTT seen in
5183                  * first (recent) two slots. If so, unthrottle the stream.
5184                  */
5185                 if (tp->t_flagsext & TF_RECV_THROTTLE) {
5186                         base_rtt = min(tp->rtt_hist[0], tp->rtt_hist[1]);
                             /* Difference between the current RTT and that base. */
5187                         qdelay = tp->t_rttcur - base_rtt;
5188                         if (qdelay < target_qdelay)
5189                                 tp->t_flagsext &= ~(TF_RECV_THROTTLE);
5190                 }
5191 #endif /* TRAFFIC_MGT */
5192
                     /* Age the history by one slot, dropping the oldest entry. */
5193                 for (i = (N_RTT_BASE-1); i > 0; --i) {
5194                         tp->rtt_hist[i] = tp->rtt_hist[i-1];
5195                 }
5196                 tp->rtt_hist[0] = rtt;
5197                 tp->rtt_count = 0;
5198         } else {
                     /*
                      * NOTE(review): if rtt_hist[0] is still 0 here, min()
                      * presumably keeps it at 0 until the slot rolls over;
                      * get_base_rtt() skips zero entries -- confirm that
                      * this is the intended behaviour.
                      */
5199                 tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
5200         }
5201 }
5202
5203 /*
5204  * If we have a timestamp reply, update smoothed RTT. If no timestamp is
5205  * present but transmit timer is running and timed sequence number was
5206  * acked, update smoothed RTT.
5207  *
5208  * If timestamps are supported, a receiver can update RTT even if
5209  * there is no outstanding data.
5210  *
5211  * Some boxes send broken timestamp replies during the SYN+ACK phase,
5212  * ignore timestamps of 0 or we could calculate a huge RTT and blow up
5213  * the retransmit timer.
      *
      * tp - control block being measured.
      * to - parsed options of the segment; must not be NULL.
      * th - TCP header of the segment; must not be NULL.
      *
      * When both a usable timestamp echo and a timer-based sample are
      * available, the timestamp-based sample is the one passed to
      * tcp_xmit_timer().
5214  */
5215 static void
5216 tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
5217 {
5218         int rtt = 0;
5219         VERIFY(to != NULL && th != NULL);
             /* A timed segment is outstanding and this ACK is past t_rtseq. */
5220         if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
5221                 u_int32_t pipe_ack_val;
5222                 rtt = tcp_now - tp->t_rtttime;
5223                 /*
5224                  * Compute pipe ack -- the amount of data acknowledged
5225                  * in the last RTT
5226                  */
5227                 if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) {
5228                         pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna;
5229                         /* Update the sample */
5230                         tp->t_pipeack_sample[tp->t_pipeack_ind++] =
5231                             pipe_ack_val;
                             /* Wrap the sample index around the fixed-size array. */
5232                         tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT;
5233
5234                         /* Compute the max of the pipeack samples */
5235                         pipe_ack_val = tcp_get_max_pipeack(tp);
                             /* Values not above TCP_CC_CWND_INIT_BYTES are stored as 0. */
5236                         tp->t_pipeack = (pipe_ack_val >
5237                                     TCP_CC_CWND_INIT_BYTES) ?
5238                                     pipe_ack_val : 0;
5239                 }
5240                 /* start another measurement */
5241                 tp->t_rtttime = 0;
5242         }
             /*
              * Use the timestamp echo when it is present, non-zero and
              * passes the TSTMP_GEQ(tcp_now, tsecr) check; otherwise fall
              * back to the timer sample, if one was taken above.
              */
5243         if (((to->to_flags & TOF_TS) != 0) &&
5244                 (to->to_tsecr != 0) &&
5245                 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
5246                 tcp_xmit_timer(tp, (tcp_now - to->to_tsecr),
5247                         to->to_tsecr, th->th_ack);
5248         } else if (rtt > 0) {
5249                 tcp_xmit_timer(tp, rtt, 0, th->th_ack);
5250         }
5251 }
5252
5253 /*
5254  * Collect new round-trip time estimate
5255  * and update averages and current timeout.
5256  */
5257 static void
5258 tcp_xmit_timer(register struct tcpcb *tp, int rtt,
5259         u_int32_t tsecr, tcp_seq th_ack)
5260 {
5261         register int delta;
5262
5263         if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
5264                 if (SEQ_GT(th_ack, tp->snd_una) &&
5265                     SEQ_LEQ(th_ack, tp->snd_max) &&
5266                     (tsecr == 0 ||
5267                     TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
5268                         /*
5269                          * We received a new ACK after a
5270                          * spurious timeout. Adapt retransmission
5271                          * timer as described in rfc 4015.
5272                          */
5273                         tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
5274                         tp->t_badrexmt_time = 0;
5275                         tp->t_srtt = max(tp->t_srtt_prev, rtt);
5276                         tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
5277                         tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
5278                         tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
5279
5280                         if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5281                                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5282
5283                         goto compute_rto;
5284                 } else {
5285                         return;
5286                 }
5287         }
5288
5289         tcpstat.tcps_rttupdated++;
5290         tp->t_rttupdated++;
5291
5292         if (rtt > 0) {
5293                 tp->t_rttcur = rtt;
5294                 update_base_rtt(tp, rtt);
5295         }
5296
5297         if (tp->t_srtt != 0) {
5298                 /*
5299                  * srtt is stored as fixed point with 5 bits after the
5300                  * binary point (i.e., scaled by 32).  The following magic
5301                  * is equivalent to the smoothing algorithm in rfc793 with
5302                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
5303                  * point).
5304                  *
5305                  * Freebsd adjusts rtt to origin 0 by subtracting 1
5306                  * from the provided rtt value. This was required because
5307                  * of the way t_rtttime was initialised to 1 before.
5308                  * Since we changed t_rtttime to be based on
5309                  * tcp_now, this extra adjustment is not needed.
5310                  */
5311                 delta = (rtt << TCP_DELTA_SHIFT)
5312                         - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
5313
5314                 if ((tp->t_srtt += delta) <= 0)
5315                         tp->t_srtt = 1;
5316
5317                 /*
5318                  * We accumulate a smoothed rtt variance (actually, a
5319                  * smoothed mean difference), then set the retransmit
5320                  * timer to smoothed rtt + 4 times the smoothed variance.
5321                  * rttvar is stored as fixed point with 4 bits after the
5322                  * binary point (scaled by 16).  The following is
5323                  * equivalent to rfc793 smoothing with an alpha of .75
5324                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
5325                  * rfc793's wired-in beta.
5326                  */
5327                 if (delta < 0)
5328                         delta = -delta;
5329                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
5330                 if ((tp->t_rttvar += delta) <= 0)
5331                         tp->t_rttvar = 1;
5332                 if (tp->t_rttbest == 0  ||
5333                         tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5334                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5335         } else {
5336                 /*
5337                  * No rtt measurement yet - use the unsmoothed rtt.
5338                  * Set the variance to half the rtt (so our first
5339                  * retransmit happens at 3*rtt).
5340                  */
5341                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
5342                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
5343         }
5344
5345 compute_rto:
5346         nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
5347                 tp->t_rttvar);
5348         tp->t_rxtshift = 0;
5349         tp->t_rxtstart = 0;
5350
5351         /*
5352          * the retransmit should happen at rtt + 4 * rttvar.
5353          * Because of the way we do the smoothing, srtt and rttvar
5354          * will each average +1/2 tick of bias.  When we compute
5355          * the retransmit timer, we want 1/2 tick of rounding and
5356          * 1 extra tick because of +-1/2 tick uncertainty in the
5357          * firing of the timer.  The bias will give us exactly the
5358          * 1.5 tick we need.  But, because the bias is
5359          * statistical, we have to test that we don't drop below
5360          * the minimum feasible timer (which is 2 ticks).
5361          */
5362         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
5363                 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
5364                 TCP_ADD_REXMTSLOP(tp));
5365
5366         /*
5367          * We received an ack for a packet that wasn't retransmitted;
5368          * it is probably safe to discard any error indications we've
5369          * received recently.  This isn't quite right, but close enough
5370          * for now (a route might have failed after we sent a segment,
5371          * and the return path might not be symmetrical).
5372          */
5373         tp->t_softerror = 0;
5374 }
5375
5376 static inline unsigned int
5377 tcp_maxmtu(struct rtentry *rt)
5378 {
5379         unsigned int maxmtu;
5380
5381         RT_LOCK_ASSERT_HELD(rt);
5382         if (rt->rt_rmx.rmx_mtu == 0)
5383                 maxmtu = rt->rt_ifp->if_mtu;
5384         else
5385                 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
5386
5387         return (maxmtu);
5388 }
5389
#if INET6
/*
 * IPv6 counterpart of tcp_maxmtu(): return IN6_LINKMTU() of the route's
 * interface, further limited by the route's MTU metric when that metric
 * is set (non-zero).  Asserts that the route lock is held.
 *
 * When the interface has initialized ND info, its lock is taken around
 * the link-MTU evaluation.
 */
static inline unsigned int
tcp_maxmtu6(struct rtentry *rt)
{
	unsigned int path_mtu;
	struct nd_ifinfo *nd_info;

	RT_LOCK_ASSERT_HELD(rt);

	/* Only use the ND info (and its lock) once it has been initialized. */
	nd_info = ND_IFINFO(rt->rt_ifp);
	if (nd_info != NULL && !nd_info->initialized) {
		nd_info = NULL;
	}

	if (nd_info != NULL) {
		lck_mtx_lock(&nd_info->lock);
	}

	if (rt->rt_rmx.rmx_mtu != 0) {
		path_mtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
	} else {
		path_mtu = IN6_LINKMTU(rt->rt_ifp);
	}

	if (nd_info != NULL) {
		lck_mtx_unlock(&nd_info->lock);
	}

	return (path_mtu);
}
#endif
5412
5413 /*
5414  * Determine a reasonable value for maxseg size.
5415  * If the route is known, check route for mtu.
5416  * If none, use an mss that can be handled on the outgoing
5417  * interface without forcing IP to fragment; if bigger than
5418  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
5419  * to utilize large mbufs.  If no route is found, route has no mtu,
5420  * or the destination isn't local, use a default, hopefully conservative
5421  * size (usually 512 or the default IP max size, but no more than the mtu
5422  * of the interface), as we can't discover anything about intervening
5423  * gateways or networks.  We also initialize the congestion/slow start
5424  * window. While looking at the routing entry, we also initialize
5425  * other path-dependent parameters from pre-set or cached values
5426  * in the routing entry.
5427  *
5428  * Also take into account the space needed for options that we
5429  * send regularly.  Make maxseg shorter by that amount to assure
5430  * that we can send maxseg amount of data even when the options
5431  * are present.  Store the upper limit of the length of options plus
5432  * data in maxopd.
5433  *
5434  * NOTE that this routine is only called when we process an incoming
5435  * segment, for outgoing segments only tcp_mssopt is called.
5436  *
5437  */
5438 void
5439 tcp_mss(tp, offer, input_ifscope)
5440         struct tcpcb *tp;
5441         int offer;
5442         unsigned int input_ifscope;
5443 {
5444         register struct rtentry *rt;
5445         struct ifnet *ifp;
5446         register int rtt, mss;
5447         u_int32_t bufsize;
5448         struct inpcb *inp;
5449         struct socket *so;
5450         struct rmxp_tao *taop;
5451         int origoffer = offer;
5452         u_int32_t sb_max_corrected;
5453         int isnetlocal = 0;
5454 #if INET6
5455         int isipv6;
5456         int min_protoh;
5457 #endif
5458
5459         inp = tp->t_inpcb;
5460 #if INET6
5461         isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5462         min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5463                             : sizeof (struct tcpiphdr);
5464 #else
5465 #define min_protoh  (sizeof (struct tcpiphdr))
5466 #endif
5467
5468 #if INET6
5469         if (isipv6) {
5470                 rt = tcp_rtlookup6(inp, input_ifscope);
5471         }
5472         else
5473 #endif /* INET6 */
5474         {
5475                 rt = tcp_rtlookup(inp, input_ifscope);
5476         }
5477         isnetlocal = (tp->t_flags & TF_LOCAL);
5478
5479         if (rt == NULL) {
5480                 tp->t_maxopd = tp->t_maxseg =
5481 #if INET6
5482                 isipv6 ? tcp_v6mssdflt :
5483 #endif /* INET6 */
5484                 tcp_mssdflt;
5485                 return;
5486         }
5487         ifp = rt->rt_ifp;
5488         /*
5489          * Slower link window correction:
5490          * If a value is specificied for slowlink_wsize use it for
5491          * PPP links believed to be on a serial modem (speed <128Kbps).
5492          * Excludes 9600bps as it is the default value adversized
5493          * by pseudo-devices over ppp.
5494          */
5495         if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5496             ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
5497                 tp->t_flags |= TF_SLOWLINK;
5498         }
5499         so = inp->inp_socket;
5500
5501         taop = rmx_taop(rt->rt_rmx);
5502         /*
5503          * Offer == -1 means that we didn't receive SYN yet,
5504          * use cached value in that case;
5505          */
5506         if (offer == -1)
5507                 offer = taop->tao_mssopt;
5508         /*
5509          * Offer == 0 means that there was no MSS on the SYN segment,
5510          * in this case we use tcp_mssdflt.
5511          */
5512         if (offer == 0)
5513                 offer =
5514 #if INET6
5515                         isipv6 ? tcp_v6mssdflt :
5516 #endif /* INET6 */
5517                         tcp_mssdflt;
5518         else {
5519                 /*
5520                  * Prevent DoS attack with too small MSS. Round up
5521                  * to at least minmss.
5522                  */
5523                 offer = max(offer, tcp_minmss);
5524                 /*
5525                  * Sanity check: make sure that maxopd will be large
5526                  * enough to allow some data on segments even is the
5527                  * all the option space is used (40bytes).  Otherwise
5528                  * funny things may happen in tcp_output.
5529                  */
5530                 offer = max(offer, 64);
5531         }
5532         taop->tao_mssopt = offer;
5533
5534         /*
5535          * While we're here, check if there's an initial rtt
5536          * or rttvar.  Convert from the route-table units
5537          * to scaled multiples of the slow timeout timer.
5538          */
5539         if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
5540                 tcp_getrt_rtt(tp, rt);
5541         } else {
5542                 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
5543         }
5544
5545 #if INET6
5546         mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5547 #else
5548         mss = tcp_maxmtu(rt);
5549 #endif
5550
5551 #if NECP
5552         // At this point, the mss is just the MTU. Adjust if necessary.
5553         mss = necp_socket_get_effective_mtu(inp, mss);
5554 #endif /* NECP */
5555
5556         mss -= min_protoh;
5557
5558         if (rt->rt_rmx.rmx_mtu == 0) {
5559 #if INET6
5560                 if (isipv6) {
5561                         if (!isnetlocal)
5562                                 mss = min(mss, tcp_v6mssdflt);
5563                 } else
5564 #endif /* INET6 */
5565                 if (!isnetlocal)
5566                         mss = min(mss, tcp_mssdflt);
5567         }
5568
5569         mss = min(mss, offer);
5570         /*
5571          * maxopd stores the maximum length of data AND options
5572          * in a segment; maxseg is the amount of data in a normal
5573          * segment.  We need to store this value (maxopd) apart
5574          * from maxseg, because now every segment carries options
5575          * and thus we normally have somewhat less data in segments.
5576          */
5577         tp->t_maxopd = mss;
5578
5579         /*
5580          * origoffer==-1 indicates, that no segments were received yet.
5581          * In this case we just guess.
5582          */
5583         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
5584             (origoffer == -1 ||
5585              (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
5586                 mss -= TCPOLEN_TSTAMP_APPA;
5587
5588 #if MPTCP
5589         mss -= mptcp_adj_mss(tp, FALSE);
5590 #endif /* MPTCP */
5591         tp->t_maxseg = mss;
5592
5593         /*
5594          * Calculate corrected value for sb_max; ensure to upgrade the
5595          * numerator for large sb_max values else it will overflow.
5596          */
5597         sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
5598
5599         /*
5600          * If there's a pipesize (ie loopback), change the socket
5601          * buffer to that size only if it's bigger than the current
5602          * sockbuf size.  Make the socket buffers an integral
5603          * number of mss units; if the mss is larger than
5604          * the socket buffer, decrease the mss.
5605          */
5606 #if RTV_SPIPE
5607         bufsize = rt->rt_rmx.rmx_sendpipe;
5608         if (bufsize < so->so_snd.sb_hiwat)
5609 #endif
5610                 bufsize = so->so_snd.sb_hiwat;
5611         if (bufsize < mss)
5612                 mss = bufsize;
5613         else {
5614                 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5615                 if (bufsize > sb_max_corrected)
5616                         bufsize = sb_max_corrected;
5617                 (void)sbreserve(&so->so_snd, bufsize);
5618         }
5619         tp->t_maxseg = mss;
5620
5621 #if RTV_RPIPE
5622         bufsize = rt->rt_rmx.rmx_recvpipe;
5623         if (bufsize < so->so_rcv.sb_hiwat)
5624 #endif
5625                 bufsize = so->so_rcv.sb_hiwat;
5626         if (bufsize > mss) {
5627                 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5628                 if (bufsize > sb_max_corrected)
5629                         bufsize = sb_max_corrected;
5630                 (void)sbreserve(&so->so_rcv, bufsize);
5631         }
5632
5633         set_tcp_stream_priority(so);
5634
5635         if (rt->rt_rmx.rmx_ssthresh) {
5636                 /*
5637                  * There's some sort of gateway or interface
5638                  * buffer limit on the path.  Use this to set
5639                  * slow-start threshold, but set the threshold to
5640                  * no less than 2*mss.
5641                  */
5642                 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
5643                 tcpstat.tcps_usedssthresh++;
5644         } else {
5645                 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
5646         }
5647
5648         /*
5649          * Set the slow-start flight size depending on whether this
5650          * is a local network or not.
5651          */
5652         if (CC_ALGO(tp)->cwnd_init != NULL)
5653                 CC_ALGO(tp)->cwnd_init(tp);
5654
5655         tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);
5656
5657         /* Route locked during lookup above */
5658         RT_UNLOCK(rt);
5659 }
5660
5661 /*
5662  * Determine the MSS option to send on an outgoing SYN.
5663  */
5664 int
5665 tcp_mssopt(tp)
5666         struct tcpcb *tp;
5667 {
5668         struct rtentry *rt;
5669         int mss;
5670 #if INET6
5671         int isipv6;
5672         int min_protoh;
5673 #endif
5674
5675 #if INET6
5676         isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5677         min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5678                             : sizeof (struct tcpiphdr);
5679 #else
5680 #define min_protoh  (sizeof (struct tcpiphdr))
5681 #endif
5682
5683 #if INET6
5684         if (isipv6)
5685                 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
5686         else
5687 #endif /* INET6 */
5688         rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
5689         if (rt == NULL) {
5690                 return (
5691 #if INET6
5692                         isipv6 ? tcp_v6mssdflt :
5693 #endif /* INET6 */
5694                         tcp_mssdflt);
5695         }
5696         /*
5697          * Slower link window correction:
5698          * If a value is specificied for slowlink_wsize use it for PPP links
5699          * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
5700          * it is the default value adversized by pseudo-devices over ppp.
5701          */
5702         if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5703             rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
5704                 tp->t_flags |= TF_SLOWLINK;
5705         }
5706
5707 #if INET6
5708         mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5709 #else
5710         mss = tcp_maxmtu(rt);
5711 #endif
5712         /* Route locked during lookup above */
5713         RT_UNLOCK(rt);
5714
5715 #if NECP
5716         // At this point, the mss is just the MTU. Adjust if necessary.
5717         mss = necp_socket_get_effective_mtu(tp->t_inpcb, mss);
5718 #endif /* NECP */
5719
5720         return (mss - min_protoh);
5721 }
5722
5723 /*
5724  * On a partial ack arrives, force the retransmission of the
5725  * next unacknowledged segment.  Do not clear tp->t_dupacks.
5726  * By setting snd_nxt to th_ack, this forces retransmission timer to
5727  * be started again.
5728  */
5729 static void
5730 tcp_newreno_partial_ack(tp, th)
5731         struct tcpcb *tp;
5732         struct tcphdr *th;
5733 {
5734                 tcp_seq onxt = tp->snd_nxt;
5735                 u_int32_t  ocwnd = tp->snd_cwnd;
5736                 tp->t_timer[TCPT_REXMT] = 0;
5737                 tp->t_timer[TCPT_PTO] = 0;
5738                 tp->t_rtttime = 0;
5739                 tp->snd_nxt = th->th_ack;
5740                 /*
5741                  * Set snd_cwnd to one segment beyond acknowledged offset
5742                  * (tp->snd_una has not yet been updated when this function
5743                  *  is called)
5744                  */
5745                 tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
5746                 tp->t_flags |= TF_ACKNOW;
5747                 (void) tcp_output(tp);
5748                 tp->snd_cwnd = ocwnd;
5749                 if (SEQ_GT(onxt, tp->snd_nxt))
5750                         tp->snd_nxt = onxt;
5751                 /*
5752                  * Partial window deflation.  Relies on fact that tp->snd_una
5753                  * not updated yet.
5754                  */
5755                 if (tp->snd_cwnd > BYTES_ACKED(th, tp))
5756                         tp->snd_cwnd -= BYTES_ACKED(th, tp);
5757                 else
5758                         tp->snd_cwnd = 0;
5759                 tp->snd_cwnd += tp->t_maxseg;
5760
5761 }
5762
5763 /*
5764  * Drop a random TCP connection that hasn't been serviced yet and
5765  * is eligible for discard.  There is a one in qlen chance that
5766  * we will return a null, saying that there are no dropable
5767  * requests.  In this case, the protocol specific code should drop
5768  * the new request.  This insures fairness.
5769  *
5770  * The listening TCP socket "head" must be locked
5771  */
5772 static int
5773 tcp_dropdropablreq(struct socket *head)
5774 {
5775         struct socket *so, *sonext;
5776         unsigned int i, j, qlen;
5777         static u_int32_t rnd = 0;
5778         static u_int64_t old_runtime;
5779         static unsigned int cur_cnt, old_cnt;
5780         u_int64_t now_sec;
5781         struct inpcb *inp = NULL;
5782         struct tcpcb *tp;
5783
5784         if ((head->so_options & SO_ACCEPTCONN) == 0)
5785                 return (0);
5786
5787         if (TAILQ_EMPTY(&head->so_incomp))
5788                 return (0);
5789
5790         /*
5791          * Check if there is any socket in the incomp queue
5792          * that is closed because of a reset from the peer and is
5793          * waiting to be garbage collected. If so, pick that as
5794          * the victim
5795          */
5796         TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
5797                 inp = sotoinpcb(so);
5798                 tp = intotcpcb(inp);
5799                 if (tp != NULL && tp->t_state == TCPS_CLOSED &&
5800                     so->so_head != NULL &&
5801                     (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
5802                     (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) {
5803                         /*
5804                          * The listen socket is already locked but we
5805                          * can lock this socket here without lock ordering
5806                          * issues because it is in the incomp queue and
5807                          * is not visible to others.
5808                          */
5809                         if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5810                                 so->so_usecount++;
5811                                 goto found_victim;
5812                         } else {
5813                                 continue;
5814                         }
5815                 }
5816         }
5817
5818         so = TAILQ_FIRST(&head->so_incomp);
5819
5820         now_sec = net_uptime();
5821         if ((i = (now_sec - old_runtime)) != 0) {
5822                 old_runtime = now_sec;
5823                 old_cnt = cur_cnt / i;
5824                 cur_cnt = 0;
5825         }
5826
5827
5828         qlen = head->so_incqlen;
5829         if (rnd == 0)
5830                 rnd = RandomULong();
5831
5832         if (++cur_cnt > qlen || old_cnt > qlen) {
5833                 rnd = (314159 * rnd + 66329) & 0xffff;
5834                 j = ((qlen + 1) * rnd) >> 16;
5835
5836                 while (j-- && so)
5837                         so = TAILQ_NEXT(so, so_list);
5838         }
5839         /* Find a connection that is not already closing (or being served) */
5840         while (so) {
5841                 inp = (struct inpcb *)so->so_pcb;
5842
5843                 sonext = TAILQ_NEXT(so, so_list);
5844
5845                 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0)
5846                         != WNT_STOPUSING) {
5847                         /*
5848                          * Avoid the issue of a socket being accepted
5849                          * by one input thread and being dropped by
5850                          * another input thread. If we can't get a hold
5851                          * on this mutex, then grab the next socket in
5852                          * line.
5853                          */
5854                         if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5855                                 so->so_usecount++;
5856                                 if ((so->so_usecount == 2) &&
5857                                     (so->so_state & SS_INCOMP) &&
5858                                     !(so->so_flags & SOF_INCOMP_INPROGRESS))  {
5859                                         break;
5860                                 } else {
5861                                         /*
5862                                          * don't use if being accepted or
5863                                          * used in any other way
5864                                          */
5865                                         in_pcb_checkstate(inp, WNT_RELEASE, 1);
5866                                         tcp_unlock(so, 1, 0);
5867                                 }
5868                         } else {
5869                                 /*
5870                                  * do not try to lock the inp in
5871                                  * in_pcb_checkstate because the lock
5872                                  * is already held in some other thread.
5873                                  * Only drop the inp_wntcnt reference.
5874                                  */
5875                                 in_pcb_checkstate(inp, WNT_RELEASE, 1);
5876                         }
5877                 }
5878                 so = sonext;
5879
5880         }
5881         if (so == NULL) {
5882                 return (0);
5883         }
5884
5885         /* Makes sure socket is still in the right state to be discarded */
5886
5887         if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
5888                 tcp_unlock(so, 1, 0);
5889                 return (0);
5890         }
5891
5892 found_victim:
5893         if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
5894                 /* do not discard: that socket is being accepted */
5895                 tcp_unlock(so, 1, 0);
5896                 return (0);
5897         }
5898
5899         TAILQ_REMOVE(&head->so_incomp, so, so_list);
5900         tcp_unlock(head, 0, 0);
5901
5902         lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
5903         tp = sototcpcb(so);
5904         so->so_flags |= SOF_OVERFLOW;
5905         so->so_head = NULL;
5906
5907         tcp_close(tp);
5908         if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
5909                 /*
5910                  * Some one has a wantcnt on this pcb. Since WNT_ACQUIRE
5911                  * doesn't require a lock, it could have happened while
5912                  * we are holding the lock. This pcb will have to
5913                  * be garbage collected later.
5914                  * Release the reference held for so_incomp queue
5915                  */
5916                 so->so_usecount--;
5917                 tcp_unlock(so, 1, 0);
5918         } else {
5919                 /*
5920                  * Unlock this socket and leave the reference on.
5921                  * We need to acquire the pcbinfo lock in order to
5922                  * fully dispose it off
5923                  */
5924                 tcp_unlock(so, 0, 0);
5925
5926                 lck_rw_lock_exclusive(tcbinfo.ipi_lock);
5927
5928                 tcp_lock(so, 0, 0);
5929                 /* Release the reference held for so_incomp queue */
5930                 so->so_usecount--;
5931
5932                 if (so->so_usecount != 1 ||
5933                     (inp->inp_wantcnt > 0 &&
5934                     inp->inp_wantcnt != WNT_STOPUSING)) {
5935                         /*
5936                          * There is an extra wantcount or usecount
5937                          * that must have been added when the socket
5938                          * was unlocked. This socket will have to be
5939                          * garbage collected later
5940                          */
5941                         tcp_unlock(so, 1, 0);
5942                 } else {
5943
5944                         /* Drop the reference held for this function */
5945                         so->so_usecount--;
5946
5947                         in_pcbdispose(inp);
5948                 }
5949                 lck_rw_done(tcbinfo.ipi_lock);
5950         }
5951         tcpstat.tcps_drops++;
5952
5953         tcp_lock(head, 0, 0);
5954         head->so_incqlen--;
5955         head->so_qlen--;
5956         return(1);
5957 }
5958
5959 /* Set background congestion control on a socket */
5960 void
5961 tcp_set_background_cc(struct socket *so)
5962 {
5963         tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
5964 }
5965
5966 /* Set foreground congestion control on a socket */
5967 void
5968 tcp_set_foreground_cc(struct socket *so)
5969 {
5970         if (tcp_use_newreno)
5971                 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
5972         else
5973                 tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
5974 }
5975
5976 static void
5977 tcp_set_new_cc(struct socket *so, uint16_t cc_index)
5978 {
5979         struct inpcb *inp = sotoinpcb(so);
5980         struct tcpcb *tp = intotcpcb(inp);
5981         u_char old_cc_index = 0;
5982         if (tp->tcp_cc_index != cc_index) {
5983
5984                 old_cc_index = tp->tcp_cc_index;
5985
5986                 if (CC_ALGO(tp)->cleanup != NULL)
5987                         CC_ALGO(tp)->cleanup(tp);
5988                 tp->tcp_cc_index = cc_index;
5989
5990                 tcp_cc_allocate_state(tp);
5991
5992                 if (CC_ALGO(tp)->switch_to != NULL)
5993                         CC_ALGO(tp)->switch_to(tp, old_cc_index);
5994
5995                 tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
5996         }
5997 }
5998
5999 void
6000 tcp_set_recv_bg(struct socket *so)
6001 {
6002         if (!IS_TCP_RECV_BG(so))
6003                 so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG;
6004
6005         /* Unset Large Receive Offload on background sockets */
6006         so_set_lro(so, SO_TC_BK);
6007 }
6008
6009 void
6010 tcp_clear_recv_bg(struct socket *so)
6011 {
6012         if (IS_TCP_RECV_BG(so))
6013                 so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
6014
6015         /*
6016          * Set/unset use of Large Receive Offload depending on
6017          * the traffic class
6018          */
6019         so_set_lro(so, so->so_traffic_class);
6020 }
6021
6022 void
6023 inp_fc_unthrottle_tcp(struct inpcb *inp)
6024 {
6025         struct tcpcb *tp = inp->inp_ppcb;
6026         /*
6027          * Back off the slow-start threshold and enter
6028          * congestion avoidance phase
6029          */
6030         if (CC_ALGO(tp)->pre_fr != NULL)
6031                 CC_ALGO(tp)->pre_fr(tp);
6032
6033         tp->snd_cwnd = tp->snd_ssthresh;
6034         tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
6035         /*
6036          * Restart counting for ABC as we changed the
6037          * congestion window just now.
6038          */
6039         tp->t_bytes_acked = 0;
6040
6041         /* Reset retransmit shift as we know that the reason
6042          * for delay in sending a packet is due to flow
6043          * control on the outgoing interface. There is no need
6044          * to backoff retransmit timer.
6045          */
6046         tp->t_rxtshift = 0;
6047         tp->t_rtttime = 0;
6048
6049         /*
6050          * Start the output stream again. Since we are
6051          * not retransmitting data, do not reset the
6052          * retransmit timer or rtt calculation.
6053          */
6054         tcp_output(tp);
6055 }
6056
6057 static int
6058 tcp_getstat SYSCTL_HANDLER_ARGS
6059 {
6060 #pragma unused(oidp, arg1, arg2)
6061
6062         int error;
6063
6064         proc_t caller = PROC_NULL;
6065         proc_t caller_parent = PROC_NULL;
6066         char command_name[MAXCOMLEN + 1] = "";
6067         char parent_name[MAXCOMLEN + 1] = "";
6068
6069         if ((caller = proc_self()) != PROC_NULL) {
6070                 /* get process name */
6071                 strlcpy(command_name, caller->p_comm, sizeof(command_name));
6072
6073                 /* get parent process name if possible */
6074                 if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
6075                         strlcpy(parent_name, caller_parent->p_comm,
6076                             sizeof(parent_name));
6077                         proc_rele(caller_parent);
6078                 }
6079
6080                 if ((escape_str(command_name, strlen(command_name),
6081                     sizeof(command_name)) == 0) &&
6082                     (escape_str(parent_name, strlen(parent_name),
6083                     sizeof(parent_name)) == 0)) {
6084                         kern_asl_msg(LOG_DEBUG, "messagetracer",
6085                             5,
6086                             "com.apple.message.domain",
6087                             "com.apple.kernel.tcpstat", /* 1 */
6088                             "com.apple.message.signature",
6089                             "tcpstat", /* 2 */
6090                             "com.apple.message.signature2", command_name, /* 3 */
6091                             "com.apple.message.signature3", parent_name, /* 4 */
6092                             "com.apple.message.summarize", "YES", /* 5 */
6093                             NULL);
6094                 }
6095         }
6096         if (caller != PROC_NULL)
6097                 proc_rele(caller);
6098
6099         if (req->oldptr == 0) {
6100                 req->oldlen= (size_t)sizeof(struct tcpstat);
6101         }
6102
6103         error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
6104
6105         return (error);
6106
6107 }
6108
6109 /*
6110  * Checksum extended TCP header and data.
6111  */
6112 int
6113 tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
6114 {
6115         struct ifnet *ifp = m->m_pkthdr.rcvif;
6116
6117         switch (af) {
6118         case AF_INET: {
6119                 struct ip *ip = mtod(m, struct ip *);
6120                 struct ipovly *ipov = (struct ipovly *)ip;
6121
6122                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6123                         return (0);
6124
6125                 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6126                     (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6127                     (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6128                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6129                                 th->th_sum = m->m_pkthdr.csum_rx_val;
6130                         } else {
6131                                 uint16_t sum = m->m_pkthdr.csum_rx_val;
6132                                 uint16_t start = m->m_pkthdr.csum_rx_start;
6133
6134                                 /*
6135                                  * Perform 1's complement adjustment of octets
6136                                  * that got included/excluded in the hardware-
6137                                  * calculated checksum value.  Ignore cases
6138                                  * where the value includes or excludes the IP
6139                                  * header span, as the sum for those octets
6140                                  * would already be 0xffff and thus no-op.
6141                                  */
6142                                 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6143                                     start != 0 && (off - start) != off) {
6144 #if BYTE_ORDER != BIG_ENDIAN
6145                                         if (start < off) {
6146                                                 HTONS(ip->ip_len);
6147                                                 HTONS(ip->ip_off);
6148                                         }
6149 #endif
6150                                         /* callee folds in sum */
6151                                         sum = m_adj_sum16(m, start, off, sum);
6152 #if BYTE_ORDER != BIG_ENDIAN
6153                                         if (start < off) {
6154                                                 NTOHS(ip->ip_off);
6155                                                 NTOHS(ip->ip_len);
6156                                         }
6157 #endif
6158                                 }
6159
6160                                 /* callee folds in sum */
6161                                 th->th_sum = in_pseudo(ip->ip_src.s_addr,
6162                                     ip->ip_dst.s_addr,
6163                                     sum + htonl(tlen + IPPROTO_TCP));
6164                         }
6165                         th->th_sum ^= 0xffff;
6166                 } else {
6167                         uint16_t ip_sum;
6168                         int len;
6169                         char b[9];
6170
6171                         bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
6172                         bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
6173                         ip_sum = ipov->ih_len;
6174                         ipov->ih_len = (u_short)tlen;
6175 #if BYTE_ORDER != BIG_ENDIAN
6176                         HTONS(ipov->ih_len);
6177 #endif
6178                         len = sizeof (struct ip) + tlen;
6179                         th->th_sum = in_cksum(m, len);
6180                         bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
6181                         ipov->ih_len = ip_sum;
6182
6183                         tcp_in_cksum_stats(len);
6184                 }
6185                 break;
6186         }
6187 #if INET6
6188         case AF_INET6: {
6189                 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
6190
6191                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6192                         return (0);
6193
6194                 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6195                     (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6196                     (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6197                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6198                                 th->th_sum = m->m_pkthdr.csum_rx_val;
6199                         } else {
6200                                 uint16_t sum = m->m_pkthdr.csum_rx_val;
6201                                 uint16_t start = m->m_pkthdr.csum_rx_start;
6202
6203                                 /*
6204                                  * Perform 1's complement adjustment of octets
6205                                  * that got included/excluded in the hardware-
6206                                  * calculated checksum value.
6207                                  */
6208                                 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6209                                     start != off) {
6210                                         uint16_t s, d;
6211
6212                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
6213                                                 s = ip6->ip6_src.s6_addr16[1];
6214                                                 ip6->ip6_src.s6_addr16[1] = 0 ;
6215                                         }
6216                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
6217                                                 d = ip6->ip6_dst.s6_addr16[1];
6218                                                 ip6->ip6_dst.s6_addr16[1] = 0;
6219                                         }
6220
6221                                         /* callee folds in sum */
6222                                         sum = m_adj_sum16(m, start, off, sum);
6223
6224                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
6225                                                 ip6->ip6_src.s6_addr16[1] = s;
6226                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
6227                                                 ip6->ip6_dst.s6_addr16[1] = d;
6228                                 }
6229
6230                                 th->th_sum = in6_pseudo(
6231                                     &ip6->ip6_src, &ip6->ip6_dst,
6232                                     sum + htonl(tlen + IPPROTO_TCP));
6233                         }
6234                         th->th_sum ^= 0xffff;
6235                 } else {
6236                         tcp_in6_cksum_stats(tlen);
6237                         th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
6238                 }
6239                 break;
6240         }
6241 #endif /* INET6 */
6242         default:
6243                 VERIFY(0);
6244                 /* NOTREACHED */
6245         }
6246
6247         if (th->th_sum != 0) {
6248                 tcpstat.tcps_rcvbadsum++;
6249                 IF_TCP_STATINC(ifp, badformat);
6250                 return (-1);
6251         }
6252
6253         return (0);
6254 }
6255
6256 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
6257     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat,
6258     "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
6259
6260 static int
6261 sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
6262 {
6263 #pragma unused(arg1, arg2)
6264
6265         int error, val = tcprexmtthresh;
6266
6267         error = sysctl_handle_int(oidp, &val, 0, req);
6268         if (error || !req->newptr)
6269                 return (error);
6270
6271         /*
6272          * Constrain the number of duplicate ACKs
6273          * to consider for TCP fast retransmit
6274          * to either 2 or 3
6275          */
6276
6277         if (val < 2 || val > 3)
6278                 return (EINVAL);
6279
6280          tcprexmtthresh = val;
6281
6282         return (0);
6283 }
6284
6285 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW |
6286         CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I",
6287         "Duplicate ACK Threshold for Fast Retransmit");
6288