bsd/netinet/tcp_input.c

   1 /*
   2  * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/kernel.h>
  73 #include <sys/sysctl.h>
  74 #include <sys/malloc.h>
  75 #include <sys/mbuf.h>
  76 #include <sys/proc.h>           /* for proc0 declaration */
  77 #include <sys/protosw.h>
  78 #include <sys/socket.h>
  79 #include <sys/socketvar.h>
  80 #include <sys/syslog.h>
  81 #include <sys/mcache.h>
  82 #include <sys/kasl.h>
  83 #include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */
  84
  85 #include <machine/endian.h>
  86
  87 #include <net/if.h>
  88 #include <net/if_types.h>
  89 #include <net/route.h>
  90 #include <net/ntstat.h>
  91 #include <net/dlil.h>
  92
  93 #include <netinet/in.h>
  94 #include <netinet/in_systm.h>
  95 #include <netinet/ip.h>
  96 #include <netinet/ip_icmp.h>    /* for ICMP_BANDLIM             */
  97 #include <netinet/in_var.h>
  98 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM     */
  99 #include <netinet/in_pcb.h>
 100 #include <netinet/ip_var.h>
 101 #include <mach/sdt.h>
 102 #if INET6
 103 #include <netinet/ip6.h>
 104 #include <netinet/icmp6.h>
 105 #include <netinet6/nd6.h>
 106 #include <netinet6/ip6_var.h>
 107 #include <netinet6/in6_pcb.h>
 108 #endif
 109 #include <netinet/tcp.h>
 110 #include <netinet/tcp_fsm.h>
 111 #include <netinet/tcp_seq.h>
 112 #include <netinet/tcp_timer.h>
 113 #include <netinet/tcp_var.h>
 114 #include <netinet/tcp_cc.h>
 115 #include <dev/random/randomdev.h>
 116 #include <kern/zalloc.h>
 117 #if INET6
 118 #include <netinet6/tcp6_var.h>
 119 #endif
 120 #include <netinet/tcpip.h>
 121 #if TCPDEBUG
 122 #include <netinet/tcp_debug.h>
 123 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
 124 struct tcphdr tcp_savetcp;
 125 #endif /* TCPDEBUG */
 126
 127 #if IPSEC
 128 #include <netinet6/ipsec.h>
 129 #if INET6
 130 #include <netinet6/ipsec6.h>
 131 #endif
 132 #include <netkey/key.h>
 133 #endif /*IPSEC*/
 134
 135 #if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
 136 #include <security/mac_framework.h>
 137 #endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
 138
 139 #include <sys/kdebug.h>
 140 #include <netinet/lro_ext.h>
 141 #if MPTCP
 142 #include <netinet/mptcp_var.h>
 143 #include <netinet/mptcp.h>
 144 #include <netinet/mptcp_opt.h>
 145 #endif /* MPTCP */
 146
 147 #define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 0)
 148 #define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 2)
 149 #define DBG_FNC_TCP_INPUT       NETDBG_CODE(DBG_NETTCP, (3 << 8))
 150 #define DBG_FNC_TCP_NEWCONN     NETDBG_CODE(DBG_NETTCP, (7 << 8))
 151
 152 tcp_cc  tcp_ccgen;
 153
 154 #if IPSEC
 155 extern int ipsec_bypass;
 156 #endif
 157
 158 extern int32_t total_sbmb_cnt;
 159
 160 struct  tcpstat tcpstat;
 161
 162 static int log_in_vain = 0;
 163 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED,
 164     &log_in_vain, 0, "Log all incoming TCP connections");
 165
 166 static int blackhole = 0;
 167 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED,
 168         &blackhole, 0, "Do not send RST when dropping refused connections");
 169
 170 int tcp_delack_enabled = 3;
 171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED,
 172     &tcp_delack_enabled, 0,
 173     "Delay ACK to try and piggyback it onto a data packet");
 174
 175 int tcp_lq_overflow = 1;
 176 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED,
 177     &tcp_lq_overflow, 0,
 178     "Listen Queue Overflow");
 179
 180 int tcp_recv_bg = 0;
 181 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
 182     &tcp_recv_bg, 0,
 183     "Receive background");
 184
 185 #if TCP_DROP_SYNFIN
 186 static int drop_synfin = 1;
 187 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED,
 188     &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
 189 #endif
 190
 191 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
 192     "TCP Segment Reassembly Queue");
 193
 194 __private_extern__ int tcp_reass_maxseg = 0;
 195 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW | CTLFLAG_LOCKED,
 196     &tcp_reass_maxseg, 0,
 197     "Global maximum number of TCP Segments in Reassembly Queue");
 198
 199 __private_extern__ int tcp_reass_qsize = 0;
 200 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD | CTLFLAG_LOCKED,
 201     &tcp_reass_qsize, 0,
 202     "Global number of TCP Segments currently in Reassembly Queue");
 203
 204 static int tcp_reass_overflows = 0;
 205 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED,
 206     &tcp_reass_overflows, 0,
 207     "Global number of TCP Segment Reassembly Queue Overflows");
 208
 209
 210 __private_extern__ int slowlink_wsize = 8192;
 211 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
 212         &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
 213
 214 int maxseg_unacked = 8;
 215 SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED,
 216         &maxseg_unacked, 0, "Maximum number of outstanding segments left unacked");
 217
 218 int     tcp_do_rfc3465 = 1;
 219 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
 220         &tcp_do_rfc3465, 0, "");
 221
 222 int     tcp_do_rfc3465_lim2 = 1;
 223 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED,
 224         &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS");
 225
 226 int     rtt_samples_per_slot = 20;
 227 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED,
 228         &rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history");
 229
 230 int     tcp_allowed_iaj = ALLOWED_IAJ;
 231 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED,
 232         &tcp_allowed_iaj, 0, "Allowed inter-packet arrival jiter");
 233
 234 int     tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
 235 SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
 236         &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ");
 237
 238 u_int32_t tcp_do_autorcvbuf = 1;
 239 SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf, CTLFLAG_RW | CTLFLAG_LOCKED,
 240         &tcp_do_autorcvbuf, 0, "Enable automatic socket buffer tuning");
 241
 242 u_int32_t tcp_autorcvbuf_inc_shift = 3;
 243 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift, CTLFLAG_RW | CTLFLAG_LOCKED,
 244         &tcp_autorcvbuf_inc_shift, 0, "Shift for increment in receive socket buffer size");
 245
 246 u_int32_t tcp_autorcvbuf_max = 512 * 1024;
 247 SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED,
 248         &tcp_autorcvbuf_max, 0, "Maximum receive socket buffer size");
 249
 250 int sw_lro = 0;
 251 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
 252         &sw_lro, 0, "Used to coalesce TCP packets");
 253
 254 int lrodebug = 0;
 255 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg, CTLFLAG_RW | CTLFLAG_LOCKED,
 256         &lrodebug, 0, "Used to debug SW LRO");
 257
 258 int lro_start = 4;
 259 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
 260         &lro_start, 0, "Segments for starting LRO computed as power of 2");
 261
 262 extern int tcp_do_autosendbuf;
 263
 264 int limited_txmt = 1;
 265 SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit, CTLFLAG_RW | CTLFLAG_LOCKED,
 266         &limited_txmt, 0, "Enable limited transmit");
 267
 268 int early_rexmt = 1;
 269 SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt, CTLFLAG_RW | CTLFLAG_LOCKED,
 270         &early_rexmt, 0, "Enable Early Retransmit");
 271
 272 int sack_ackadv = 1;
 273 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv, CTLFLAG_RW | CTLFLAG_LOCKED,
 274         &sack_ackadv, 0, "Use SACK with cumulative ack advancement as a dupack");
 275
 276 #if CONFIG_IFEF_NOWINDOWSCALE
 277 int tcp_obey_ifef_nowindowscale = 0;
 278 SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED,
 279         &tcp_obey_ifef_nowindowscale, 0, "");
 280 #endif
 281
 282 /* This limit will determine when the receive socket buffer tuning will
 283  * kick in. Currently it will start when the bw*delay measured in
 284  * last RTT is more than half of the current hiwat on the buffer.
 285  */
 286 uint32_t tcp_rbuf_hiwat_shift = 1;
 287
 288 /* This limit will determine when the socket buffer will be increased
 289  * to accommodate an application reading slowly. When the amount of
 290  * space left in the buffer is less than one forth of the bw*delay
 291  * measured in last RTT.
 292  */
 293 uint32_t tcp_rbuf_win_shift = 2;
 294
 295 extern int tcp_TCPTV_MIN;
 296 extern int tcp_acc_iaj_high;
 297 extern int tcp_acc_iaj_react_limit;
 298 extern struct zone *tcp_reass_zone;
 299
 300 int tcprexmtthresh = 3;
 301
 302 u_int32_t tcp_now;
 303 struct timeval tcp_uptime;      /* uptime when tcp_now was last updated */
 304 lck_spin_t *tcp_uptime_lock;    /* Used to sychronize updates to tcp_now */
 305
 306 struct inpcbhead tcb;
 307 #define tcb6    tcb  /* for KAME src sync over BSD*'s */
 308 struct inpcbinfo tcbinfo;
 309
 310 static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
 311     struct tcpopt *, unsigned int);
 312 static void      tcp_pulloutofband(struct socket *,
 313             struct tcphdr *, struct mbuf *, int);
 314 static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
 315     struct ifnet *);
 316 static void     tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
 317 static inline unsigned int tcp_maxmtu(struct rtentry *);
 318 static inline int tcp_stretch_ack_enable(struct tcpcb *tp);
 319 static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
 320
 321 #if TRAFFIC_MGT
 322 static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen, int reset_size);
 323 void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
 324 static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
 325 #endif /* TRAFFIC_MGT */
 326
 327 #if INET6
 328 static inline unsigned int tcp_maxmtu6(struct rtentry *);
 329 #endif
 330
 331 static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
 332         struct tcpopt *to, u_int32_t tlen);
 333
 334 void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
 335 static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
 336 static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
 337 static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
 338         u_int32_t newsize, u_int32_t idealsize);
 339 static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
 340 static int tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcpopt *to);
 341 static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
 342         struct tcphdr *th);
 343
 344 /*
 345  * Constants used for resizing receive socket buffer
 346  * when timestamps are not supported
 347  */
 348 #define TCPTV_RCVNOTS_QUANTUM 100
 349 #define TCP_RCVNOTS_BYTELEVEL 204800
 350
 351 /*
 352  * Constants used for limiting early retransmits
 353  * to 10 per minute.
 354  */
 355 #define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
 356 #define TCP_EARLY_REXMT_LIMIT 10
 357
 358 extern void     add_to_time_wait(struct tcpcb *, uint32_t delay);
 359 extern void postevent(struct socket *, struct sockbuf *, int);
 360
 361 extern  void    ipfwsyslog( int level, const char *format,...);
 362 extern int fw_verbose;
 363
 364 #if IPFIREWALL
 365 extern void ipfw_stealth_stats_incr_tcp(void);
 366
 367 #define log_in_vain_log( a ) {            \
 368         if ( (log_in_vain == 3 ) && (fw_verbose == 2)) {        /* Apple logging, log to ipfw.log */ \
 369                 ipfwsyslog a ;  \
 370         } else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) {   \
 371                 ipfw_stealth_stats_incr_tcp();                    \
 372         }                       \
 373         else log a ;            \
 374 }
 375 #else
 376 #define log_in_vain_log( a ) { log a; }
 377 #endif
 378
 379 int tcp_rcvunackwin = TCPTV_UNACKWIN;
 380 int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
 381 int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
 382 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
 383         &tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");
 384
 385 #define DELAY_ACK(tp, th) \
 386         (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
 387
 388 static int tcp_dropdropablreq(struct socket *head);
 389 static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
 390
 391 static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
 392 uint32_t get_base_rtt(struct tcpcb *tp);
 393 void tcp_set_background_cc(struct socket *so);
 394 void tcp_set_foreground_cc(struct socket *so);
 395 static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
 396 static void tcp_bwmeas_check(struct tcpcb *tp);
 397
 398 #if TRAFFIC_MGT
 399 void
 400 reset_acc_iaj(struct tcpcb *tp)
 401 {
 402         tp->acc_iaj = 0;
 403         tp->iaj_rwintop = 0;
 404         CLEAR_IAJ_STATE(tp);
 405 }
 406
 407 static inline void
 408 update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
 409 {
 410         if (rst_size > 0)
 411                 tp->iaj_size = 0;
 412         if (tp->iaj_size == 0 || size >= tp->iaj_size) {
 413                 tp->iaj_size = size;
 414                 tp->iaj_rcv_ts = tcp_now;
 415                 tp->iaj_small_pkt = 0;
 416         }
 417 }
 418
 419 /* For every 32 bit unsigned integer(v), this function will find the
 420  * largest integer n such that (n*n <= v). This takes at most 16 iterations
 421  * irrespective of the value of v and does not involve multiplications.
 422  */
 423 static inline int
 424 isqrt(unsigned int val) {
 425         unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
 426         unsigned int temp, g=0, b=0x8000, bshft=15;
 427         if ( val <= 100) {
 428                 for (g = 0; g <= 10; ++g) {
 429                         if (sqrt_cache[g] > val) {
 430                                 g--;
 431                                 break;
 432                         } else if (sqrt_cache[g] == val) {
 433                                 break;
 434                         }
 435                 }
 436         } else {
 437                 do {
 438                         temp = (((g << 1) + b) << (bshft--));
 439                         if (val >= temp) {
 440                                 g += b;
 441                                 val -= temp;
 442                         }
 443                         b >>= 1;
 444                 } while ( b > 0 && val > 0);
 445         }
 446         return(g);
 447 }
 448
 449 /*
 450 * With LRO, roughly estimate the inter arrival time between
 451 * each sub coalesced packet as an average. Count the delay
 452 * cur_iaj to be the delay between the last packet received
 453 * and the first packet of the LRO stream. Due to round off errors
 454 * cur_iaj may be the same as lro_delay_factor. Averaging has
 455 * round off errors too. lro_delay_factor may be close to 0
 456 * in steady state leading to lower values fed to compute_iaj_meat.
 457 */
 458 void
 459 compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor)
 460 {
 461         uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
 462         uint32_t timediff = 0;
 463
 464         if (cur_iaj >= lro_delay_factor) {
 465                 cur_iaj = cur_iaj - lro_delay_factor;
 466         }
 467
 468         compute_iaj_meat(tp, cur_iaj);
 469
 470         if (nlropkts <= 1)
 471                 return;
 472
 473         nlropkts--;
 474
 475         timediff = lro_delay_factor/nlropkts;
 476
 477         while (nlropkts > 0)
 478         {
 479                 compute_iaj_meat(tp, timediff);
 480                 nlropkts--;
 481         }
 482 }
 483
 484 static
 485 void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
 486 {
 487         /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
 488          * throttle the receive window to a minimum of MIN_IAJ_WIN packets
 489          */
 490 #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
 491 #define IAJ_DIV_SHIFT 4
 492 #define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
 493
 494         uint32_t allowed_iaj, acc_iaj = 0;
 495
 496         uint32_t mean, temp;
 497         int32_t cur_iaj_dev;
 498
 499         cur_iaj_dev = (cur_iaj - tp->avg_iaj);
 500
 501         /* Allow a jitter of "allowed_iaj" milliseconds. Some connections
 502          * may have a constant jitter more than that. We detect this by
 503          * using standard deviation.
 504          */
 505         allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
 506         if (allowed_iaj < tcp_allowed_iaj)
 507                 allowed_iaj = tcp_allowed_iaj;
 508
 509         /* Initially when the connection starts, the senders congestion
 510          * window is small. During this period we avoid throttling a
 511          * connection because we do not have a good starting point for
 512          * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
 513          * the first few packets.
 514          */
 515         if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
 516                 if ( cur_iaj <= allowed_iaj ) {
 517                         if (tp->acc_iaj >= 2)
 518                                 acc_iaj = tp->acc_iaj - 2;
 519                         else
 520                                 acc_iaj = 0;
 521
 522                 } else {
 523                         acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
 524                 }
 525
 526                 if (acc_iaj > MAX_ACC_IAJ)
 527                         acc_iaj = MAX_ACC_IAJ;
 528                 tp->acc_iaj = acc_iaj;
 529         }
 530
 531         /* Compute weighted average where the history has a weight of
 532          * 15 out of 16 and the current value has a weight of 1 out of 16.
 533          * This will make the short-term measurements have more weight.
 534          *
 535          * The addition of 8 will help to round-up the value
 536          * instead of round-down
 537          */
 538         tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
 539                 + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
 540
 541         /* Compute Root-mean-square of deviation where mean is a weighted
 542          * average as described above.
 543          */
 544         temp = tp->std_dev_iaj * tp->std_dev_iaj;
 545         mean = (((temp << IAJ_DIV_SHIFT) - temp)
 546                 + (cur_iaj_dev * cur_iaj_dev)
 547                 + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
 548
 549         tp->std_dev_iaj = isqrt(mean);
 550
 551         DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
 552                 uint32_t, allowed_iaj);
 553
 554         return;
 555 }
 556 #endif /* TRAFFIC_MGT */
 557
 558 /* Check if enough amount of data has been acknowledged since
 559  * bw measurement was started
 560  */
 561 static void
 562 tcp_bwmeas_check(struct tcpcb *tp)
 563 {
 564         int32_t bw_meas_bytes;
 565         uint32_t bw, bytes, elapsed_time;
 566         bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
 567         if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 &&
 568             bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
 569                 bytes = bw_meas_bytes;
 570                 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
 571                 if (elapsed_time > 0) {
 572                         bw = bytes / elapsed_time;
 573                         if ( bw > 0) {
 574                                 if (tp->t_bwmeas->bw_sndbw > 0) {
 575                                         tp->t_bwmeas->bw_sndbw =
 576                                             (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3;
 577                                 } else {
 578                                         tp->t_bwmeas->bw_sndbw = bw;
 579                                 }
 580                         }
 581                 }
 582                 tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
 583         }
 584 }
 585
 586 static int
 587 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
 588     struct ifnet *ifp)
 589 {
 590         struct tseg_qent *q;
 591         struct tseg_qent *p = NULL;
 592         struct tseg_qent *nq;
 593         struct tseg_qent *te = NULL;
 594         struct inpcb *inp = tp->t_inpcb;
 595         struct socket *so = inp->inp_socket;
 596         int flags = 0;
 597         int dowakeup = 0;
 598         struct mbuf *oodata = NULL;
 599         int copy_oodata = 0;
 600         boolean_t cell = IFNET_IS_CELLULAR(ifp);
 601         boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
 602
 603         /*
 604          * Call with th==0 after become established to
 605          * force pre-ESTABLISHED data up to user socket.
 606          */
 607         if (th == NULL)
 608                 goto present;
 609
 610         /* If the reassembly queue already has entries or if we are going to add
 611          * a new one, then the connection has reached a loss state.
 612          * Reset the stretch-ack algorithm at this point.
 613          */
 614         if ((tp->t_flags & TF_STRETCHACK) != 0)
 615                 tcp_reset_stretch_ack(tp);
 616
 617         /* When the connection reaches a loss state, we need to send more acks
 618          * for a period of time so that the sender's congestion window will
 619          * open. Wait until we see some packets on the connection before
 620          * stretching acks again.
 621          */
 622         tp->t_flagsext |= TF_RCVUNACK_WAITSS;
 623         tp->rcv_waitforss = 0;
 624
 625
 626 #if TRAFFIC_MGT
 627         if (tp->acc_iaj > 0)
 628                 reset_acc_iaj(tp);
 629 #endif /* TRAFFIC_MGT */
 630
 631         /*
 632          * Limit the number of segments in the reassembly queue to prevent
 633          * holding on to too many segments (and thus running out of mbufs).
 634          * Make sure to let the missing segment through which caused this
 635          * queue.  Always keep one global queue entry spare to be able to
 636          * process the missing segment.
 637          */
 638         if (th->th_seq != tp->rcv_nxt &&
 639             tcp_reass_qsize + 1 >= tcp_reass_maxseg) {
 640                 tcp_reass_overflows++;
 641                 tcpstat.tcps_rcvmemdrop++;
 642                 m_freem(m);
 643                 *tlenp = 0;
 644                 return (0);
 645         }
 646
 647         /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
 648         te = (struct tseg_qent *) zalloc_noblock(tcp_reass_zone);
 649         if (te == NULL) {
 650                 tcpstat.tcps_rcvmemdrop++;
 651                 m_freem(m);
 652                 return (0);
 653         }
 654         tcp_reass_qsize++;
 655
 656         /*
 657          * Find a segment which begins after this one does.
 658          */
 659         LIST_FOREACH(q, &tp->t_segq, tqe_q) {
 660                 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
 661                         break;
 662                 p = q;
 663         }
 664
 665         /*
 666          * If there is a preceding segment, it may provide some of
 667          * our data already.  If so, drop the data from the incoming
 668          * segment.  If it provides all of our data, drop us.
 669          */
 670         if (p != NULL) {
 671                 register int i;
 672                 /* conversion to int (in i) handles seq wraparound */
 673                 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
 674                 if (i > 0) {
 675                         if (i >= *tlenp) {
 676                                 tcpstat.tcps_rcvduppack++;
 677                                 tcpstat.tcps_rcvdupbyte += *tlenp;
 678                                 if (nstat_collect) {
 679                                         nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_DUPLICATE);
 680                                         INP_ADD_STAT(inp, cell, wifi, rxpackets, 1);
 681                                         INP_ADD_STAT(inp, cell, wifi, rxbytes, *tlenp);
 682                                         tp->t_stat.rxduplicatebytes += *tlenp;
 683                                 }
 684                                 m_freem(m);
 685                                 zfree(tcp_reass_zone, te);
 686                                 te = NULL;
 687                                 tcp_reass_qsize--;
 688                                 /*
 689                                  * Try to present any queued data
 690                                  * at the left window edge to the user.
 691                                  * This is needed after the 3-WHS
 692                                  * completes.
 693                                  */
 694                                 goto present;
 695                         }
 696                         m_adj(m, i);
 697                         *tlenp -= i;
 698                         th->th_seq += i;
 699                 }
 700         }
 701         tcpstat.tcps_rcvoopack++;
 702         tcpstat.tcps_rcvoobyte += *tlenp;
 703         if (nstat_collect) {
 704                 nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_OUT_OF_ORDER);
 705                 INP_ADD_STAT(inp, cell, wifi, rxpackets, 1);
 706                 INP_ADD_STAT(inp, cell, wifi, rxbytes, *tlenp);
 707                 tp->t_stat.rxoutoforderbytes += *tlenp;
 708         }
 709
 710         /*
 711          * While we overlap succeeding segments trim them or,
 712          * if they are completely covered, dequeue them.
 713          */
 714         while (q) {
 715                 register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
 716                 if (i <= 0)
 717                         break;
 718                 if (i < q->tqe_len) {
 719                         q->tqe_th->th_seq += i;
 720                         q->tqe_len -= i;
 721                         m_adj(q->tqe_m, i);
 722                         break;
 723                 }
 724
 725                 nq = LIST_NEXT(q, tqe_q);
 726                 LIST_REMOVE(q, tqe_q);
 727                 m_freem(q->tqe_m);
 728                 zfree(tcp_reass_zone, q);
 729                 tcp_reass_qsize--;
 730                 q = nq;
 731         }
 732
 733         /* Insert the new segment queue entry into place. */
 734         te->tqe_m = m;
 735         te->tqe_th = th;
 736         te->tqe_len = *tlenp;
 737
 738         if (p == NULL) {
 739                 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
 740         } else {
 741                 LIST_INSERT_AFTER(p, te, tqe_q);
 742         }
 743
 744         /*
 745          * New out-of-order data exists, and is pointed to by
 746          * queue entry te. Set copy_oodata to 1 so out-of-order data
 747          * can be copied off to sockbuf after in-order data
 748          * is copied off.
 749          */
 750         if (!(so->so_state & SS_CANTRCVMORE))
 751                 copy_oodata = 1;
 752
 753 present:
 754         /*
 755          * Present data to user, advancing rcv_nxt through
 756          * completed sequence space.
 757          */
 758         if (!TCPS_HAVEESTABLISHED(tp->t_state))
 759                 return (0);
 760         q = LIST_FIRST(&tp->t_segq);
 761         if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
 762                 /* Stop using LRO once out of order packets arrive */
 763                 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
 764                         tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
 765                                 th->th_dport, th->th_sport);
 766                         tp->t_flagsext &= ~TF_LRO_OFFLOADED;
 767                 }
 768
 769                 /*
 770                  * continue processing if out-of-order data
 771                  * can be delivered
 772                  */
 773                 if (q && (so->so_flags & SOF_ENABLE_MSGS))
 774                         goto msg_unordered_delivery;
 775
 776                 return (0);
 777         }
 778         do {
 779                 tp->rcv_nxt += q->tqe_len;
 780                 flags = q->tqe_th->th_flags & TH_FIN;
 781                 nq = LIST_NEXT(q, tqe_q);
 782                 LIST_REMOVE(q, tqe_q);
 783                 if (so->so_state & SS_CANTRCVMORE) {
 784                         m_freem(q->tqe_m);
 785                 } else {
 786                         so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
 787                         if (so->so_flags & SOF_ENABLE_MSGS) {
 788                                 /*
 789                                  * Append the inorder data as a message to the
 790                                  * receive socket buffer. Also check to see if
 791                                  * the data we are about to deliver is the same
 792                                  * data that we wanted to pass up to the user
 793                                  * out of order. If so, reset copy_oodata --
 794                                  * the received data filled a gap, and
 795                                  * is now in order!
 796                                  */
 797                                 if (q == te)
 798                                         copy_oodata = 0;
 799                         }
 800                         if (sbappendstream_rcvdemux(so, q->tqe_m,
 801                             q->tqe_th->th_seq - (tp->irs + 1), 0))
 802                                 dowakeup = 1;
 803                         if (tp->t_flagsext & TF_LRO_OFFLOADED) {
 804                                 tcp_update_lro_seq(tp->rcv_nxt,
 805                                  inp->inp_laddr, inp->inp_faddr,
 806                                  th->th_dport, th->th_sport);
 807                         }
 808                 }
 809                 zfree(tcp_reass_zone, q);
 810                 tcp_reass_qsize--;
 811                 q = nq;
 812         } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
 813
 814 #if INET6
 815         if ((inp->inp_vflag & INP_IPV6) != 0) {
 816
 817                 KERNEL_DEBUG(DBG_LAYER_BEG,
 818                      ((inp->inp_fport << 16) | inp->inp_lport),
 819                      (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
 820                       (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
 821                      0,0,0);
 822         }
 823         else
 824 #endif
 825         {
 826                 KERNEL_DEBUG(DBG_LAYER_BEG,
 827                      ((inp->inp_fport << 16) | inp->inp_lport),
 828                      (((inp->inp_laddr.s_addr & 0xffff) << 16) |
 829                       (inp->inp_faddr.s_addr & 0xffff)),
 830                      0,0,0);
 831         }
 832
 833 msg_unordered_delivery:
 834         /* Deliver out-of-order data as a message */
 835         if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
 836                 /*
 837                  * make a copy of the mbuf to be delivered up to
 838                  * the user, and add it to the sockbuf
 839                  */
 840                 oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
 841                 if (oodata != NULL) {
 842                         if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
 843                                 te->tqe_th->th_seq - (tp->irs + 1), 1)) {
 844                                 dowakeup = 1;
 845                                 tcpstat.tcps_msg_unopkts++;
 846                         } else {
 847                                 tcpstat.tcps_msg_unoappendfail++;
 848                         }
 849                 }
 850         }
 851
 852         if (dowakeup)
 853                 sorwakeup(so); /* done with socket lock held */
 854         return (flags);
 855 }
 856
 857 /*
 858  * Reduce congestion window.
 859  */
 860 static void
 861 tcp_reduce_congestion_window(
 862         struct tcpcb    *tp)
 863 {
 864         /*
 865          * If the current tcp cc module has
 866          * defined a hook for tasks to run
 867          * before entering FR, call it
 868          */
 869         if (CC_ALGO(tp)->pre_fr != NULL)
 870                 CC_ALGO(tp)->pre_fr(tp);
 871         ENTER_FASTRECOVERY(tp);
 872         tp->snd_recover = tp->snd_max;
 873         tp->t_timer[TCPT_REXMT] = 0;
 874         tp->t_rtttime = 0;
 875         tp->ecn_flags |= TE_SENDCWR;
 876         tp->snd_cwnd = tp->snd_ssthresh +
 877                  tp->t_maxseg * tcprexmtthresh;
 878 }
 879
 880 /*
 881  * The application wants to get an event if there
 882  * is a stall during read. Set the initial keepalive
 883  * timeout to be equal to twice RTO.
 884  */
 885 static inline void
 886 tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
 887 {
 888         if (tp->t_adaptive_rtimo > 0 && tlen > 0 &&
 889                 tp->t_state == TCPS_ESTABLISHED) {
 890                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
 891                         (TCP_REXMTVAL(tp) << 1));
 892                 tp->t_flagsext |= TF_DETECT_READSTALL;
 893                 tp->t_rtimo_probes = 0;
 894         }
 895 }
 896
 897 inline void
 898 tcp_keepalive_reset(struct tcpcb *tp)
 899 {
 900         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
 901                 TCP_CONN_KEEPIDLE(tp));
 902         tp->t_flagsext &= ~(TF_DETECT_READSTALL);
 903         tp->t_rtimo_probes = 0;
 904 }
 905
 906 /*
 907  * TCP input routine, follows pages 65-76 of the
 908  * protocol specification dated September, 1981 very closely.
 909  */
 910 #if INET6
 911 int
 912 tcp6_input(struct mbuf **mp, int *offp, int proto)
 913 {
 914 #pragma unused(proto)
 915         register struct mbuf *m = *mp;
 916         uint32_t ia6_flags;
 917         struct ifnet *ifp = m->m_pkthdr.rcvif;
 918
 919         IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
 920
 921         /* Expect 32-bit aligned data pointer on strict-align platforms */
 922         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
 923
 924         /*
 925          * draft-itojun-ipv6-tcp-to-anycast
 926          * better place to put this in?
 927          */
 928         if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
 929                 if (ia6_flags & IN6_IFF_ANYCAST) {
 930                         struct ip6_hdr *ip6;
 931
 932                         ip6 = mtod(m, struct ip6_hdr *);
 933                         icmp6_error(m, ICMP6_DST_UNREACH,
 934                             ICMP6_DST_UNREACH_ADDR,
 935                             (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 936
 937                         IF_TCP_STATINC(ifp, icmp6unreach);
 938
 939                         return (IPPROTO_DONE);
 940                 }
 941         }
 942
 943         tcp_input(m, *offp);
 944         return (IPPROTO_DONE);
 945 }
 946 #endif
 947
 948 /* Depending on the usage of mbuf space in the system, this function
 949  * will return true or false. This is used to determine if a socket
 950  * buffer can take more memory from the system for auto-tuning or not.
 951  */
 952 u_int8_t
 953 tcp_cansbgrow(struct sockbuf *sb)
 954 {
 955         /* Calculate the host level space limit in terms of MSIZE buffers.
 956          * We can use a maximum of half of the available mbuf space for
 957          * socket buffers.
 958          */
 959         u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
 960
 961         /* Calculate per sb limit in terms of bytes. We optimize this limit
 962          * for upto 16 socket buffers.
 963          */
 964
 965         u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
 966
 967         if ((total_sbmb_cnt < mblim) &&
 968                 (sb->sb_hiwat < sbspacelim)) {
 969                 return(1);
 970         }
 971         return(0);
 972 }
 973
 974 static void
 975 tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
 976         u_int32_t newsize, u_int32_t idealsize)
 977 {
 978
 979         /* newsize should not exceed max */
 980         newsize = min(newsize, tcp_autorcvbuf_max);
 981
 982         /* The receive window scale negotiated at the
 983          * beginning of the connection will also set a
 984          * limit on the socket buffer size
 985          */
 986         newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
 987
 988         /* Set new socket buffer size */
 989         if (newsize > sbrcv->sb_hiwat &&
 990                 (sbreserve(sbrcv, newsize) == 1)) {
 991                 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
 992                         (idealsize != 0) ? idealsize : newsize),
 993                         tcp_autorcvbuf_max);
 994
 995                 /* Again check the limit set by the advertised
 996                  * window scale
 997                  */
 998                 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
 999                         TCP_MAXWIN << tp->rcv_scale);
1000         }
1001 }
1002
1003 /*
1004  * This function is used to grow  a receive socket buffer. It
1005  * will take into account system-level memory usage and the
1006  * bandwidth available on the link to make a decision.
1007  */
1008 static void
1009 tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
1010         struct tcpopt *to, u_int32_t pktlen) {
1011
1012         /*
1013          * Do not grow the receive socket buffer if
1014          * - auto resizing is disabled, globally or on this socket
1015          * - the high water mark has already reached the maximum
1016          * - the stream is in background and receive side is being
1017          * throttled
1018          * - if there are segments in reassembly queue indicating loss,
1019          * do not need to increase recv window during recovery as more
1020          * data is not going to be sent.
1021          */
1022         if (tcp_do_autorcvbuf == 0 ||
1023                 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1024                 tcp_cansbgrow(sbrcv) == 0 ||
1025                 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
1026                 (tp->t_flagsext & TF_RECV_THROTTLE) ||
1027                 !LIST_EMPTY(&tp->t_segq)) {
1028                 /* Can not resize the socket buffer, just return */
1029                 goto out;
1030         }
1031
1032         if (TSTMP_GT(tcp_now,
1033                 tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
1034                 /* If there has been an idle period in the
1035                  * connection, just restart the measurement
1036                  */
1037                 goto out;
1038         }
1039
1040         if (!TSTMP_SUPPORTED(tp)) {
1041                 /*
1042                  * Timestamp option is not supported on this connection.
1043                  * If the connection reached a state to indicate that
1044                  * the receive socket buffer needs to grow, increase
1045                  * the high water mark.
1046                  */
1047                 if (TSTMP_GEQ(tcp_now,
1048                         tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
1049                         if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
1050                                 tcp_sbrcv_reserve(tp, sbrcv,
1051                                         tcp_autorcvbuf_max, 0);
1052                         }
1053                         goto out;
1054                 } else {
1055                         tp->rfbuf_cnt += pktlen;
1056                         return;
1057                 }
1058         } else if (to->to_tsecr != 0) {
1059                 /* If the timestamp shows that one RTT has
1060                  * completed, we can stop counting the
1061                  * bytes. Here we consider increasing
1062                  * the socket buffer if it fits the following
1063                  * criteria:
1064                  * 1. the bandwidth measured in last rtt, is more
1065                  * than half of sb_hiwat, this will help to scale the
1066                  * buffer according to the bandwidth on the link.
1067                  * 2. the space left in sbrcv is less than
1068                  * one forth of the bandwidth measured in last rtt, this
1069                  * will help to accommodate an application reading slowly.
1070                  */
1071                 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
1072                         if ((tp->rfbuf_cnt > (sbrcv->sb_hiwat -
1073                                 (sbrcv->sb_hiwat >> tcp_rbuf_hiwat_shift)) ||
1074                                 (sbrcv->sb_hiwat - sbrcv->sb_cc) <
1075                                 (tp->rfbuf_cnt >> tcp_rbuf_win_shift))) {
1076                                 u_int32_t rcvbuf_inc;
1077                                 /*
1078                                  * Increment the receive window by a multiple of
1079                                  * maximum sized segments. This will prevent a
1080                                  * connection from sending smaller segments on
1081                                  * wire if it is limited by the receive window.
1082                                  *
1083                                  * Set the ideal size based on current bandwidth
1084                                  * measurements. We set the ideal size on receive
1085                                  * socket buffer to be twice the bandwidth delay
1086                                  * product.
1087                                  */
1088                                 rcvbuf_inc = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1089                                 tcp_sbrcv_reserve(tp, sbrcv,
1090                                         sbrcv->sb_hiwat + rcvbuf_inc,
1091                                         (tp->rfbuf_cnt * 2));
1092                         }
1093                         goto out;
1094                 } else {
1095                         tp->rfbuf_cnt += pktlen;
1096                         return;
1097                 }
1098         }
1099 out:
1100         /* Restart the measurement */
1101         tp->rfbuf_ts = 0;
1102         tp->rfbuf_cnt = 0;
1103         return;
1104 }
1105
1106 /* This function will trim the excess space added to the socket buffer
1107  * to help a slow-reading app. The ideal-size of a socket buffer depends
1108  * on the link bandwidth or it is set by an application and we aim to
1109  * reach that size.
1110  */
1111 void
1112 tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
1113         if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1114                 sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1115                 int32_t trim;
1116                 /* compute the difference between ideal and current sizes */
1117                 u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1118
1119                 /* Compute the maximum advertised window for
1120                  * this connection.
1121                  */
1122                 u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1123
1124                 /* How much can we trim the receive socket buffer?
1125                  * 1. it can not be trimmed beyond the max rcv win advertised
1126                  * 2. if possible, leave 1/16 of bandwidth*delay to
1127                  * avoid closing the win completely
1128                  */
1129                 u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1130
1131                 /* Sometimes leave can be zero, in that case leave at least
1132                  * a few segments worth of space.
1133                  */
1134                 if (leave == 0)
1135                         leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1136
1137                 trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
1138                 trim = imin(trim, (int32_t)diff);
1139
1140                 if (trim > 0)
1141                         sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1142         }
1143 }
1144
1145 /* We may need to trim the send socket buffer size for two reasons:
1146  * 1. if the rtt seen on the connection is climbing up, we do not
1147  * want to fill the buffers any more.
1148  * 2. if the congestion win on the socket backed off, there is no need
1149  * to hold more mbufs for that connection than what the cwnd will allow.
1150  */
1151 void
1152 tcp_sbsnd_trim(struct sockbuf *sbsnd) {
1153         if (tcp_do_autosendbuf == 1 &&
1154                 ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1155                         (SB_AUTOSIZE | SB_TRIM)) &&
1156                 (sbsnd->sb_idealsize > 0) &&
1157                 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1158                 u_int32_t trim = 0;
1159                 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1160                         trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1161                 } else {
1162                         trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1163                 }
1164                 sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1165         }
1166         if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
1167                 sbsnd->sb_flags &= ~(SB_TRIM);
1168 }
1169
1170 /*
1171  * If timestamp option was not negotiated on this connection
1172  * and this connection is on the receiving side of a stream
1173  * then we can not measure the delay on the link accurately.
1174  * Instead of enabling automatic receive socket buffer
1175  * resizing, just give more space to the receive socket buffer.
1176  */
1177 static inline void
1178 tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
1179         struct socket *so = tp->t_inpcb->inp_socket;
1180         u_int32_t newsize = 2 * tcp_recvspace;
1181         struct sockbuf *sbrcv = &so->so_rcv;
1182
1183         if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1184                 (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1185                 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
1186                 tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
1187         }
1188 }
1189
1190 /* A receiver will evaluate the flow of packets on a connection
1191  * to see if it can reduce ack traffic. The receiver will start
1192  * stretching acks if all of the following conditions are met:
1193  * 1. tcp_delack_enabled is set to 3
1194  * 2. If the bytes received in the last 100ms is greater than a threshold
1195  *      defined by maxseg_unacked
1196  * 3. If the connection has not been idle for tcp_maxrcvidle period.
1197  * 4. If the connection has seen enough packets to let the slow-start
1198  *      finish after connection establishment or after some packet loss.
1199  *
1200  * The receiver will stop stretching acks if there is congestion/reordering
1201  * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1202  * timer fires while stretching acks, it means that the packet flow has gone
1203  * below the threshold defined by maxseg_unacked and the receiver will stop
1204  * stretching acks. The receiver gets no indication when slow-start is completed
1205  * or when the connection reaches an idle state. That is why we use
1206  * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1207  * state.
1208  */
1209 static inline int
1210 tcp_stretch_ack_enable(struct tcpcb *tp)
1211 {
1212         if (!(tp->t_flagsext & TF_NOSTRETCHACK) &&
1213                 tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
1214                 TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
1215                 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
1216                 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1217                 return(1);
1218         }
1219
1220         return(0);
1221 }
1222
1223 /* Reset the state related to stretch-ack algorithm. This will make
1224  * the receiver generate an ack every other packet. The receiver
1225  * will start re-evaluating the rate at which packets come to decide
1226  * if it can benefit by lowering the ack traffic.
1227  */
1228 void
1229 tcp_reset_stretch_ack(struct tcpcb *tp)
1230 {
1231         tp->t_flags &= ~(TF_STRETCHACK);
1232         tp->rcv_by_unackwin = 0;
1233         tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1234 }
1235
1236 /*
1237  * The last packet was a retransmission, check if this ack
1238  * indicates that the retransmission was spurious.
1239  *
1240  * If the connection supports timestamps, we could use it to
1241  * detect if the last retransmit was not needed. Otherwise,
1242  * we check if the ACK arrived within RTT/2 window, then it
1243  * was a mistake to do the retransmit in the first place.
1244  *
1245  * This function will return 1 if it is a spurious retransmit,
1246  * 0 otherwise.
1247  */
1248 static int
1249 tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcpopt *to)
1250 {
1251         int32_t tdiff, bad_rexmt_win;
1252         tdiff = (int32_t)(tcp_now - tp->t_rxtstart);
1253         bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
1254
1255         if (TSTMP_SUPPORTED(tp) && tp->t_rxtstart > 0 &&
1256                 (to->to_flags & TOF_TS) != 0 &&
1257                 to->to_tsecr != 0 &&
1258                 TSTMP_LT(to->to_tsecr, tp->t_rxtstart)) {
1259                 return (1);
1260         } else if (tp->t_rxtshift == 1 &&
1261                 tdiff < bad_rexmt_win) {
1262                 return(1);
1263         }
1264         return(0);
1265 }
1266
1267
1268 /*
1269  * Restore congestion window state if a spurious timeout
1270  * was detected.
1271  */
1272 static void
1273 tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1274 {
1275         if (TSTMP_SUPPORTED(tp)) {
1276                 u_int32_t fsize, acked;
1277                 fsize = tp->snd_max - th->th_ack;
1278                 acked = BYTES_ACKED(th, tp);
1279
1280                 /*
1281                  * Implement bad retransmit recovery as
1282                  * described in RFC 4015.
1283                  */
1284                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1285
1286                 /* Initialize cwnd to the initial window */
1287                 if (CC_ALGO(tp)->cwnd_init != NULL)
1288                         CC_ALGO(tp)->cwnd_init(tp);
1289
1290                 tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1291
1292         } else {
1293                 tp->snd_cwnd = tp->snd_cwnd_prev;
1294                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1295                 if (tp->t_flags & TF_WASFRECOVERY)
1296                         ENTER_FASTRECOVERY(tp);
1297         }
1298         tp->snd_recover = tp->snd_recover_prev;
1299         tp->snd_nxt = tp->snd_max;
1300         tp->t_rxtshift = 0;
1301         tp->t_rxtstart = 0;
1302
1303         /* Fix send socket buffer to reflect the change in cwnd */
1304         tcp_bad_rexmt_fix_sndbuf(tp);
1305
1306         /*
1307          * This RTT might reflect the extra delay induced
1308          * by the network. Skip using this sample for RTO
1309          * calculation and mark the connection so we can
1310          * recompute RTT when the next eligible sample is
1311          * found.
1312          */
1313         tp->t_flagsext |= TF_RECOMPUTE_RTT;
1314         tp->t_badrexmt_time = tcp_now;
1315         tp->t_rtttime = 0;
1316 }
1317
1318 void
1319 tcp_input(m, off0)
1320         struct mbuf *m;
1321         int off0;
1322 {
1323         register struct tcphdr *th;
1324         register struct ip *ip = NULL;
1325         register struct inpcb *inp;
1326         u_char *optp = NULL;
1327         int optlen = 0;
1328         int tlen, off;
1329         int drop_hdrlen;
1330         register struct tcpcb *tp = 0;
1331         register int thflags;
1332         struct socket *so = 0;
1333         int todrop, acked, ourfinisacked, needoutput = 0;
1334         struct in_addr laddr;
1335 #if INET6
1336         struct in6_addr laddr6;
1337 #endif
1338         int dropsocket = 0;
1339         int iss = 0, nosock = 0;
1340         u_int32_t tiwin, sack_bytes_acked = 0;
1341         struct tcpopt to;               /* options in this segment */
1342         struct sockaddr_in *next_hop = NULL;
1343 #if TCPDEBUG
1344         short ostate = 0;
1345 #endif
1346         struct m_tag *fwd_tag;
1347         u_char ip_ecn = IPTOS_ECN_NOTECT;
1348         unsigned int ifscope, nocell = 0;
1349         uint8_t isconnected, isdisconnected;
1350         struct ifnet *ifp = m->m_pkthdr.rcvif;
1351         int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
1352         int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
1353         int turnoff_lro = 0, win;
1354 #if MPTCP
1355         struct mptcb *mp_tp = NULL;
1356         uint16_t mptcp_csum = 0;
1357 #endif /* MPTCP */
1358         boolean_t cell = IFNET_IS_CELLULAR(ifp);
1359         boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
1360
1361 #define TCP_INC_VAR(stat, npkts) do {                   \
1362                 stat += npkts;                          \
1363 } while (0)
1364
1365         TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
1366
1367         /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
1368         if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1369                 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1370                     KERNEL_TAG_TYPE_IPFORWARD, NULL);
1371         } else {
1372                 fwd_tag = NULL;
1373         }
1374         if (fwd_tag != NULL) {
1375                 struct ip_fwd_tag *ipfwd_tag =
1376                         (struct ip_fwd_tag *)(fwd_tag+1);
1377
1378                 next_hop = ipfwd_tag->next_hop;
1379                 m_tag_delete(m, fwd_tag);
1380         }
1381
1382 #if INET6
1383         struct ip6_hdr *ip6 = NULL;
1384         int isipv6;
1385 #endif /* INET6 */
1386         int rstreason; /* For badport_bandlim accounting purposes */
1387         struct proc *proc0=current_proc();
1388
1389         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
1390
1391 #if INET6
1392         isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1393 #endif
1394         bzero((char *)&to, sizeof(to));
1395
1396 #if INET6
1397         if (isipv6) {
1398                 /*
1399                  * Expect 32-bit aligned data pointer on
1400                  * strict-align platforms
1401                  */
1402                 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1403
1404                 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1405                 ip6 = mtod(m, struct ip6_hdr *);
1406                 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
1407                 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1408
1409                 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
1410                         goto dropnosock;
1411
1412                 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1413                      (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1414                      th->th_seq, th->th_ack, th->th_win);
1415                 /*
1416                  * Be proactive about unspecified IPv6 address in source.
1417                  * As we use all-zero to indicate unbounded/unconnected pcb,
1418                  * unspecified IPv6 address can be used to confuse us.
1419                  *
1420                  * Note that packets with unspecified IPv6 destination is
1421                  * already dropped in ip6_input.
1422                  */
1423                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1424                         /* XXX stat */
1425                         IF_TCP_STATINC(ifp, unspecv6);
1426                         goto dropnosock;
1427                 }
1428                 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1429                         struct ip6_hdr *, ip6, struct tcpcb *, NULL,
1430                         struct tcphdr *, th);
1431
1432                 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
1433         } else
1434 #endif /* INET6 */
1435         {
1436         /*
1437          * Get IP and TCP header together in first mbuf.
1438          * Note: IP leaves IP header in first mbuf.
1439          */
1440         if (off0 > sizeof (struct ip)) {
1441                 ip_stripoptions(m, (struct mbuf *)0);
1442                 off0 = sizeof(struct ip);
1443         }
1444         if (m->m_len < sizeof (struct tcpiphdr)) {
1445                 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
1446                         tcpstat.tcps_rcvshort++;
1447                         return;
1448                 }
1449         }
1450
1451         /* Expect 32-bit aligned data pointer on strict-align platforms */
1452         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1453
1454         ip = mtod(m, struct ip *);
1455         th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1456         tlen = ip->ip_len;
1457
1458         if (tcp_input_checksum(AF_INET, m, th, off0, tlen))
1459                 goto dropnosock;
1460
1461 #if INET6
1462         /* Re-initialization for later version check */
1463         ip->ip_v = IPVERSION;
1464 #endif
1465         ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
1466
1467         DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1468                 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
1469
1470         KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1471                 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1472                   th->th_seq, th->th_ack, th->th_win);
1473
1474         }
1475
1476         /*
1477          * Check that TCP offset makes sense,
1478          * pull out TCP options and adjust length.              XXX
1479          */
1480         off = th->th_off << 2;
1481         if (off < sizeof (struct tcphdr) || off > tlen) {
1482                 tcpstat.tcps_rcvbadoff++;
1483                 IF_TCP_STATINC(ifp, badformat);
1484                 goto dropnosock;
1485         }
1486         tlen -= off;    /* tlen is used instead of ti->ti_len */
1487         if (off > sizeof (struct tcphdr)) {
1488 #if INET6
1489                 if (isipv6) {
1490                         IP6_EXTHDR_CHECK(m, off0, off, return);
1491                         ip6 = mtod(m, struct ip6_hdr *);
1492                         th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1493                 } else
1494 #endif /* INET6 */
1495                 {
1496                         if (m->m_len < sizeof(struct ip) + off) {
1497                                 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1498                                         tcpstat.tcps_rcvshort++;
1499                                         return;
1500                                 }
1501                                 ip = mtod(m, struct ip *);
1502                                 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1503                         }
1504                 }
1505                 optlen = off - sizeof (struct tcphdr);
1506                 optp = (u_char *)(th + 1);
1507                 /*
1508                  * Do quick retrieval of timestamp options ("options
1509                  * prediction?").  If timestamp is the only option and it's
1510                  * formatted as recommended in RFC 1323 appendix A, we
1511                  * quickly get the values now and not bother calling
1512                  * tcp_dooptions(), etc.
1513                  */
1514                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1515                         (optlen > TCPOLEN_TSTAMP_APPA &&
1516                         optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1517                         *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1518                         (th->th_flags & TH_SYN) == 0) {
1519                         to.to_flags |= TOF_TS;
1520                         to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
1521                         to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
1522                         optp = NULL;    /* we've parsed the options */
1523                 }
1524         }
1525         thflags = th->th_flags;
1526
1527 #if TCP_DROP_SYNFIN
1528         /*
1529          * If the drop_synfin option is enabled, drop all packets with
1530          * both the SYN and FIN bits set. This prevents e.g. nmap from
1531          * identifying the TCP/IP stack.
1532          *
1533          * This is a violation of the TCP specification.
1534          */
1535         if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) {
1536                 IF_TCP_STATINC(ifp, synfin);
1537                 goto dropnosock;
1538         }
1539 #endif
1540
1541         /*
1542          * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
1543          * until after ip6_savecontrol() is called and before other functions
1544          * which don't want those proto headers.
1545          * Because ip6_savecontrol() is going to parse the mbuf to
1546          * search for data to be passed up to user-land, it wants mbuf
1547          * parameters to be unchanged.
1548          */
1549         drop_hdrlen = off0 + off;
1550
1551         /* Since this is an entry point for input processing of tcp packets, we
1552          * can update the tcp clock here.
1553          */
1554         calculate_tcp_clock();
1555
1556         /*
1557          * Record the interface where this segment arrived on; this does not
1558          * affect normal data output (for non-detached TCP) as it provides a
1559          * hint about which route and interface to use for sending in the
1560          * absence of a PCB, when scoped routing (and thus source interface
1561          * selection) are enabled.
1562          */
1563         if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL)
1564                 ifscope = IFSCOPE_NONE;
1565         else
1566                 ifscope = m->m_pkthdr.rcvif->if_index;
1567
1568         /*
1569          * Convert TCP protocol specific fields to host format.
1570          */
1571
1572 #if BYTE_ORDER != BIG_ENDIAN
1573         NTOHL(th->th_seq);
1574         NTOHL(th->th_ack);
1575         NTOHS(th->th_win);
1576         NTOHS(th->th_urp);
1577 #endif
1578
1579         /*
1580          * Locate pcb for segment.
1581          */
1582 findpcb:
1583
1584         isconnected = FALSE;
1585         isdisconnected = FALSE;
1586
1587 #if IPFIREWALL_FORWARD
1588         if (next_hop != NULL
1589 #if INET6
1590             && isipv6 == 0 /* IPv6 support is not yet */
1591 #endif /* INET6 */
1592             ) {
1593                 /*
1594                  * Diverted. Pretend to be the destination.
1595                  * already got one like this?
1596                  */
1597                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1598                         ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
1599                 if (!inp) {
1600                         /*
1601                          * No, then it's new. Try to find the ambushing socket
1602                          */
1603                         if (!next_hop->sin_port) {
1604                                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
1605                                     th->th_sport, next_hop->sin_addr,
1606                                     th->th_dport, 1, m->m_pkthdr.rcvif);
1607                         } else {
1608                                 inp = in_pcblookup_hash(&tcbinfo,
1609                                     ip->ip_src, th->th_sport,
1610                                     next_hop->sin_addr,
1611                                     ntohs(next_hop->sin_port), 1,
1612                                     m->m_pkthdr.rcvif);
1613                         }
1614                 }
1615         } else
1616 #endif  /* IPFIREWALL_FORWARD */
1617       {
1618 #if INET6
1619         if (isipv6)
1620                 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
1621                                          &ip6->ip6_dst, th->th_dport, 1,
1622                                          m->m_pkthdr.rcvif);
1623         else
1624 #endif /* INET6 */
1625         inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1626             ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
1627       }
1628
1629         /*
1630          * Use the interface scope information from the PCB for outbound
1631          * segments.  If the PCB isn't present and if scoped routing is
1632          * enabled, tcp_respond will use the scope of the interface where
1633          * the segment arrived on.
1634          */
1635         if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
1636                 ifscope = inp->inp_boundifp->if_index;
1637
1638         /*
1639          * If the PCB is present and the socket isn't allowed to use
1640          * the cellular interface, indicate it as such for tcp_respond.
1641          */
1642         if (inp != NULL && (inp->inp_flags & INP_NO_IFT_CELLULAR))
1643                 nocell = 1;
1644
1645 #if IPSEC
1646         if (ipsec_bypass == 0)  {
1647 #if INET6
1648                 if (isipv6) {
1649                         if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
1650                                 IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio);
1651                                 if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
1652                                         inp = NULL;     // pretend we didn't find it
1653
1654                                 IF_TCP_STATINC(ifp, badformatipsec);
1655
1656                                 goto dropnosock;
1657                         }
1658                 } else
1659 #endif /* INET6 */
1660                         if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
1661                                 IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
1662                                 if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
1663                                         inp = NULL;     // pretend we didn't find it
1664
1665                                 IF_TCP_STATINC(ifp, badformatipsec);
1666
1667                                 goto dropnosock;
1668                         }
1669         }
1670 #endif /*IPSEC*/
1671
1672         /*
1673          * If the state is CLOSED (i.e., TCB does not exist) then
1674          * all data in the incoming segment is discarded.
1675          * If the TCB exists but is in CLOSED state, it is embryonic,
1676          * but should either do a listen or a connect soon.
1677          */
1678         if (inp == NULL) {
1679                 if (log_in_vain) {
1680 #if INET6
1681                         char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
1682 #else /* INET6 */
1683                         char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
1684 #endif /* INET6 */
1685
1686 #if INET6
1687                         if (isipv6) {
1688                                 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
1689                                 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
1690                         } else
1691 #endif
1692                         {
1693                                 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
1694                                 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
1695                         }
1696                         switch (log_in_vain) {
1697                         case 1:
1698                                 if(thflags & TH_SYN)
1699                                         log(LOG_INFO,
1700                                                 "Connection attempt to TCP %s:%d from %s:%d\n",
1701                                                 dbuf, ntohs(th->th_dport),
1702                                                 sbuf,
1703                                                 ntohs(th->th_sport));
1704                                 break;
1705                         case 2:
1706                                 log(LOG_INFO,
1707                                         "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
1708                                         dbuf, ntohs(th->th_dport), sbuf,
1709                                         ntohs(th->th_sport), thflags);
1710                                 break;
1711                         case 3:
1712                         case 4:
1713                                 if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
1714                                         !(m->m_flags & (M_BCAST | M_MCAST)) &&
1715 #if INET6
1716                                         ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
1717                                          (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
1718 #else
1719                                         ip->ip_dst.s_addr != ip->ip_src.s_addr
1720 #endif
1721                                          )
1722                                         log_in_vain_log((LOG_INFO,
1723                                                 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
1724                                                 dbuf, ntohs(th->th_dport),
1725                                                 sbuf,
1726                                                 ntohs(th->th_sport)));
1727                                 break;
1728                         default:
1729                                 break;
1730                         }
1731                 }
1732                 if (blackhole) {
1733                         if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
1734
1735                                 switch (blackhole) {
1736                                 case 1:
1737                                         if (thflags & TH_SYN)
1738                                                 goto dropnosock;
1739                                         break;
1740                                 case 2:
1741                                         goto dropnosock;
1742                                 default:
1743                                         goto dropnosock;
1744                                 }
1745                 }
1746                 rstreason = BANDLIM_RST_CLOSEDPORT;
1747                 IF_TCP_STATINC(ifp, noconnnolist);
1748                 goto dropwithresetnosock;
1749         }
1750         so = inp->inp_socket;
1751         if (so == NULL) {
1752                 /* This case shouldn't happen  as the socket shouldn't be null
1753                  * if inp_state isn't set to INPCB_STATE_DEAD
1754                  * But just in case, we pretend we didn't find the socket if we hit this case
1755                  * as this isn't cause for a panic (the socket might be leaked however)...
1756                  */
1757                 inp = NULL;
1758 #if TEMPDEBUG
1759                 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
1760 #endif
1761                 goto dropnosock;
1762         }
1763
1764         tcp_lock(so, 1, 0);
1765         if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1766                 tcp_unlock(so, 1, (void *)2);
1767                 inp = NULL;     // pretend we didn't find it
1768                 goto dropnosock;
1769         }
1770
1771         tp = intotcpcb(inp);
1772         if (tp == 0) {
1773                 rstreason = BANDLIM_RST_CLOSEDPORT;
1774                 IF_TCP_STATINC(ifp, noconnlist);
1775                 goto dropwithreset;
1776         }
1777         if (tp->t_state == TCPS_CLOSED)
1778                 goto drop;
1779
1780         /* Unscale the window into a 32-bit value. */
1781         if ((thflags & TH_SYN) == 0)
1782                 tiwin = th->th_win << tp->snd_scale;
1783         else
1784                 tiwin = th->th_win;
1785
1786 #if CONFIG_MACF_NET
1787         if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
1788                 goto drop;
1789 #endif
1790
1791         /* Avoid processing packets while closing a listen socket */
1792         if (tp->t_state == TCPS_LISTEN &&
1793                 (so->so_options & SO_ACCEPTCONN) == 0)
1794                 goto drop;
1795
1796         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
1797 #if TCPDEBUG
1798                 if (so->so_options & SO_DEBUG) {
1799                         ostate = tp->t_state;
1800 #if INET6
1801                         if (isipv6)
1802                                 bcopy((char *)ip6, (char *)tcp_saveipgen,
1803                                       sizeof(*ip6));
1804                         else
1805 #endif /* INET6 */
1806                         bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
1807                         tcp_savetcp = *th;
1808                 }
1809 #endif
1810                 if (so->so_options & SO_ACCEPTCONN) {
1811                     register struct tcpcb *tp0 = tp;
1812                         struct socket *so2;
1813                         struct socket *oso;
1814                         struct sockaddr_storage from;
1815 #if INET6
1816                         struct inpcb *oinp = sotoinpcb(so);
1817 #endif /* INET6 */
1818                         struct ifnet *head_ifscope;
1819                         unsigned int head_nocell, head_recvanyif;
1820
1821                         /* Get listener's bound-to-interface, if any */
1822                         head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
1823                             inp->inp_boundifp : NULL;
1824                         /* Get listener's no-cellular information, if any */
1825                         head_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
1826                         /* Get listener's recv-any-interface, if any */
1827                         head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
1828
1829                         /*
1830                          * If the state is LISTEN then ignore segment if it contains an RST.
1831                          * If the segment contains an ACK then it is bad and send a RST.
1832                          * If it does not contain a SYN then it is not interesting; drop it.
1833                          * If it is from this socket, drop it, it must be forged.
1834                          */
1835                         if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1836                                 IF_TCP_STATINC(ifp, listbadsyn);
1837
1838                                 if (thflags & TH_RST) {
1839                                         goto drop;
1840                                 }
1841                                 if (thflags & TH_ACK) {
1842                                         tp = NULL;
1843                                         tcpstat.tcps_badsyn++;
1844                                         rstreason = BANDLIM_RST_OPENPORT;
1845                                         goto dropwithreset;
1846                                 }
1847
1848                                 /* We come here if there is no SYN set */
1849                                 tcpstat.tcps_badsyn++;
1850                                 goto drop;
1851                         }
1852                         KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
1853                         if (th->th_dport == th->th_sport) {
1854 #if INET6
1855                                 if (isipv6) {
1856                                         if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1857                                                        &ip6->ip6_src))
1858                                                 goto drop;
1859                                 } else
1860 #endif /* INET6 */
1861                                         if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1862                                                 goto drop;
1863                         }
1864                         /*
1865                          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1866                          * in_broadcast() should never return true on a received
1867                          * packet with M_BCAST not set.
1868                          *
1869                          * Packets with a multicast source address should also
1870                          * be discarded.
1871                          */
1872                         if (m->m_flags & (M_BCAST|M_MCAST))
1873                                 goto drop;
1874 #if INET6
1875                         if (isipv6) {
1876                                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1877                                         IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1878                                         goto drop;
1879                         } else
1880 #endif
1881                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1882                                 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1883                                 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
1884                                 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1885                                 goto drop;
1886
1887
1888 #if INET6
1889                         /*
1890                          * If deprecated address is forbidden,
1891                          * we do not accept SYN to deprecated interface
1892                          * address to prevent any new inbound connection from
1893                          * getting established.
1894                          * When we do not accept SYN, we send a TCP RST,
1895                          * with deprecated source address (instead of dropping
1896                          * it).  We compromise it as it is much better for peer
1897                          * to send a RST, and RST will be the final packet
1898                          * for the exchange.
1899                          *
1900                          * If we do not forbid deprecated addresses, we accept
1901                          * the SYN packet.  RFC 4862 forbids dropping SYN in
1902                          * this case.
1903                          */
1904                         if (isipv6 && !ip6_use_deprecated) {
1905                                 uint32_t ia6_flags;
1906
1907                                 if (ip6_getdstifaddr_info(m, NULL,
1908                                     &ia6_flags) == 0) {
1909                                         if (ia6_flags & IN6_IFF_DEPRECATED) {
1910                                                 tp = NULL;
1911                                                 rstreason = BANDLIM_RST_OPENPORT;
1912                                                 IF_TCP_STATINC(ifp, deprecate6);
1913                                                 goto dropwithreset;
1914                                         }
1915                                 }
1916                         }
1917 #endif
1918                         if (so->so_filt) {
1919 #if INET6
1920                                 if (isipv6) {
1921                                         struct sockaddr_in6     *sin6 = (struct sockaddr_in6*)&from;
1922
1923                                         sin6->sin6_len = sizeof(*sin6);
1924                                         sin6->sin6_family = AF_INET6;
1925                                         sin6->sin6_port = th->th_sport;
1926                                         sin6->sin6_flowinfo = 0;
1927                                         sin6->sin6_addr = ip6->ip6_src;
1928                                         sin6->sin6_scope_id = 0;
1929                                 }
1930                                 else
1931 #endif
1932                                 {
1933                                         struct sockaddr_in *sin = (struct sockaddr_in*)&from;
1934
1935                                         sin->sin_len = sizeof(*sin);
1936                                         sin->sin_family = AF_INET;
1937                                         sin->sin_port = th->th_sport;
1938                                         sin->sin_addr = ip->ip_src;
1939                                 }
1940                                 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
1941                         } else {
1942                                 so2 = sonewconn(so, 0, NULL);
1943                         }
1944                         if (so2 == 0) {
1945                                 tcpstat.tcps_listendrop++;
1946                                 if (tcp_dropdropablreq(so)) {
1947                                         if (so->so_filt)
1948                                                 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
1949                                         else
1950                                                 so2 = sonewconn(so, 0, NULL);
1951                                 }
1952                                 if (!so2)
1953                                         goto drop;
1954                         }
1955
1956                         /* Point "inp" and "tp" in tandem to new socket */
1957                         inp = (struct inpcb *)so2->so_pcb;
1958                         tp = intotcpcb(inp);
1959
1960                         oso = so;
1961                         tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
1962
1963                         so = so2;
1964                         tcp_lock(so, 1, 0);
1965                         /*
1966                          * Mark socket as temporary until we're
1967                          * committed to keeping it.  The code at
1968                          * ``drop'' and ``dropwithreset'' check the
1969                          * flag dropsocket to see if the temporary
1970                          * socket created here should be discarded.
1971                          * We mark the socket as discardable until
1972                          * we're committed to it below in TCPS_LISTEN.
1973                          * There are some error conditions in which we
1974                          * have to drop the temporary socket.
1975                          */
1976                         dropsocket++;
1977                         /*
1978                          * Inherit INP_BOUND_IF from listener; testing if
1979                          * head_ifscope is non-NULL is sufficient, since it
1980                          * can only be set to a non-zero value earlier if
1981                          * the listener has such a flag set.
1982                          */
1983                         if (head_ifscope != NULL) {
1984                                 inp->inp_flags |= INP_BOUND_IF;
1985                                 inp->inp_boundifp = head_ifscope;
1986                         } else {
1987                                 inp->inp_flags &= ~INP_BOUND_IF;
1988                         }
1989                         /*
1990                          * Inherit INP_NO_IFT_CELLULAR from listener.
1991                          */
1992                         if (head_nocell) {
1993                                 inp->inp_flags |= INP_NO_IFT_CELLULAR;
1994                         }
1995                         /*
1996                          * Inherit {IN,IN6}_RECV_ANYIF from listener.
1997                          */
1998                         if (head_recvanyif)
1999                                 inp->inp_flags |= INP_RECV_ANYIF;
2000                         else
2001                                 inp->inp_flags &= ~INP_RECV_ANYIF;
2002 #if INET6
2003                         if (isipv6)
2004                                 inp->in6p_laddr = ip6->ip6_dst;
2005                         else {
2006                                 inp->inp_vflag &= ~INP_IPV6;
2007                                 inp->inp_vflag |= INP_IPV4;
2008 #endif /* INET6 */
2009                                 inp->inp_laddr = ip->ip_dst;
2010 #if INET6
2011                         }
2012 #endif /* INET6 */
2013                         inp->inp_lport = th->th_dport;
2014                         if (in_pcbinshash(inp, 0) != 0) {
2015                                 /*
2016                                  * Undo the assignments above if we failed to
2017                                  * put the PCB on the hash lists.
2018                                  */
2019 #if INET6
2020                                 if (isipv6)
2021                                         inp->in6p_laddr = in6addr_any;
2022                                 else
2023 #endif /* INET6 */
2024                                         inp->inp_laddr.s_addr = INADDR_ANY;
2025                                 inp->inp_lport = 0;
2026                                 tcp_lock(oso, 0, 0);    /* release ref on parent */
2027                                 tcp_unlock(oso, 1, 0);
2028                                 goto drop;
2029                         }
2030 #if INET6
2031                         if (isipv6) {
2032                                 /*
2033                                  * Inherit socket options from the listening
2034                                  * socket.
2035                                  * Note that in6p_inputopts are not (even
2036                                  * should not be) copied, since it stores
2037                                  * previously received options and is used to
2038                                  * detect if each new option is different than
2039                                  * the previous one and hence should be passed
2040                                  * to a user.
2041                                  * If we copied in6p_inputopts, a user would
2042                                  * not be able to receive options just after
2043                                  * calling the accept system call.
2044                                  */
2045                                 inp->inp_flags |=
2046                                         oinp->inp_flags & INP_CONTROLOPTS;
2047                                 if (oinp->in6p_outputopts)
2048                                         inp->in6p_outputopts =
2049                                                 ip6_copypktopts(oinp->in6p_outputopts,
2050                                                                 M_NOWAIT);
2051                         } else
2052 #endif /* INET6 */
2053                                 inp->inp_options = ip_srcroute();
2054                         tcp_lock(oso, 0, 0);
2055 #if IPSEC
2056                         /* copy old policy into new socket's */
2057                         if (sotoinpcb(oso)->inp_sp)
2058                         {
2059                                 int error = 0;
2060                                 /* Is it a security hole here to silently fail to copy the policy? */
2061                                 if (inp->inp_sp != NULL)
2062                                         error = ipsec_init_policy(so, &inp->inp_sp);
2063                                 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
2064                                         printf("tcp_input: could not copy policy\n");
2065                         }
2066 #endif
2067                         /* inherit states from the listener */
2068                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2069                                 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
2070                         tp->t_state = TCPS_LISTEN;
2071                         tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
2072                         tp->t_flagsext |= (tp0->t_flagsext & TF_RXTFINDROP);
2073                         tp->t_keepinit = tp0->t_keepinit;
2074                         tp->t_keepcnt = tp0->t_keepcnt;
2075                         tp->t_keepintvl = tp0->t_keepintvl;
2076                         tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2077                         tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
2078                         tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
2079                         if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0)
2080                                 tp->t_notsent_lowat = tp0->t_notsent_lowat;
2081
2082                         /* now drop the reference on the listener */
2083                         tcp_unlock(oso, 1, 0);
2084
2085                         tcp_set_max_rwinscale(tp, so);
2086
2087                         KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
2088                 }
2089         }
2090         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2091
2092         if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
2093                 /*
2094                  * Evaluate the rate of arrival of packets to see if the
2095                  * receiver can reduce the ack traffic. The algorithm to
2096                  * stretch acks will be enabled if the connection meets
2097                  * certain criteria defined in tcp_stretch_ack_enable function.
2098                  */
2099                 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
2100                         TCP_INC_VAR(tp->rcv_waitforss, nlropkts);
2101                 }
2102                 if (tcp_stretch_ack_enable(tp)) {
2103                         tp->t_flags |= TF_STRETCHACK;
2104                         tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2105                         tp->rcv_waitforss = 0;
2106                 } else {
2107                         tp->t_flags &= ~(TF_STRETCHACK);
2108                 }
2109                 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
2110                         tp->rcv_by_unackwin += (tlen + off);
2111                 } else {
2112                         tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2113                         tp->rcv_by_unackwin = tlen + off;
2114                 }
2115         }
2116
2117         /*
2118          * Keep track of how many bytes were received in the LRO packet
2119          */
2120         if ((pktf_sw_lro_pkt) && (nlropkts > 2))  {
2121                 tp->t_lropktlen += tlen;
2122         }
2123         /*
2124          * Explicit Congestion Notification - Flag that we need to send ECE if
2125          *      + The IP Congestion experienced flag was set.
2126          *      + Socket is in established state
2127          *      + We negotiated ECN in the TCP setup
2128          *      + This isn't a pure ack (tlen > 0)
2129          *      + The data is in the valid window
2130          *
2131          *      TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2132          */
2133         if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2134                 ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)) && tlen > 0 &&
2135                 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2136                 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2137                 tp->ecn_flags |= TE_SENDECE;
2138         }
2139
2140         /*
2141          * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2142          * bother doing extensive checks for state and whatnot.
2143          */
2144         if ((thflags & TH_CWR) == TH_CWR) {
2145                 tp->ecn_flags &= ~TE_SENDECE;
2146         }
2147
2148         /*
2149          * If we received an explicit notification of congestion in
2150          * ip tos ecn bits or by the CWR bit in TCP header flags, reset
2151          * the ack-stretching state.
2152          */
2153         if (tp->t_state == TCPS_ESTABLISHED &&
2154                 (ip_ecn == IPTOS_ECN_CE ||
2155                 (thflags & TH_CWR)))
2156                 tcp_reset_stretch_ack(tp);
2157
2158         /*
2159          * Try to determine if we are receiving a packet after a long time.
2160          * Use our own approximation of idletime to roughly measure remote
2161          * end's idle time. Since slowstart is used after an idle period
2162          * we want to avoid doing LRO if the remote end is not up to date
2163          * on initial window support and starts with 1 or 2 packets as its IW.
2164          */
2165          if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) &&
2166                 ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) {
2167                 turnoff_lro = 1;
2168          }
2169
2170         /* Update rcvtime as a new segment was received on the connection */
2171         tp->t_rcvtime = tcp_now;
2172
2173         /*
2174          * Segment received on connection.
2175          * Reset idle time and keep-alive timer.
2176          */
2177         if (TCPS_HAVEESTABLISHED(tp->t_state))
2178                 tcp_keepalive_reset(tp);
2179
2180         /*
2181          * Process options if not in LISTEN state,
2182          * else do it below (after getting remote address).
2183          */
2184         if (tp->t_state != TCPS_LISTEN && optp) {
2185                 tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
2186 #if MPTCP
2187                 mptcp_csum = mptcp_input_csum(tp, m, drop_hdrlen);
2188                 if (mptcp_csum) {
2189                         tp->t_mpflags |= TMPF_SND_MPFAIL;
2190                         tp->t_mpflags &= ~TMPF_EMBED_DSN;
2191                         mptcp_notify_mpfail(so);
2192                         m_freem(m);
2193                         tcpstat.tcps_mp_badcsum++;
2194                         tcp_check_timer_state(tp);
2195                         tcp_unlock(so, 1, 0);
2196                         KERNEL_DEBUG(DBG_FNC_TCP_INPUT |
2197                             DBG_FUNC_END,0,0,0,0,0);
2198                         return;
2199                 }
2200                 mptcp_insert_rmap(tp, m);
2201 #endif /* MPTCP */
2202         }
2203         if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2204                 if (to.to_flags & TOF_TS) {
2205                         tp->t_flags |= TF_RCVD_TSTMP;
2206                         tp->ts_recent = to.to_tsval;
2207                         tp->ts_recent_age = tcp_now;
2208                 }
2209                 if (to.to_flags & TOF_MSS)
2210                         tcp_mss(tp, to.to_mss, ifscope);
2211                 if (SACK_ENABLED(tp)) {
2212                         if (!(to.to_flags & TOF_SACK))
2213                                 tp->t_flagsext &= ~(TF_SACK_ENABLE);
2214                         else
2215                                 tp->t_flags |= TF_SACK_PERMIT;
2216                 }
2217         }
2218
2219 #if TRAFFIC_MGT
2220         /* Compute inter-packet arrival jitter. According to RFC 3550, inter-packet
2221          * arrival jitter is defined as the difference in packet spacing at the
2222          * receiver compared to the sender for a pair of packets. When two packets
2223          * of maximum segment size come one after the other with consecutive
2224          * sequence numbers, we consider them as packets sent together at the
2225          * sender and use them as a pair to compute inter-packet arrival jitter.
2226          * This metric indicates the delay induced by the network components due
2227          * to queuing in edge/access routers.
2228          */
2229         if (tp->t_state == TCPS_ESTABLISHED &&
2230             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
2231             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2232             ((to.to_flags & TOF_TS) == 0 ||
2233             TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2234             th->th_seq == tp->rcv_nxt &&
2235             LIST_EMPTY(&tp->t_segq)) {
2236                 int seg_size = tlen;
2237                 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
2238                         TCP_INC_VAR(tp->iaj_pktcnt, nlropkts);
2239                 }
2240
2241                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2242                         seg_size = m->m_pkthdr.lro_pktlen;
2243                 }
2244                 if ( tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2245                         (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
2246                         /* State related to inter-arrival jitter is uninitialized
2247                          * or we are trying to find a good first packet to start
2248                          * computing the metric
2249                          */
2250                         update_iaj_state(tp, seg_size, 0);
2251                 } else {
2252                         if (seg_size == tp->iaj_size) {
2253                                 /* Compute inter-arrival jitter taking this packet
2254                                  * as the second packet
2255                                  */
2256                                 if (pktf_sw_lro_pkt)
2257                                         compute_iaj(tp, nlropkts,
2258                                             m->m_pkthdr.lro_elapsed);
2259                                 else
2260                                         compute_iaj(tp, 1, 0);
2261                         }
2262                         if (seg_size  < tp->iaj_size) {
2263                                 /* There is a smaller packet in the stream.
2264                                  * Sometimes the maximum size supported on a path can
2265                                  * change if there is a new link with smaller MTU.
2266                                  * The receiver will not know about this change.
2267                                  * If there are too many packets smaller than iaj_size,
2268                                  * we try to learn the iaj_size again.
2269                                  */
2270                                 TCP_INC_VAR(tp->iaj_small_pkt, nlropkts);
2271                                 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
2272                                         update_iaj_state(tp, seg_size, 1);
2273                                 } else {
2274                                         CLEAR_IAJ_STATE(tp);
2275                                 }
2276                         } else {
2277                                 update_iaj_state(tp, seg_size, 0);
2278                         }
2279                 }
2280         } else {
2281                 CLEAR_IAJ_STATE(tp);
2282         }
2283 #endif /* TRAFFIC_MGT */
2284
2285         /*
2286          * Header prediction: check for the two common cases
2287          * of a uni-directional data xfer.  If the packet has
2288          * no control flags, is in-sequence, the window didn't
2289          * change and we're not retransmitting, it's a
2290          * candidate.  If the length is zero and the ack moved
2291          * forward, we're the sender side of the xfer.  Just
2292          * free the data acked & wake any higher level process
2293          * that was blocked waiting for space.  If the length
2294          * is non-zero and the ack didn't move, we're the
2295          * receiver side.  If we're getting packets in-order
2296          * (the reassembly queue is empty), add the data to
2297          * the socket buffer and note that we need a delayed ack.
2298          * Make sure that the hidden state-flags are also off.
2299          * Since we check for TCPS_ESTABLISHED above, it can only
2300          * be TF_NEEDSYN.
2301          */
2302         if (tp->t_state == TCPS_ESTABLISHED &&
2303             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK &&
2304             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2305             ((to.to_flags & TOF_TS) == 0 ||
2306              TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2307             th->th_seq == tp->rcv_nxt &&
2308             tiwin && tiwin == tp->snd_wnd &&
2309             tp->snd_nxt == tp->snd_max) {
2310
2311                 /*
2312                  * If last ACK falls within this segment's sequence numbers,
2313                  * record the timestamp.
2314                  * NOTE that the test is modified according to the latest
2315                  * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2316                  */
2317                 if ((to.to_flags & TOF_TS) != 0 &&
2318                    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2319                         tp->ts_recent_age = tcp_now;
2320                         tp->ts_recent = to.to_tsval;
2321                 }
2322
2323                 /* Force acknowledgment if we received a FIN */
2324
2325                 if (thflags & TH_FIN)
2326                         tp->t_flags |= TF_ACKNOW;
2327
2328                 if (tlen == 0) {
2329                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
2330                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
2331                             tp->snd_cwnd >= tp->snd_ssthresh &&
2332                             (!IN_FASTRECOVERY(tp) &&
2333                             ((!(SACK_ENABLED(tp)) && tp->t_dupacks < tp->t_rexmtthresh) ||
2334                              (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
2335                               TAILQ_EMPTY(&tp->snd_holes))))) {
2336                                 /*
2337                                  * this is a pure ack for outstanding data.
2338                                  */
2339                                 ++tcpstat.tcps_predack;
2340
2341                                 /*
2342                                  * "bad retransmit" recovery
2343                                  */
2344                                 if (tp->t_rxtshift > 0 &&
2345                                         tcp_detect_bad_rexmt(tp, &to)) {
2346                                         ++tcpstat.tcps_sndrexmitbad;
2347                                         tcp_bad_rexmt_restore_state(tp, th);
2348
2349                                         DTRACE_TCP5(cc, void, NULL,
2350                                                 struct inpcb *, tp->t_inpcb,
2351                                                 struct tcpcb *, tp, struct tcphdr *, th,
2352                                                 int32_t, TCP_CC_BAD_REXMT_RECOVERY);
2353                                 }
2354
2355                                 /* Recalculate the RTT */
2356                                 tcp_compute_rtt(tp, &to, th);
2357
2358                                 acked = BYTES_ACKED(th, tp);
2359                                 tcpstat.tcps_rcvackpack++;
2360                                 tcpstat.tcps_rcvackbyte += acked;
2361
2362                                 /* Handle an ack that is in sequence during congestion
2363                                  * avoidance phase. The calculations in this function
2364                                  * assume that snd_una is not updated yet.
2365                                  */
2366                                 if (CC_ALGO(tp)->inseq_ack_rcvd != NULL)
2367                                         CC_ALGO(tp)->inseq_ack_rcvd(tp, th);
2368
2369                                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2370                                         struct tcpcb *, tp, struct tcphdr *, th,
2371                                         int32_t, TCP_CC_INSEQ_ACK_RCVD);
2372
2373                                 sbdrop(&so->so_snd, acked);
2374                                 if (so->so_flags & SOF_ENABLE_MSGS) {
2375                                         VERIFY(acked <= so->so_msg_state->msg_serial_bytes);
2376                                         so->so_msg_state->msg_serial_bytes -= acked;
2377                                 }
2378                                 tcp_sbsnd_trim(&so->so_snd);
2379
2380                                 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2381                                     SEQ_LEQ(th->th_ack, tp->snd_recover))
2382                                         tp->snd_recover = th->th_ack - 1;
2383                                 tp->snd_una = th->th_ack;
2384
2385                                 /*
2386                                  * pull snd_wl2 up to prevent seq wrap relative
2387                                  * to th_ack.
2388                                  */
2389                                 tp->snd_wl2 = th->th_ack;
2390
2391                                 if (tp->t_dupacks > 0) {
2392                                         tp->t_dupacks = 0;
2393                                         tp->t_rexmtthresh = tcprexmtthresh;
2394                                 }
2395
2396                                 m_freem(m);
2397
2398                                 /*
2399                                  * If all outstanding data are acked, stop
2400                                  * retransmit timer, otherwise restart timer
2401                                  * using current (possibly backed-off) value.
2402                                  * If process is waiting for space,
2403                                  * wakeup/selwakeup/signal.  If data
2404                                  * are ready to send, let tcp_output
2405                                  * decide between more output or persist.
2406                                  */
2407                                 if (tp->snd_una == tp->snd_max)
2408                                         tp->t_timer[TCPT_REXMT] = 0;
2409                                 else if (tp->t_timer[TCPT_PERSIST] == 0)
2410                                         tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
2411
2412                                 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2413                                         tp->t_bwmeas != NULL)
2414                                         tcp_bwmeas_check(tp);
2415                                 sowwakeup(so); /* has to be done with socket lock held */
2416                                 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
2417                                         (void) tcp_output(tp);
2418                                 }
2419
2420                                 tcp_check_timer_state(tp);
2421                                 tcp_unlock(so, 1, 0);
2422                                 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2423                                 return;
2424                         }
2425                 } else if (th->th_ack == tp->snd_una &&
2426                     LIST_EMPTY(&tp->t_segq) &&
2427                     tlen <= tcp_sbspace(tp)) {
2428                         /*
2429                          * this is a pure, in-sequence data packet
2430                          * with nothing on the reassembly queue and
2431                          * we have enough buffer space to take it.
2432                          */
2433
2434                         /*
2435                          * If this is a connection in steady state, start
2436                          * coalescing packets belonging to this flow.
2437                          */
2438                         if (turnoff_lro) {
2439                                 tcp_lro_remove_state(tp->t_inpcb->inp_laddr,
2440                                         tp->t_inpcb->inp_faddr,
2441                                         tp->t_inpcb->inp_lport,
2442                                         tp->t_inpcb->inp_fport);
2443                                 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
2444                                 tp->t_idleat = tp->rcv_nxt;
2445                         } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 &&
2446                             (so->so_flags & SOF_USELRO) &&
2447                             !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) &&
2448                             (m->m_pkthdr.rcvif->if_type != IFT_LOOP) &&
2449                             ((th->th_seq - tp->irs) >
2450                             (tp->t_maxseg << lro_start)) &&
2451                             ((tp->t_idleat == 0) || ((th->th_seq -
2452                              tp->t_idleat) > (tp->t_maxseg << lro_start)))) {
2453                                 tp->t_flagsext |= TF_LRO_OFFLOADED;
2454                                 tcp_start_coalescing(ip, th, tlen);
2455                                 tp->t_idleat = 0;
2456                         }
2457
2458                         /* Clean receiver SACK report if present */
2459                         if (SACK_ENABLED(tp) && tp->rcv_numsacks)
2460                                 tcp_clean_sackreport(tp);
2461                         ++tcpstat.tcps_preddat;
2462                         tp->rcv_nxt += tlen;
2463                         /*
2464                          * Pull snd_wl1 up to prevent seq wrap relative to
2465                          * th_seq.
2466                          */
2467                         tp->snd_wl1 = th->th_seq;
2468                         /*
2469                          * Pull rcv_up up to prevent seq wrap relative to
2470                          * rcv_nxt.
2471                          */
2472                         tp->rcv_up = tp->rcv_nxt;
2473                         TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
2474                         tcpstat.tcps_rcvbyte += tlen;
2475                         if (nstat_collect) {
2476                                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2477                                         INP_ADD_STAT(inp, cell, wifi, rxpackets,
2478                                             m->m_pkthdr.lro_npkts);
2479                                 } else {
2480                                         INP_ADD_STAT(inp, cell, wifi, rxpackets, 1);
2481                                 }
2482                                 INP_ADD_STAT(inp, cell, wifi, rxbytes, tlen);
2483                         }
2484
2485                         /*
2486                          * Calculate the RTT on the receiver only if the
2487                          * connection is in streaming mode and the last
2488                          * packet was not an end-of-write
2489                          */
2490                         if ((tp->t_flags & TF_STRETCHACK) &&
2491                                 !(tp->t_flagsext & TF_STREAMEOW))
2492                                 tcp_compute_rtt(tp, &to, th);
2493
2494                         tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
2495
2496                         /*
2497                          * Add data to socket buffer.
2498                          */
2499                         so_recv_data_stat(so, m, 0);
2500                         m_adj(m, drop_hdrlen);  /* delayed header drop */
2501
2502                         /*
2503                          * If message delivery (SOF_ENABLE_MSGS) is enabled on
2504                          * this socket, deliver the packet received as an
2505                          * in-order message with sequence number attached to it.
2506                          */
2507                         if (sbappendstream_rcvdemux(so, m,
2508                             th->th_seq - (tp->irs + 1), 0)) {
2509                                 sorwakeup(so);
2510                         }
2511 #if INET6
2512                         if (isipv6) {
2513                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2514                                         (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2515                                         th->th_seq, th->th_ack, th->th_win);
2516                         }
2517                         else
2518 #endif
2519                         {
2520                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2521                                         (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2522                                         th->th_seq, th->th_ack, th->th_win);
2523                         }
2524                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2525                         if (DELAY_ACK(tp, th))  {
2526                                 if ((tp->t_flags & TF_DELACK) == 0) {
2527                                         tp->t_flags |= TF_DELACK;
2528                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2529                                 }
2530                         } else {
2531                                 tp->t_flags |= TF_ACKNOW;
2532                                 tcp_output(tp);
2533                         }
2534
2535                         tcp_adaptive_rwtimo_check(tp, tlen);
2536
2537                         tcp_check_timer_state(tp);
2538                         tcp_unlock(so, 1, 0);
2539                         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2540                         return;
2541                 }
2542         }
2543
2544         /*
2545          * Calculate amount of space in receive window,
2546          * and then do TCP input processing.
2547          * Receive window is amount of space in rcv queue,
2548          * but not less than advertised window.
2549          */
2550         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2551         win = tcp_sbspace(tp);
2552         if (win < 0)
2553                 win = 0;
2554         else {  /* clip rcv window to 4K for modems */
2555                 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2556                         win = min(win, slowlink_wsize);
2557         }
2558         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2559 #if MPTCP
2560         /*
2561          * Ensure that the subflow receive window isn't greater
2562          * than the connection level receive window.
2563          */
2564         if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
2565             (mp_tp = tptomptp(tp))) {
2566                 MPT_LOCK(mp_tp);
2567                 if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
2568                         tp->rcv_wnd = mp_tp->mpt_rcvwnd;
2569                         tcpstat.tcps_mp_reducedwin++;
2570                 }
2571                 MPT_UNLOCK(mp_tp);
2572         }
2573 #endif /* MPTCP */
2574
2575         switch (tp->t_state) {
2576
2577         /*
2578          * Initialize tp->rcv_nxt, and tp->irs, select an initial
2579          * tp->iss, and send a segment:
2580          *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2581          * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
2582          * Fill in remote peer address fields if not previously specified.
2583          * Enter SYN_RECEIVED state, and process any other fields of this
2584          * segment in this state.
2585          */
2586         case TCPS_LISTEN: {
2587                 register struct sockaddr_in *sin;
2588 #if INET6
2589                 register struct sockaddr_in6 *sin6;
2590 #endif
2591
2592                 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2593 #if INET6
2594                 if (isipv6) {
2595                         MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
2596                                M_SONAME, M_NOWAIT);
2597                         if (sin6 == NULL)
2598                                 goto drop;
2599                         bzero(sin6, sizeof(*sin6));
2600                         sin6->sin6_family = AF_INET6;
2601                         sin6->sin6_len = sizeof(*sin6);
2602                         sin6->sin6_addr = ip6->ip6_src;
2603                         sin6->sin6_port = th->th_sport;
2604                         laddr6 = inp->in6p_laddr;
2605                         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
2606                                 inp->in6p_laddr = ip6->ip6_dst;
2607                         if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
2608                                            proc0)) {
2609                                 inp->in6p_laddr = laddr6;
2610                                 FREE(sin6, M_SONAME);
2611                                 goto drop;
2612                         }
2613                         FREE(sin6, M_SONAME);
2614                 } else
2615 #endif
2616             {
2617                         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2618                         MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
2619                        M_NOWAIT);
2620                         if (sin == NULL)
2621                                 goto drop;
2622                         sin->sin_family = AF_INET;
2623                         sin->sin_len = sizeof(*sin);
2624                         sin->sin_addr = ip->ip_src;
2625                         sin->sin_port = th->th_sport;
2626                         bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
2627                         laddr = inp->inp_laddr;
2628                         if (inp->inp_laddr.s_addr == INADDR_ANY)
2629                                 inp->inp_laddr = ip->ip_dst;
2630                         if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0,
2631                             IFSCOPE_NONE, NULL)) {
2632                                 inp->inp_laddr = laddr;
2633                                 FREE(sin, M_SONAME);
2634                                 goto drop;
2635                         }
2636                         FREE(sin, M_SONAME);
2637                 }
2638
2639                 tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
2640
2641                 if (SACK_ENABLED(tp)) {
2642                         if (!(to.to_flags & TOF_SACK))
2643                                 tp->t_flagsext &= ~(TF_SACK_ENABLE);
2644                         else
2645                                 tp->t_flags |= TF_SACK_PERMIT;
2646                 }
2647
2648                 if (iss)
2649                         tp->iss = iss;
2650                 else {
2651                         tp->iss = tcp_new_isn(tp);
2652                 }
2653                 tp->irs = th->th_seq;
2654                 tcp_sendseqinit(tp);
2655                 tcp_rcvseqinit(tp);
2656                 tp->snd_recover = tp->snd_una;
2657                 /*
2658                  * Initialization of the tcpcb for transaction;
2659                  *   set SND.WND = SEG.WND,
2660                  *   initialize CCsend and CCrecv.
2661                  */
2662                 tp->snd_wnd = tiwin;    /* initial send-window */
2663                 tp->t_flags |= TF_ACKNOW;
2664                 tp->t_unacksegs = 0;
2665                 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2666                         struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
2667                 tp->t_state = TCPS_SYN_RECEIVED;
2668                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2669                         TCP_CONN_KEEPINIT(tp));
2670                 dropsocket = 0;         /* committed to socket */
2671
2672                 if (inp->inp_flowhash == 0)
2673                         inp->inp_flowhash = inp_calc_flowhash(inp);
2674 #if INET6
2675                 /* update flowinfo - RFC 6437 */
2676                 if (inp->inp_flow == 0 &&
2677                     inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
2678                         inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
2679                         inp->inp_flow |=
2680                             (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
2681                 }
2682 #endif /* INET6 */
2683
2684                 /* reset the incomp processing flag */
2685                 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
2686                 tcpstat.tcps_accepts++;
2687                 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
2688                         /* ECN-setup SYN */
2689                         tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
2690                 }
2691
2692 #if CONFIG_IFEF_NOWINDOWSCALE
2693                 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
2694                     (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
2695                         /* Window scaling is not enabled on this interface */
2696                         tp->t_flags &= ~TF_REQ_SCALE;
2697                 }
2698 #endif
2699                 goto trimthenstep6;
2700                 }
2701
2702         /*
2703          * If the state is SYN_RECEIVED:
2704          *      if seg contains an ACK, but not for our SYN/ACK, send a RST.
2705          */
2706         case TCPS_SYN_RECEIVED:
2707                 if ((thflags & TH_ACK) &&
2708                     (SEQ_LEQ(th->th_ack, tp->snd_una) ||
2709                      SEQ_GT(th->th_ack, tp->snd_max))) {
2710                                 rstreason = BANDLIM_RST_OPENPORT;
2711                                 IF_TCP_STATINC(ifp, ooopacket);
2712                                 goto dropwithreset;
2713                 }
2714
2715                 /*
2716                  * In SYN_RECEIVED state, if we recv some SYNS with
2717                  * window scale and others without, window scaling should
2718                  * be disabled. Otherwise the window advertised will be
2719                  * lower if we assume scaling and the other end does not.
2720                  */
2721                 if ((thflags & TH_SYN) &&
2722                     !(to.to_flags & TOF_SCALE))
2723                         tp->t_flags &= ~TF_RCVD_SCALE;
2724                 break;
2725
2726         /*
2727          * If the state is SYN_SENT:
2728          *      if seg contains an ACK, but not for our SYN, drop the input.
2729          *      if seg contains a RST, then drop the connection.
2730          *      if seg does not contain SYN, then drop it.
2731          * Otherwise this is an acceptable SYN segment
2732          *      initialize tp->rcv_nxt and tp->irs
2733          *      if seg contains ack then advance tp->snd_una
2734          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2735          *      arrange for segment to be acked (eventually)
2736          *      continue processing rest of data/controls, beginning with URG
2737          */
2738         case TCPS_SYN_SENT:
2739                 if ((thflags & TH_ACK) &&
2740                     (SEQ_LEQ(th->th_ack, tp->iss) ||
2741                      SEQ_GT(th->th_ack, tp->snd_max))) {
2742                         rstreason = BANDLIM_UNLIMITED;
2743                         IF_TCP_STATINC(ifp, ooopacket);
2744                         goto dropwithreset;
2745                 }
2746                 if (thflags & TH_RST) {
2747                         if ((thflags & TH_ACK) != 0) {
2748                                 soevent(so,
2749                                     (SO_FILT_HINT_LOCKED |
2750                                     SO_FILT_HINT_CONNRESET));
2751                                 tp = tcp_drop(tp, ECONNREFUSED);
2752                                 postevent(so, 0, EV_RESET);
2753                         }
2754                         goto drop;
2755                 }
2756                 if ((thflags & TH_SYN) == 0)
2757                         goto drop;
2758                 tp->snd_wnd = th->th_win;       /* initial send window */
2759
2760                 tp->irs = th->th_seq;
2761                 tcp_rcvseqinit(tp);
2762                 if (thflags & TH_ACK) {
2763                         tcpstat.tcps_connects++;
2764
2765                         if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
2766                                 /* ECN-setup SYN-ACK */
2767                                 tp->ecn_flags |= TE_SETUPRECEIVED;
2768                         }
2769                         else {
2770                                 /* non-ECN-setup SYN-ACK */
2771                                 tp->ecn_flags &= ~TE_SENDIPECT;
2772                         }
2773
2774 #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
2775                         /* XXXMAC: recursive lock: SOCK_LOCK(so); */
2776                         mac_socketpeer_label_associate_mbuf(m, so);
2777                         /* XXXMAC: SOCK_UNLOCK(so); */
2778 #endif
2779                         /* Do window scaling on this connection? */
2780                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2781                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2782                                 tp->snd_scale = tp->requested_s_scale;
2783                                 tp->rcv_scale = tp->request_r_scale;
2784                         }
2785                         tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
2786                         tp->snd_una++;          /* SYN is acked */
2787                         /*
2788                          * If there's data, delay ACK; if there's also a FIN
2789                          * ACKNOW will be turned on later.
2790                          */
2791                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2792                         if (DELAY_ACK(tp, th) && tlen != 0 ) {
2793                                 if ((tp->t_flags & TF_DELACK) == 0) {
2794                                         tp->t_flags |= TF_DELACK;
2795                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2796                                 }
2797                         }
2798                         else {
2799                                 tp->t_flags |= TF_ACKNOW;
2800                         }
2801                         /*
2802                          * Received <SYN,ACK> in SYN_SENT[*] state.
2803                          * Transitions:
2804                          *      SYN_SENT  --> ESTABLISHED
2805                          *      SYN_SENT* --> FIN_WAIT_1
2806                          */
2807                         tp->t_starttime = tcp_now;
2808                         tcp_sbrcv_tstmp_check(tp);
2809                         if (tp->t_flags & TF_NEEDFIN) {
2810                                 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2811                                         struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
2812                                 tp->t_state = TCPS_FIN_WAIT_1;
2813                                 tp->t_flags &= ~TF_NEEDFIN;
2814                                 thflags &= ~TH_SYN;
2815                         } else {
2816                                 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2817                                         struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
2818                                 tp->t_state = TCPS_ESTABLISHED;
2819                                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2820                                         TCP_CONN_KEEPIDLE(tp));
2821                                 if (nstat_collect)
2822                                         nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
2823                         }
2824 #if MPTCP
2825                         /*
2826                          * Do not send the connect notification for additional
2827                          * subflows until ACK for 3-way handshake arrives.
2828                          */
2829                         if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
2830                             (tp->t_mpflags & TMPF_SENT_JOIN)) {
2831                                 isconnected = FALSE;
2832                         } else
2833 #endif /* MPTCP */
2834                                 isconnected = TRUE;
2835                 } else {
2836                         /*
2837                          *  Received initial SYN in SYN-SENT[*] state => simul-
2838                          *  taneous open.  If segment contains CC option and there is
2839                          *  a cached CC, apply TAO test; if it succeeds, connection is
2840                          *  half-synchronized.  Otherwise, do 3-way handshake:
2841                          *        SYN-SENT -> SYN-RECEIVED
2842                          *        SYN-SENT* -> SYN-RECEIVED*
2843                          */
2844                         tp->t_flags |= TF_ACKNOW;
2845                         tp->t_timer[TCPT_REXMT] = 0;
2846                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2847                                 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
2848                         tp->t_state = TCPS_SYN_RECEIVED;
2849
2850                 }
2851
2852 trimthenstep6:
2853                 /*
2854                  * Advance th->th_seq to correspond to first data byte.
2855                  * If data, trim to stay within window,
2856                  * dropping FIN if necessary.
2857                  */
2858                 th->th_seq++;
2859                 if (tlen > tp->rcv_wnd) {
2860                         todrop = tlen - tp->rcv_wnd;
2861                         m_adj(m, -todrop);
2862                         tlen = tp->rcv_wnd;
2863                         thflags &= ~TH_FIN;
2864                         tcpstat.tcps_rcvpackafterwin++;
2865                         tcpstat.tcps_rcvbyteafterwin += todrop;
2866                 }
2867                 tp->snd_wl1 = th->th_seq - 1;
2868                 tp->rcv_up = th->th_seq;
2869                 /*
2870                  *  Client side of transaction: already sent SYN and data.
2871                  *  If the remote host used T/TCP to validate the SYN,
2872                  *  our data will be ACK'd; if so, enter normal data segment
2873                  *  processing in the middle of step 5, ack processing.
2874                  *  Otherwise, goto step 6.
2875                  */
2876                 if (thflags & TH_ACK)
2877                         goto process_ACK;
2878                 goto step6;
2879         /*
2880          * If the state is LAST_ACK or CLOSING or TIME_WAIT:
2881          *      do normal processing.
2882          *
2883          * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
2884          */
2885         case TCPS_LAST_ACK:
2886         case TCPS_CLOSING:
2887         case TCPS_TIME_WAIT:
2888                 break;  /* continue normal processing */
2889
2890         /* Received a SYN while connection is already established.
2891          * This is a "half open connection and other anomalies" described
2892          * in RFC793 page 34, send an ACK so the remote resets the connection
2893          * or recovers by adjusting its sequence numbering
2894          */
2895         case TCPS_ESTABLISHED:
2896                 if (thflags & TH_SYN)
2897                         goto dropafterack;
2898                 break;
2899         }
2900
2901         /*
2902          * States other than LISTEN or SYN_SENT.
2903          * First check the RST flag and sequence number since reset segments
2904          * are exempt from the timestamp and connection count tests.  This
2905          * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
2906          * below which allowed reset segments in half the sequence space
2907          * to fall through and be processed (which gives forged reset
2908          * segments with a random sequence number a 50 percent chance of
2909          * killing a connection).
2910          * Then check timestamp, if present.
2911          * Then check the connection count, if present.
2912          * Then check that at least some bytes of segment are within
2913          * receive window.  If segment begins before rcv_nxt,
2914          * drop leading data (and SYN); if nothing left, just ack.
2915          *
2916          *
2917          * If the RST bit is set, check the sequence number to see
2918          * if this is a valid reset segment.
2919          * RFC 793 page 37:
2920          *   In all states except SYN-SENT, all reset (RST) segments
2921          *   are validated by checking their SEQ-fields.  A reset is
2922          *   valid if its sequence number is in the window.
2923          * Note: this does not take into account delayed ACKs, so
2924          *   we should test against last_ack_sent instead of rcv_nxt.
2925          *   The sequence number in the reset segment is normally an
2926          *   echo of our outgoing acknowledgement numbers, but some hosts
2927          *   send a reset with the sequence number at the rightmost edge
2928          *   of our receive window, and we have to handle this case.
2929          * Note 2: Paul Watson's paper "Slipping in the Window" has shown
2930          *   that brute force RST attacks are possible.  To combat this,
2931          *   we use a much stricter check while in the ESTABLISHED state,
2932          *   only accepting RSTs where the sequence number is equal to
2933          *   last_ack_sent.  In all other states (the states in which a
2934          *   RST is more likely), the more permissive check is used.
2935          * If we have multiple segments in flight, the initial reset
2936          * segment sequence numbers will be to the left of last_ack_sent,
2937          * but they will eventually catch up.
2938          * In any case, it never made sense to trim reset segments to
2939          * fit the receive window since RFC 1122 says:
2940          *   4.2.2.12  RST Segment: RFC-793 Section 3.4
2941          *
2942          *    A TCP SHOULD allow a received RST segment to include data.
2943          *
2944          *    DISCUSSION
2945          *         It has been suggested that a RST segment could contain
2946          *         ASCII text that encoded and explained the cause of the
2947          *         RST.  No standard has yet been established for such
2948          *         data.
2949          *
2950          * If the reset segment passes the sequence number test examine
2951          * the state:
2952          *    SYN_RECEIVED STATE:
2953          *      If passive open, return to LISTEN state.
2954          *      If active open, inform user that connection was refused.
2955          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
2956          *      Inform user that connection was reset, and close tcb.
2957          *    CLOSING, LAST_ACK STATES:
2958          *      Close the tcb.
2959          *    TIME_WAIT STATE:
2960          *      Drop the segment - see Stevens, vol. 2, p. 964 and
2961          *      RFC 1337.
2962          *
2963          *      Radar 4803931: Allows for the case where we ACKed the FIN but
2964          *                     there is already a RST in flight from the peer.
2965          *                     In that case, accept the RST for non-established
2966          *                     state if it's one off from last_ack_sent.
2967
2968          */
2969         if (thflags & TH_RST) {
2970                 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2971                     SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
2972                     (tp->rcv_wnd == 0 &&
2973                     ((tp->last_ack_sent == th->th_seq) ||
2974                     ((tp->last_ack_sent -1) == th->th_seq)))) {
2975                         switch (tp->t_state) {
2976
2977                         case TCPS_SYN_RECEIVED:
2978                                 IF_TCP_STATINC(ifp, rstinsynrcv);
2979                                 so->so_error = ECONNREFUSED;
2980                                 goto close;
2981
2982                         case TCPS_ESTABLISHED:
2983                                 if (tp->last_ack_sent != th->th_seq) {
2984                                         tcpstat.tcps_badrst++;
2985                                         goto drop;
2986                                 }
2987                         case TCPS_FIN_WAIT_1:
2988                         case TCPS_CLOSE_WAIT:
2989                                 /*
2990                                   Drop through ...
2991                                 */
2992                         case TCPS_FIN_WAIT_2:
2993                                 so->so_error = ECONNRESET;
2994                         close:
2995                                 postevent(so, 0, EV_RESET);
2996                                 soevent(so,
2997                                     (SO_FILT_HINT_LOCKED |
2998                                     SO_FILT_HINT_CONNRESET));
2999
3000                                 tcpstat.tcps_drops++;
3001                                 tp = tcp_close(tp);
3002                                 break;
3003
3004                         case TCPS_CLOSING:
3005                         case TCPS_LAST_ACK:
3006                                 tp = tcp_close(tp);
3007                                 break;
3008
3009                         case TCPS_TIME_WAIT:
3010                                 break;
3011                         }
3012                 }
3013                 goto drop;
3014         }
3015
3016         /*
3017          * RFC 1323 PAWS: If we have a timestamp reply on this segment
3018          * and it's less than ts_recent, drop it.
3019          */
3020         if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
3021             TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3022
3023                 /* Check to see if ts_recent is over 24 days old.  */
3024                 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
3025                         /*
3026                          * Invalidate ts_recent.  If this segment updates
3027                          * ts_recent, the age will be reset later and ts_recent
3028                          * will get a valid value.  If it does not, setting
3029                          * ts_recent to zero will at least satisfy the
3030                          * requirement that zero be placed in the timestamp
3031                          * echo reply when ts_recent isn't valid.  The
3032                          * age isn't reset until we get a valid ts_recent
3033                          * because we don't want out-of-order segments to be
3034                          * dropped when ts_recent is old.
3035                          */
3036                         tp->ts_recent = 0;
3037                 } else {
3038                         tcpstat.tcps_rcvduppack++;
3039                         tcpstat.tcps_rcvdupbyte += tlen;
3040                         tcpstat.tcps_pawsdrop++;
3041                         if (nstat_collect) {
3042                                 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
3043                                         1, tlen, NSTAT_RX_FLAG_DUPLICATE);
3044                                 INP_ADD_STAT(inp, cell, wifi, rxpackets, 1);
3045                                 INP_ADD_STAT(inp, cell, wifi, rxbytes, tlen);
3046                                 tp->t_stat.rxduplicatebytes += tlen;
3047                         }
3048                         if (tlen)
3049                                 goto dropafterack;
3050                         goto drop;
3051                 }
3052         }
3053
3054         /*
3055          * In the SYN-RECEIVED state, validate that the packet belongs to
3056          * this connection before trimming the data to fit the receive
3057          * window.  Check the sequence number versus IRS since we know
3058          * the sequence numbers haven't wrapped.  This is a partial fix
3059          * for the "LAND" DoS attack.
3060          */
3061         if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
3062                 rstreason = BANDLIM_RST_OPENPORT;
3063                 IF_TCP_STATINC(ifp, dospacket);
3064                 goto dropwithreset;
3065         }
3066
3067         todrop = tp->rcv_nxt - th->th_seq;
3068         if (todrop > 0) {
3069                 if (thflags & TH_SYN) {
3070                         thflags &= ~TH_SYN;
3071                         th->th_seq++;
3072                         if (th->th_urp > 1)
3073                                 th->th_urp--;
3074                         else
3075                                 thflags &= ~TH_URG;
3076                         todrop--;
3077                 }
3078                 /*
3079                  * Following if statement from Stevens, vol. 2, p. 960.
3080                  */
3081                 if (todrop > tlen
3082                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
3083                         /*
3084                          * Any valid FIN must be to the left of the window.
3085                          * At this point the FIN must be a duplicate or out
3086                          * of sequence; drop it.
3087                          */
3088                         thflags &= ~TH_FIN;
3089
3090                         /*
3091                          * Send an ACK to resynchronize and drop any data.
3092                          * But keep on processing for RST or ACK.
3093                          */
3094                         tp->t_flags |= TF_ACKNOW;
3095                         if (todrop == 1) {
3096                                 /* This could be a keepalive */
3097                                 soevent(so, SO_FILT_HINT_LOCKED |
3098                                         SO_FILT_HINT_KEEPALIVE);
3099                         }
3100                         todrop = tlen;
3101                         tcpstat.tcps_rcvduppack++;
3102                         tcpstat.tcps_rcvdupbyte += todrop;
3103                 } else {
3104                         tcpstat.tcps_rcvpartduppack++;
3105                         tcpstat.tcps_rcvpartdupbyte += todrop;
3106                 }
3107                 if (nstat_collect) {
3108                         nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
3109                                 todrop, NSTAT_RX_FLAG_DUPLICATE);
3110                         INP_ADD_STAT(inp, cell, wifi, rxpackets, 1);
3111                         INP_ADD_STAT(inp, cell, wifi, rxbytes, todrop);
3112                         tp->t_stat.rxduplicatebytes += todrop;
3113                 }
3114                 drop_hdrlen += todrop;  /* drop from the top afterwards */
3115                 th->th_seq += todrop;
3116                 tlen -= todrop;
3117                 if (th->th_urp > todrop)
3118                         th->th_urp -= todrop;
3119                 else {
3120                         thflags &= ~TH_URG;
3121                         th->th_urp = 0;
3122                 }
3123         }
3124
3125         /*
3126          * If new data are received on a connection after the user processes
3127          * are gone, then RST the other end.  Note that an MPTCP subflow socket
3128          * would have SS_NOFDREF set by default, so check to make sure that
3129          * we test for SOF_MP_SUBFLOW socket flag (which would be cleared when
3130          * the socket is closed.)
3131          */
3132         if (!(so->so_flags & SOF_MP_SUBFLOW) &&
3133             (so->so_state & SS_NOFDREF) &&
3134             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
3135                 tp = tcp_close(tp);
3136                 tcpstat.tcps_rcvafterclose++;
3137                 rstreason = BANDLIM_UNLIMITED;
3138                 IF_TCP_STATINC(ifp, cleanup);
3139                 goto dropwithreset;
3140         }
3141
3142         /*
3143          * If segment ends after window, drop trailing data
3144          * (and PUSH and FIN); if nothing left, just ACK.
3145          */
3146         todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
3147         if (todrop > 0) {
3148                 tcpstat.tcps_rcvpackafterwin++;
3149                 if (todrop >= tlen) {
3150                         tcpstat.tcps_rcvbyteafterwin += tlen;
3151                         /*
3152                          * If a new connection request is received
3153                          * while in TIME_WAIT, drop the old connection
3154                          * and start over if the sequence numbers
3155                          * are above the previous ones.
3156                          */
3157                         if (thflags & TH_SYN &&
3158                             tp->t_state == TCPS_TIME_WAIT &&
3159                             SEQ_GT(th->th_seq, tp->rcv_nxt)) {
3160                                 iss = tcp_new_isn(tp);
3161                                 tp = tcp_close(tp);
3162                                 tcp_unlock(so, 1, 0);
3163                                 goto findpcb;
3164                         }
3165                         /*
3166                          * If window is closed can only take segments at
3167                          * window edge, and have to drop data and PUSH from
3168                          * incoming segments.  Continue processing, but
3169                          * remember to ack.  Otherwise, drop segment
3170                          * and ack.
3171                          */
3172                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
3173                                 tp->t_flags |= TF_ACKNOW;
3174                                 tcpstat.tcps_rcvwinprobe++;
3175                         } else
3176                                 goto dropafterack;
3177                 } else
3178                         tcpstat.tcps_rcvbyteafterwin += todrop;
3179                 m_adj(m, -todrop);
3180                 tlen -= todrop;
3181                 thflags &= ~(TH_PUSH|TH_FIN);
3182         }
3183
3184         /*
3185          * If last ACK falls within this segment's sequence numbers,
3186          * record its timestamp.
3187          * NOTE:
3188          * 1) That the test incorporates suggestions from the latest
3189          *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
3190          * 2) That updating only on newer timestamps interferes with
3191          *    our earlier PAWS tests, so this check should be solely
3192          *    predicated on the sequence space of this segment.
3193          * 3) That we modify the segment boundary check to be
3194          *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
3195          *    instead of RFC1323's
3196          *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
3197          *    This modified check allows us to overcome RFC1323's
3198          *    limitations as described in Stevens TCP/IP Illustrated
3199          *    Vol. 2 p.869. In such cases, we can still calculate the
3200          *    RTT correctly when RCV.NXT == Last.ACK.Sent.
3201          */
3202         if ((to.to_flags & TOF_TS) != 0 &&
3203             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
3204             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
3205                 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
3206                 tp->ts_recent_age = tcp_now;
3207                 tp->ts_recent = to.to_tsval;
3208         }
3209
3210         /*
3211          * If a SYN is in the window, then this is an
3212          * error and we send an RST and drop the connection.
3213          */
3214         if (thflags & TH_SYN) {
3215                 tp = tcp_drop(tp, ECONNRESET);
3216                 rstreason = BANDLIM_UNLIMITED;
3217                 postevent(so, 0, EV_RESET);
3218                 IF_TCP_STATINC(ifp, synwindow);
3219                 goto dropwithreset;
3220         }
3221
3222         /*
3223          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
3224          * flag is on (half-synchronized state), then queue data for
3225          * later processing; else drop segment and return.
3226          */
3227         if ((thflags & TH_ACK) == 0) {
3228                 if (tp->t_state == TCPS_SYN_RECEIVED ||
3229                     (tp->t_flags & TF_NEEDSYN))
3230                         goto step6;
3231                 else if (tp->t_flags & TF_ACKNOW)
3232                         goto dropafterack;
3233                 else
3234                         goto drop;
3235         }
3236
3237         /*
3238          * Ack processing.
3239          */
3240
3241         switch (tp->t_state) {
3242
3243         /*
3244          * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
3245          * ESTABLISHED state and continue processing.
3246          * The ACK was checked above.
3247          */
3248         case TCPS_SYN_RECEIVED:
3249
3250                 tcpstat.tcps_connects++;
3251
3252                 /* Do window scaling? */
3253                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3254                         (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3255                         tp->snd_scale = tp->requested_s_scale;
3256                         tp->rcv_scale = tp->request_r_scale;
3257                         tp->snd_wnd = th->th_win << tp->snd_scale;
3258                         tiwin = tp->snd_wnd;
3259                 }
3260                 /*
3261                  * Make transitions:
3262                  *      SYN-RECEIVED  -> ESTABLISHED
3263                  *      SYN-RECEIVED* -> FIN-WAIT-1
3264                  */
3265                 tp->t_starttime = tcp_now;
3266                 tcp_sbrcv_tstmp_check(tp);
3267                 if (tp->t_flags & TF_NEEDFIN) {
3268                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3269                                 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
3270                         tp->t_state = TCPS_FIN_WAIT_1;
3271                         tp->t_flags &= ~TF_NEEDFIN;
3272                 } else {
3273                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3274                                 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
3275                         tp->t_state = TCPS_ESTABLISHED;
3276                         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3277                                 TCP_CONN_KEEPIDLE(tp));
3278                         if (nstat_collect)
3279                                 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
3280                 }
3281                 /*
3282                  * If segment contains data or ACK, will call tcp_reass()
3283                  * later; if not, do so now to pass queued data to user.
3284                  */
3285                 if (tlen == 0 && (thflags & TH_FIN) == 0)
3286                         (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
3287                             NULL, ifp);
3288                 tp->snd_wl1 = th->th_seq - 1;
3289
3290                 /* FALLTHROUGH */
3291 #if MPTCP
3292                 /*
3293                  * Do not send the connect notification for additional subflows
3294                  * until ACK for 3-way handshake arrives.
3295                  */
3296                 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3297                     (tp->t_mpflags & TMPF_SENT_JOIN)) {
3298                         isconnected = FALSE;
3299                 } else
3300 #endif /* MPTCP */
3301                         isconnected = TRUE;
3302
3303         /*
3304          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
3305          * ACKs.  If the ack is in the range
3306          *      tp->snd_una < th->th_ack <= tp->snd_max
3307          * then advance tp->snd_una to th->th_ack and drop
3308          * data from the retransmission queue.  If this ACK reflects
3309          * more up to date window information we update our window information.
3310          */
3311         case TCPS_ESTABLISHED:
3312         case TCPS_FIN_WAIT_1:
3313         case TCPS_FIN_WAIT_2:
3314         case TCPS_CLOSE_WAIT:
3315         case TCPS_CLOSING:
3316         case TCPS_LAST_ACK:
3317         case TCPS_TIME_WAIT:
3318                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
3319                         tcpstat.tcps_rcvacktoomuch++;
3320                         goto dropafterack;
3321                 }
3322                 if (SACK_ENABLED(tp) &&
3323                     (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
3324                         tcp_sack_doack(tp, &to, th->th_ack, &sack_bytes_acked);
3325 #if MPTCP
3326                 if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) {
3327 #if 0
3328                         if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
3329                             !(tp->t_mpflags & TMPF_MPTCP_READY)) {
3330                                 printf("%s: fallback? %x %x \n", __func__,
3331                                     th->th_ack, tp->t_mpuna);
3332                                 tp->t_mpuna = 0;
3333                         }
3334 #endif
3335                         if (tp->t_mpflags & TMPF_PREESTABLISHED) {
3336                                 /* MP TCP establishment succeeded */
3337                                 tp->t_mpuna = 0;
3338                                 if (tp->t_mpflags & TMPF_JOINED_FLOW) {
3339                                         if (tp->t_mpflags & TMPF_SENT_JOIN) {
3340                                                 tp->t_mpflags &=
3341                                                     ~TMPF_PREESTABLISHED;
3342                                                 tp->t_mpflags |=
3343                                                     TMPF_MPTCP_TRUE;
3344                                                 so->so_flags |= SOF_MPTCP_TRUE;
3345                                                 if (mptcp_dbg >= MP_ERR_DEBUG)
3346                                                         printf("MPTCP SUCCESS"
3347                                                             "%s \n",__func__);
3348                                                 tp->t_timer[TCPT_JACK_RXMT] = 0;
3349                                                 tp->t_mprxtshift = 0;
3350                                                 isconnected = TRUE;
3351                                         } else {
3352                                                 isconnected = FALSE;
3353                                         }
3354                                 } else {
3355                                         isconnected = TRUE;
3356                                         tp->t_mpflags &= ~TMPF_SENT_KEYS;
3357
3358                                 }
3359                         }
3360                 }
3361 #endif /* MPTCP */
3362                 /*
3363                  * If we have outstanding data (other than
3364                  * a window probe), this is a completely
3365                  * duplicate ack (ie, window info didn't
3366                  * change) and the ack is the biggest we've seen.
3367                  */
3368                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
3369                         if (tlen == 0 && tiwin == tp->snd_wnd) {
3370 process_dupack:
3371 #if MPTCP
3372                                 /*
3373                                  * MPTCP options that are ignored must
3374                                  * not be treated as duplicate ACKs.
3375                                  */
3376                                 if (to.to_flags & TOF_MPTCP) {
3377                                         goto drop;
3378                                 }
3379 #endif /* MPTCP */
3380                                 tcpstat.tcps_rcvdupack++;
3381                                 ++tp->t_dupacks;
3382                                 /*
3383                                  * Check if we need to reset the limit on early
3384                                  * retransmit
3385                                  */
3386                                 if (TSTMP_GEQ(tcp_now,
3387                                         (tp->t_early_rexmt_win + TCP_EARLY_REXMT_WIN)))
3388                                         tp->t_early_rexmt_count = 0;
3389
3390                                 /*
3391                                  * Is early retransmit needed? We check for
3392                                  * this when the connection is waiting for
3393                                  * more duplicate acks to enter fast recovery.
3394                                  */
3395                                 if (early_rexmt &&
3396                                         tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT &&
3397                                         !IN_FASTRECOVERY(tp) &&
3398                                         SEQ_GT(tp->snd_max, tp->snd_una) &&
3399                                         (tp->t_dupacks == 1 ||
3400                                         (SACK_ENABLED(tp) &&
3401                                         !TAILQ_EMPTY(&tp->snd_holes)))) {
3402                                         /*
3403                                          * If there are only a few outstanding
3404                                          * segments on the connection, we might need
3405                                          * to lower the retransmit threshold. This
3406                                          * will allow us to do Early Retransmit as
3407                                          * described in RFC 5827.
3408                                          */
3409                                         u_int32_t obytes, snd_off;
3410                                         int32_t snd_len;
3411                                         if (SACK_ENABLED(tp) &&
3412                                                 !TAILQ_EMPTY(&tp->snd_holes)) {
3413                                                 obytes = (tp->snd_max - tp->snd_fack) +
3414                                                         tp->sackhint.sack_bytes_rexmit;
3415                                         } else {
3416                                                 obytes = (tp->snd_max - tp->snd_una);
3417                                         }
3418
3419                                         /* In order to lower retransmit threshold the
3420                                          * following two conditions must be met.
3421                                          * 1. the amount of outstanding data is less
3422                                          * than 4*SMSS bytes
3423                                          * 2. there is no unsent data ready for
3424                                          * transmission or the advertised window
3425                                          * will limit sending new segments.
3426                                          */
3427                                         snd_off = tp->snd_max - tp->snd_una;
3428                                         snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
3429                                         if (obytes < (tp->t_maxseg << 2) &&
3430                                                 snd_len <= 0) {
3431                                                 u_int32_t osegs;
3432
3433
3434                                                 osegs = obytes / tp->t_maxseg;
3435                                                 if ((osegs * tp->t_maxseg) < obytes)
3436                                                         osegs++;
3437
3438                                                 /*
3439                                                  * Since the connection might have already
3440                                                  * received some dupacks, we add them to
3441                                                  * the outstanding segments count to get
3442                                                  * the correct retransmit threshold.
3443                                                  *
3444                                                  * By checking for early retransmit after
3445                                                  * receiving some duplicate acks when SACK
3446                                                  * is supported, the connection will be able
3447                                                  * to enter fast recovery even if multiple
3448                                                  * segments are lost in the same window.
3449                                                  */
3450                                                 osegs += tp->t_dupacks;
3451                                                 if (osegs < 4) {
3452                                                         tcpstat.tcps_early_rexmt++;
3453                                                         tp->t_rexmtthresh = ((osegs - 1) > 1) ?
3454                                                                 (osegs - 1) : 1;
3455                                                         tp->t_rexmtthresh = min(tp->t_rexmtthresh,
3456                                                                 tcprexmtthresh);
3457                                                         tp->t_rexmtthresh = max(tp->t_rexmtthresh,
3458                                                                 tp->t_dupacks);
3459                                                         DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3460                                                                 struct tcpcb *, tp, struct tcphdr *, th,
3461                                                                 int32_t, TCP_CC_EARLY_RETRANSMIT);
3462                                                         if (tp->t_early_rexmt_count == 0)
3463                                                                 tp->t_early_rexmt_win = tcp_now;
3464                                                         tp->t_early_rexmt_count++;
3465                                                 }
3466                                         }
3467                                 }
3468                                 /*
3469                                  * If we've seen exactly our rexmt threshold
3470                                  * of duplicate acks, assume a packet
3471                                  * has been dropped and retransmit it.
3472                                  * Kludge snd_nxt & the congestion
3473                                  * window so we send only this one
3474                                  * packet.
3475                                  *
3476                                  * We know we're losing at the current
3477                                  * window size so do congestion avoidance
3478                                  * (set ssthresh to half the current window
3479                                  * and pull our congestion window back to
3480                                  * the new ssthresh).
3481                                  *
3482                                  * Dup acks mean that packets have left the
3483                                  * network (they're now cached at the receiver)
3484                                  * so bump cwnd by the amount in the receiver
3485                                  * to keep a constant cwnd packets in the
3486                                  * network.
3487                                  */
3488                                 if (tp->t_timer[TCPT_REXMT] == 0 ||
3489                                     (th->th_ack != tp->snd_una && sack_bytes_acked == 0)) {
3490                                         tp->t_dupacks = 0;
3491                                         tp->t_rexmtthresh = tcprexmtthresh;
3492                                 } else if (tp->t_dupacks > tp->t_rexmtthresh ||
3493                                           IN_FASTRECOVERY(tp)) {
3494                                         if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp)) {
3495                                                 int awnd;
3496
3497                                                 /*
3498                                                  * Compute the amount of data in flight first.
3499                                                  * We can inject new data into the pipe iff
3500                                                  * we have less than 1/2 the original window's
3501                                                  * worth of data in flight.
3502                                                  */
3503                                                 awnd = (tp->snd_nxt - tp->snd_fack) +
3504                                                         tp->sackhint.sack_bytes_rexmit;
3505                                                 if (awnd < tp->snd_ssthresh) {
3506                                                         tp->snd_cwnd += tp->t_maxseg;
3507                                                         if (tp->snd_cwnd > tp->snd_ssthresh)
3508                                                                 tp->snd_cwnd = tp->snd_ssthresh;
3509                                                 }
3510                                         } else
3511                                                 tp->snd_cwnd += tp->t_maxseg;
3512
3513                                         DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3514                                                 struct tcpcb *, tp, struct tcphdr *, th,
3515                                                 int32_t, TCP_CC_IN_FASTRECOVERY);
3516
3517                                         (void) tcp_output(tp);
3518                                         goto drop;
3519                                 } else if (tp->t_dupacks == tp->t_rexmtthresh) {
3520                                         tcp_seq onxt = tp->snd_nxt;
3521
3522                                         /*
3523                                          * If we're doing sack, check to
3524                                          * see if we're already in sack
3525                                          * recovery. If we're not doing sack,
3526                                          * check to see if we're in newreno
3527                                          * recovery.
3528                                          */
3529                                         if (SACK_ENABLED(tp)) {
3530                                                 if (IN_FASTRECOVERY(tp)) {
3531                                                         tp->t_dupacks = 0;
3532                                                         break;
3533                                                 }
3534                                         } else {
3535                                                 if (SEQ_LEQ(th->th_ack,
3536                                                     tp->snd_recover)) {
3537                                                         tp->t_dupacks = 0;
3538                                                         break;
3539                                                 }
3540                                         }
3541
3542                                         /*
3543                                          * If the current tcp cc module has
3544                                          * defined a hook for tasks to run
3545                                          * before entering FR, call it
3546                                          */
3547                                         if (CC_ALGO(tp)->pre_fr != NULL)
3548                                                 CC_ALGO(tp)->pre_fr(tp);
3549                                         ENTER_FASTRECOVERY(tp);
3550                                         tp->snd_recover = tp->snd_max;
3551                                         tp->t_timer[TCPT_REXMT] = 0;
3552                                         tp->t_rtttime = 0;
3553                                         if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
3554                                                 tp->ecn_flags |= TE_SENDCWR;
3555                                         }
3556                                         if (SACK_ENABLED(tp)) {
3557                                                 tcpstat.tcps_sack_recovery_episode++;
3558                                                 tp->sack_newdata = tp->snd_nxt;
3559                                                 tp->snd_cwnd = tp->t_maxseg;
3560
3561                                                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3562                                                         struct tcpcb *, tp, struct tcphdr *, th,
3563                                                         int32_t, TCP_CC_ENTER_FASTRECOVERY);
3564
3565                                                 (void) tcp_output(tp);
3566                                                 goto drop;
3567                                         }
3568                                         tp->snd_nxt = th->th_ack;
3569                                         tp->snd_cwnd = tp->t_maxseg;
3570                                         (void) tcp_output(tp);
3571                                         tp->snd_cwnd = tp->snd_ssthresh +
3572                                              tp->t_maxseg * tp->t_dupacks;
3573                                         if (SEQ_GT(onxt, tp->snd_nxt))
3574                                                 tp->snd_nxt = onxt;
3575                                         DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3576                                                 struct tcpcb *, tp, struct tcphdr *, th,
3577                                                 int32_t, TCP_CC_ENTER_FASTRECOVERY);
3578                                         goto drop;
3579                                 } else if (limited_txmt &&
3580                                         ALLOW_LIMITED_TRANSMIT(tp) &&
3581                                         (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
3582                                         (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
3583                                         u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
3584
3585                                         /* Use Limited Transmit algorithm on the first two
3586                                          * duplicate acks when there is new data to transmit
3587                                          */
3588                                         tp->snd_cwnd += incr;
3589                                         tcpstat.tcps_limited_txt++;
3590                                         (void) tcp_output(tp);
3591
3592                                         DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3593                                                 struct tcpcb *, tp, struct tcphdr *, th,
3594                                                 int32_t, TCP_CC_LIMITED_TRANSMIT);
3595
3596                                         /* Reset snd_cwnd back to normal */
3597                                         tp->snd_cwnd -= incr;
3598                                 }
3599                         } else {
3600                                 tp->t_dupacks = 0;
3601                                 tp->t_rexmtthresh = tcprexmtthresh;
3602                         }
3603                         break;
3604                 }
3605                 /*
3606                  * If the congestion window was inflated to account
3607                  * for the other side's cached packets, retract it.
3608                  */
3609                 if (IN_FASTRECOVERY(tp)) {
3610                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
3611                                 if (SACK_ENABLED(tp))
3612                                         tcp_sack_partialack(tp, th);
3613                                 else
3614                                         tcp_newreno_partial_ack(tp, th);
3615
3616                                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3617                                         struct tcpcb *, tp, struct tcphdr *, th,
3618                                         int32_t, TCP_CC_PARTIAL_ACK);
3619                         } else {
3620                                 EXIT_FASTRECOVERY(tp);
3621                                 if (CC_ALGO(tp)->post_fr != NULL)
3622                                         CC_ALGO(tp)->post_fr(tp, th);
3623                                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3624                                         struct tcpcb *, tp, struct tcphdr *, th,
3625                                         int32_t, TCP_CC_EXIT_FASTRECOVERY);
3626                         }
3627                 } else {
3628                         /*
3629                          * We were not in fast recovery. Reset the duplicate ack
3630                          * counter.
3631                          */
3632                         tp->t_dupacks = 0;
3633                         tp->t_rexmtthresh = tcprexmtthresh;
3634                 }
3635
3636
3637                 /*
3638                  * If we reach this point, ACK is not a duplicate,
3639                  *     i.e., it ACKs something we sent.
3640                  */
3641                 if (tp->t_flags & TF_NEEDSYN) {
3642                         /*
3643                          * T/TCP: Connection was half-synchronized, and our
3644                          * SYN has been ACK'd (so connection is now fully
3645                          * synchronized).  Go to non-starred state,
3646                          * increment snd_una for ACK of SYN, and check if
3647                          * we can do window scaling.
3648                          */
3649                         tp->t_flags &= ~TF_NEEDSYN;
3650                         tp->snd_una++;
3651                         /* Do window scaling? */
3652                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3653                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3654                                 tp->snd_scale = tp->requested_s_scale;
3655                                 tp->rcv_scale = tp->request_r_scale;
3656                         }
3657                 }
3658
3659 process_ACK:
3660                 acked = BYTES_ACKED(th, tp);
3661                 tcpstat.tcps_rcvackpack++;
3662                 tcpstat.tcps_rcvackbyte += acked;
3663
3664                 /*
3665                  * If the last packet was a retransmit, make sure
3666                  * it was not spurious.
3667                  *
3668                  * If the ack has ECE bit set, skip bad
3669                  * retransmit recovery.
3670                  */
3671                 if (tp->t_rxtshift > 0 &&
3672                         (thflags & TH_ECE) == 0 &&
3673                         tcp_detect_bad_rexmt(tp, &to)) {
3674                         ++tcpstat.tcps_sndrexmitbad;
3675                         tcp_bad_rexmt_restore_state(tp, th);
3676
3677                         DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3678                                 struct tcpcb *, tp, struct tcphdr *, th,
3679                                 int32_t, TCP_CC_BAD_REXMT_RECOVERY);
3680                 }
3681
3682                 /* Recalculate the RTT */
3683                 tcp_compute_rtt(tp, &to, th);
3684
3685                 /*
3686                  * If all outstanding data is acked, stop retransmit
3687                  * timer and remember to restart (more output or persist).
3688                  * If there is more data to be acked, restart retransmit
3689                  * timer, using current (possibly backed-off) value.
3690                  */
3691                 if (th->th_ack == tp->snd_max) {
3692                         tp->t_timer[TCPT_REXMT] = 0;
3693                         needoutput = 1;
3694                 } else if (tp->t_timer[TCPT_PERSIST] == 0)
3695                         tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
3696
3697                 /*
3698                  * If no data (only SYN) was ACK'd,
3699                  *    skip rest of ACK processing.
3700                  */
3701                 if (acked == 0)
3702                         goto step6;
3703
3704                 if ((thflags & TH_ECE) != 0 &&
3705                         ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON)) {
3706                         /*
3707                          * Reduce the congestion window if we haven't done so.
3708                          */
3709                         if (!SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
3710                                 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
3711                                 tcp_reduce_congestion_window(tp);
3712                                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3713                                         struct tcpcb *, tp, struct tcphdr *, th,
3714                                         int32_t, TCP_CC_ECN_RCVD);
3715                         }
3716                 }
3717
3718                 /*
3719                  * When new data is acked, open the congestion window.
3720                  * The specifics of how this is achieved are up to the
3721                  * congestion control algorithm in use for this connection.
3722                  *
3723                  * The calculations in this function assume that snd_una is
3724                  * not updated yet.
3725                  */
3726                 if (!IN_FASTRECOVERY(tp)) {
3727                         if (CC_ALGO(tp)->ack_rcvd != NULL)
3728                                 CC_ALGO(tp)->ack_rcvd(tp, th);
3729
3730                         DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
3731                                 struct tcpcb *, tp, struct tcphdr *, th,
3732                                 int32_t, TCP_CC_ACK_RCVD);
3733                 }
3734                 if (acked > so->so_snd.sb_cc) {
3735                         tp->snd_wnd -= so->so_snd.sb_cc;
3736                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
3737                         if (so->so_flags & SOF_ENABLE_MSGS) {
3738                                 so->so_msg_state->msg_serial_bytes -=
3739                                         (int)so->so_snd.sb_cc;
3740                         }
3741                         ourfinisacked = 1;
3742                 } else {
3743                         sbdrop(&so->so_snd, acked);
3744                         if (so->so_flags & SOF_ENABLE_MSGS) {
3745                                 so->so_msg_state->msg_serial_bytes -=
3746                                         acked;
3747                         }
3748                         tcp_sbsnd_trim(&so->so_snd);
3749                         tp->snd_wnd -= acked;
3750                         ourfinisacked = 0;
3751                 }
3752                 /* detect una wraparound */
3753                 if ( !IN_FASTRECOVERY(tp) &&
3754                     SEQ_GT(tp->snd_una, tp->snd_recover) &&
3755                     SEQ_LEQ(th->th_ack, tp->snd_recover))
3756                         tp->snd_recover = th->th_ack - 1;
3757
3758                 if (IN_FASTRECOVERY(tp) &&
3759                     SEQ_GEQ(th->th_ack, tp->snd_recover))
3760                         EXIT_FASTRECOVERY(tp);
3761
3762                 tp->snd_una = th->th_ack;
3763                 if (SACK_ENABLED(tp)) {
3764                         if (SEQ_GT(tp->snd_una, tp->snd_recover))
3765                                 tp->snd_recover = tp->snd_una;
3766                 }
3767                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3768                         tp->snd_nxt = tp->snd_una;
3769                 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
3770                         tp->t_bwmeas != NULL)
3771                         tcp_bwmeas_check(tp);
3772
3773                 /*
3774                  * sowwakeup must happen after snd_una, et al. are updated so that
3775                  * the sequence numbers are in sync with so_snd
3776                  */
3777                 sowwakeup(so);
3778
3779                 switch (tp->t_state) {
3780
3781                 /*
3782                  * In FIN_WAIT_1 STATE in addition to the processing
3783                  * for the ESTABLISHED state if our FIN is now acknowledged
3784                  * then enter FIN_WAIT_2.
3785                  */
3786                 case TCPS_FIN_WAIT_1:
3787                         if (ourfinisacked) {
3788                                 /*
3789                                  * If we can't receive any more
3790                                  * data, then closing user can proceed.
3791                                  * Starting the TCPT_2MSL timer is contrary to the
3792                                  * specification, but if we don't get a FIN
3793                                  * we'll hang forever.
3794                                  */
3795                                 if (so->so_state & SS_CANTRCVMORE) {
3796                                         tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
3797                                                 TCP_CONN_MAXIDLE(tp));
3798                                         isconnected = FALSE;
3799                                         isdisconnected = TRUE;
3800                                 }
3801                                 DTRACE_TCP4(state__change, void, NULL,
3802                                         struct inpcb *, inp,
3803                                         struct tcpcb *, tp,
3804                                         int32_t, TCPS_FIN_WAIT_2);
3805                                 tp->t_state = TCPS_FIN_WAIT_2;
3806                                 /* fall through and make sure we also recognize
3807                                  * data ACKed with the FIN
3808                                  */
3809                         }
3810                         tp->t_flags |= TF_ACKNOW;
3811                         break;
3812
3813                 /*
3814                  * In CLOSING STATE in addition to the processing for
3815                  * the ESTABLISHED state if the ACK acknowledges our FIN
3816                  * then enter the TIME-WAIT state, otherwise ignore
3817                  * the segment.
3818                  */
3819                 case TCPS_CLOSING:
3820                         if (ourfinisacked) {
3821                                 DTRACE_TCP4(state__change, void, NULL,
3822                                         struct inpcb *, inp,
3823                                         struct tcpcb *, tp,
3824                                         int32_t, TCPS_TIME_WAIT);
3825                                 tp->t_state = TCPS_TIME_WAIT;
3826                                 tcp_canceltimers(tp);
3827                                 add_to_time_wait(tp, 2 * tcp_msl);
3828                                 isconnected = FALSE;
3829                                 isdisconnected = TRUE;
3830                         }
3831                         tp->t_flags |= TF_ACKNOW;
3832                         break;
3833
3834                 /*
3835                  * In LAST_ACK, we may still be waiting for data to drain
3836                  * and/or to be acked, as well as for the ack of our FIN.
3837                  * If our FIN is now acknowledged, delete the TCB,
3838                  * enter the closed state and return.
3839                  */
3840                 case TCPS_LAST_ACK:
3841                         if (ourfinisacked) {
3842                                 tp = tcp_close(tp);
3843                                 goto drop;
3844                         }
3845                         break;
3846
3847                 /*
3848                  * In TIME_WAIT state the only thing that should arrive
3849                  * is a retransmission of the remote FIN.  Acknowledge
3850                  * it and restart the finack timer.
3851                  */
3852                 case TCPS_TIME_WAIT:
3853                         add_to_time_wait(tp, 2 * tcp_msl);
3854                         goto dropafterack;
3855                 }
3856
3857                 /*
3858                  * If there is a SACK option on the ACK and we
3859                  * haven't seen any duplicate acks before, count
3860                  * it as a duplicate ack even if the cumulative
3861                  * ack is advanced. If the receiver delayed an
3862                  * ack and detected loss afterwards, then the ack
3863                  * will advance cumulative ack and will also have
3864                  * a SACK option. So counting it as one duplicate
3865                  * ack is ok.
3866                  */
3867                 if (sack_ackadv == 1 &&
3868                         tp->t_state == TCPS_ESTABLISHED &&
3869                         SACK_ENABLED(tp) &&
3870                         sack_bytes_acked > 0 &&
3871                         tp->t_dupacks == 0 &&
3872                         SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0) {
3873                         tcpstat.tcps_sack_ackadv++;
3874                         goto process_dupack;
3875                 }
3876         }
3877
3878 step6:
3879         /*
3880          * Update window information.
3881          * Don't look at window if no ACK: TAC's send garbage on first SYN.
3882          */
3883         if ((thflags & TH_ACK) &&
3884             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
3885             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
3886              (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
3887                 /* keep track of pure window updates */
3888                 if (tlen == 0 &&
3889                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
3890                         tcpstat.tcps_rcvwinupd++;
3891                 tp->snd_wnd = tiwin;
3892                 tp->snd_wl1 = th->th_seq;
3893                 tp->snd_wl2 = th->th_ack;
3894                 if (tp->snd_wnd > tp->max_sndwnd)
3895                         tp->max_sndwnd = tp->snd_wnd;
3896                 needoutput = 1;
3897         }
3898
3899         /*
3900          * Process segments with URG.
3901          */
3902         if ((thflags & TH_URG) && th->th_urp &&
3903             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3904                 /*
3905                  * This is a kludge, but if we receive and accept
3906                  * random urgent pointers, we'll crash in
3907                  * soreceive.  It's hard to imagine someone
3908                  * actually wanting to send this much urgent data.
3909                  */
3910                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
3911                         th->th_urp = 0;                 /* XXX */
3912                         thflags &= ~TH_URG;             /* XXX */
3913                         goto dodata;                    /* XXX */
3914                 }
3915                 /*
3916                  * If this segment advances the known urgent pointer,
3917                  * then mark the data stream.  This should not happen
3918                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
3919                  * a FIN has been received from the remote side.
3920                  * In these states we ignore the URG.
3921                  *
3922                  * According to RFC961 (Assigned Protocols),
3923                  * the urgent pointer points to the last octet
3924                  * of urgent data.  We continue, however,
3925                  * to consider it to indicate the first octet
3926                  * of data past the urgent section as the original
3927                  * spec states (in one of two places).
3928                  */
3929                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
3930                         tp->rcv_up = th->th_seq + th->th_urp;
3931                         so->so_oobmark = so->so_rcv.sb_cc +
3932                             (tp->rcv_up - tp->rcv_nxt) - 1;
3933                         if (so->so_oobmark == 0) {
3934                                 so->so_state |= SS_RCVATMARK;
3935                                 postevent(so, 0, EV_OOB);
3936                         }
3937                         sohasoutofband(so);
3938                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
3939                 }
3940                 /*
3941                  * Remove out of band data so doesn't get presented to user.
3942                  * This can happen independent of advancing the URG pointer,
3943                  * but if two URG's are pending at once, some out-of-band
3944                  * data may creep in... ick.
3945                  */
3946                 if (th->th_urp <= (u_int32_t)tlen
3947 #if SO_OOBINLINE
3948                      && (so->so_options & SO_OOBINLINE) == 0
3949 #endif
3950                      )
3951                         tcp_pulloutofband(so, th, m,
3952                                 drop_hdrlen);   /* hdr drop is delayed */
3953         } else {
3954                 /*
3955                  * If no out of band data is expected,
3956                  * pull receive urgent pointer along
3957                  * with the receive window.
3958                  */
3959                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
3960                         tp->rcv_up = tp->rcv_nxt;
3961         }
3962 dodata:
3963
3964         /* Set socket's connect or disconnect state correcly before doing data.
3965          * The following might unlock the socket if there is an upcall or a socket
3966          * filter.
3967          */
3968         if (isconnected) {
3969                 soisconnected(so);
3970         } else if (isdisconnected) {
3971                 soisdisconnected(so);
3972         }
3973
3974         /* Let's check the state of pcb just to make sure that it did not get closed
3975          * when we unlocked above
3976          */
3977         if (inp->inp_state == INPCB_STATE_DEAD) {
3978                 /* Just drop the packet that we are processing and return */
3979                 goto drop;
3980         }
3981
3982         /*
3983          * Process the segment text, merging it into the TCP sequencing queue,
3984          * and arranging for acknowledgment of receipt if necessary.
3985          * This process logically involves adjusting tp->rcv_wnd as data
3986          * is presented to the user (this happens in tcp_usrreq.c,
3987          * case PRU_RCVD).  If a FIN has already been received on this
3988          * connection then we just ignore the text.
3989          */
3990         if ((tlen || (thflags & TH_FIN)) &&
3991             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3992                 tcp_seq save_start = th->th_seq;
3993                 tcp_seq save_end = th->th_seq + tlen;
3994                 m_adj(m, drop_hdrlen);  /* delayed header drop */
3995                 /*
3996                  * Insert segment which includes th into TCP reassembly queue
3997                  * with control block tp.  Set thflags to whether reassembly now
3998                  * includes a segment with FIN.  This handles the common case
3999                  * inline (segment is the next to be received on an established
4000                  * connection, and the queue is empty), avoiding linkage into
4001                  * and removal from the queue and repetition of various
4002                  * conversions.
4003                  * Set DELACK for segments received in order, but ack
4004                  * immediately when segments are out of order (so
4005                  * fast retransmit can work).
4006                  */
4007                 if (th->th_seq == tp->rcv_nxt &&
4008                     LIST_EMPTY(&tp->t_segq) &&
4009                     TCPS_HAVEESTABLISHED(tp->t_state)) {
4010                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4011                         /*
4012                          * Calculate the RTT on the receiver only if the
4013                          * connection is in streaming mode and the last
4014                          * packet was not an end-of-write
4015                          */
4016                         if ((tp->t_flags & TF_STRETCHACK) &&
4017                                 !(tp->t_flagsext & TF_STREAMEOW))
4018                                 tcp_compute_rtt(tp, &to, th);
4019
4020                         if (DELAY_ACK(tp, th) &&
4021                                 ((tp->t_flags & TF_ACKNOW) == 0) ) {
4022                                 if ((tp->t_flags & TF_DELACK) == 0) {
4023                                         tp->t_flags |= TF_DELACK;
4024                                         tp->t_timer[TCPT_DELACK] =
4025                                                 OFFSET_FROM_START(tp, tcp_delack);
4026                                 }
4027                         }
4028                         else {
4029                                 tp->t_flags |= TF_ACKNOW;
4030                         }
4031                         tp->rcv_nxt += tlen;
4032                         thflags = th->th_flags & TH_FIN;
4033                         TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
4034                         tcpstat.tcps_rcvbyte += tlen;
4035                         if (nstat_collect) {
4036                                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
4037                                         INP_ADD_STAT(inp, cell, wifi, rxpackets,
4038                                             m->m_pkthdr.lro_npkts);
4039                                 } else {
4040                                         INP_ADD_STAT(inp, cell, wifi, rxpackets, 1);
4041                                 }
4042                                 INP_ADD_STAT(inp, cell, wifi, rxbytes, tlen);
4043                         }
4044                         tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
4045                         so_recv_data_stat(so, m, drop_hdrlen);
4046
4047                         if (sbappendstream_rcvdemux(so, m,
4048                             th->th_seq - (tp->irs + 1), 0)) {
4049                                 sorwakeup(so);
4050                         }
4051                 } else {
4052                         thflags = tcp_reass(tp, th, &tlen, m, ifp);
4053                         tp->t_flags |= TF_ACKNOW;
4054                 }
4055
4056                 if (tlen > 0 && SACK_ENABLED(tp))
4057                         tcp_update_sack_list(tp, save_start, save_end);
4058
4059                 tcp_adaptive_rwtimo_check(tp, tlen);
4060
4061                 if (tp->t_flags & TF_DELACK)
4062                 {
4063 #if INET6
4064                         if (isipv6) {
4065                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4066                                         (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
4067                                         th->th_seq, th->th_ack, th->th_win);
4068                         }
4069                         else
4070 #endif
4071                         {
4072                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4073                                         (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
4074                                         th->th_seq, th->th_ack, th->th_win);
4075                         }
4076
4077                 }
4078         } else {
4079                 m_freem(m);
4080                 thflags &= ~TH_FIN;
4081         }
4082
4083         /*
4084          * If FIN is received ACK the FIN and let the user know
4085          * that the connection is closing.
4086          */
4087         if (thflags & TH_FIN) {
4088                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4089                         socantrcvmore(so);
4090                         postevent(so, 0, EV_FIN);
4091                         /*
4092                          * If connection is half-synchronized
4093                          * (ie NEEDSYN flag on) then delay ACK,
4094                          * so it may be piggybacked when SYN is sent.
4095                          * Otherwise, since we received a FIN then no
4096                          * more input can be expected, send ACK now.
4097                          */
4098                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4099                         if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
4100                                 if ((tp->t_flags & TF_DELACK) == 0) {
4101                                         tp->t_flags |= TF_DELACK;
4102                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
4103                                 }
4104                         }
4105                         else {
4106                                 tp->t_flags |= TF_ACKNOW;
4107                         }
4108                         tp->rcv_nxt++;
4109                 }
4110                 switch (tp->t_state) {
4111
4112                 /*
4113                  * In SYN_RECEIVED and ESTABLISHED STATES
4114                  * enter the CLOSE_WAIT state.
4115                  */
4116                 case TCPS_SYN_RECEIVED:
4117                         tp->t_starttime = tcp_now;
4118                 case TCPS_ESTABLISHED:
4119                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4120                                 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
4121                         tp->t_state = TCPS_CLOSE_WAIT;
4122                         break;
4123
4124                 /*
4125                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
4126                  * enter the CLOSING state.
4127                  */
4128                 case TCPS_FIN_WAIT_1:
4129                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4130                                 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
4131                         tp->t_state = TCPS_CLOSING;
4132                         break;
4133
4134                 /*
4135                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
4136                  * starting the time-wait timer, turning off the other
4137                  * standard timers.
4138                  */
4139                 case TCPS_FIN_WAIT_2:
4140                         DTRACE_TCP4(state__change, void, NULL,
4141                                 struct inpcb *, inp,
4142                                 struct tcpcb *, tp,
4143                                 int32_t, TCPS_TIME_WAIT);
4144                         tp->t_state = TCPS_TIME_WAIT;
4145                         tcp_canceltimers(tp);
4146                         if (tp->cc_recv != 0 &&
4147                                 ((int)(tcp_now - tp->t_starttime)) < tcp_msl) {
4148                                 /* For transaction client, force ACK now. */
4149                                 tp->t_flags |= TF_ACKNOW;
4150                                 tp->t_unacksegs = 0;
4151                         }
4152                         add_to_time_wait(tp, 2 * tcp_msl);
4153                         soisdisconnected(so);
4154                         break;
4155
4156                 /*
4157                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
4158                  */
4159                 case TCPS_TIME_WAIT:
4160                         add_to_time_wait(tp, 2 * tcp_msl);
4161                         break;
4162                 }
4163         }
4164 #if TCPDEBUG
4165         if (so->so_options & SO_DEBUG)
4166                 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
4167                           &tcp_savetcp, 0);
4168 #endif
4169
4170         /*
4171          * Return any desired output.
4172          */
4173         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
4174                 (void) tcp_output(tp);
4175         }
4176
4177         tcp_check_timer_state(tp);
4178
4179
4180         tcp_unlock(so, 1, 0);
4181         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4182         return;
4183
4184 dropafterack:
4185         /*
4186          * Generate an ACK dropping incoming segment if it occupies
4187          * sequence space, where the ACK reflects our state.
4188          *
4189          * We can now skip the test for the RST flag since all
4190          * paths to this code happen after packets containing
4191          * RST have been dropped.
4192          *
4193          * In the SYN-RECEIVED state, don't send an ACK unless the
4194          * segment we received passes the SYN-RECEIVED ACK test.
4195          * If it fails send a RST.  This breaks the loop in the
4196          * "LAND" DoS attack, and also prevents an ACK storm
4197          * between two listening ports that have been sent forged
4198          * SYN segments, each with the source address of the other.
4199          */
4200         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
4201             (SEQ_GT(tp->snd_una, th->th_ack) ||
4202              SEQ_GT(th->th_ack, tp->snd_max)) ) {
4203                 rstreason = BANDLIM_RST_OPENPORT;
4204                 IF_TCP_STATINC(ifp, dospacket);
4205                 goto dropwithreset;
4206         }
4207 #if TCPDEBUG
4208         if (so->so_options & SO_DEBUG)
4209                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4210                           &tcp_savetcp, 0);
4211 #endif
4212         m_freem(m);
4213         tp->t_flags |= TF_ACKNOW;
4214         (void) tcp_output(tp);
4215
4216         /* Don't need to check timer state as we should have done it during tcp_output */
4217         tcp_unlock(so, 1, 0);
4218         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4219         return;
4220 dropwithresetnosock:
4221         nosock = 1;
4222 dropwithreset:
4223         /*
4224          * Generate a RST, dropping incoming segment.
4225          * Make ACK acceptable to originator of segment.
4226          * Don't bother to respond if destination was broadcast/multicast.
4227          */
4228         if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
4229                 goto drop;
4230 #if INET6
4231         if (isipv6) {
4232                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
4233                     IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
4234                         goto drop;
4235         } else
4236 #endif /* INET6 */
4237         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
4238             IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
4239             ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
4240             in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
4241                 goto drop;
4242         /* IPv6 anycast check is done at tcp6_input() */
4243
4244         /*
4245          * Perform bandwidth limiting.
4246          */
4247 #if ICMP_BANDLIM
4248         if (badport_bandlim(rstreason) < 0)
4249                 goto drop;
4250 #endif
4251
4252 #if TCPDEBUG
4253         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4254                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4255                           &tcp_savetcp, 0);
4256 #endif
4257         if (thflags & TH_ACK)
4258                 /* mtod() below is safe as long as hdr dropping is delayed */
4259                 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
4260                     TH_RST, ifscope, nocell);
4261         else {
4262                 if (thflags & TH_SYN)
4263                         tlen++;
4264                 /* mtod() below is safe as long as hdr dropping is delayed */
4265                 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
4266                     (tcp_seq)0, TH_RST|TH_ACK, ifscope, nocell);
4267         }
4268         /* destroy temporarily created socket */
4269         if (dropsocket) {
4270                 (void) soabort(so);
4271                 tcp_unlock(so, 1, 0);
4272         } else if ((inp != NULL) && (nosock == 0)) {
4273                 tcp_unlock(so, 1, 0);
4274         }
4275         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4276         return;
4277 dropnosock:
4278         nosock = 1;
4279 drop:
4280         /*
4281          * Drop space held by incoming segment and return.
4282          */
4283 #if TCPDEBUG
4284         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4285                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4286                           &tcp_savetcp, 0);
4287 #endif
4288         m_freem(m);
4289         /* destroy temporarily created socket */
4290         if (dropsocket) {
4291                 (void) soabort(so);
4292                 tcp_unlock(so, 1, 0);
4293         }
4294         else if (nosock == 0) {
4295                 tcp_unlock(so, 1, 0);
4296         }
4297         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4298         return;
4299 }
4300
4301 static void
4302 tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
4303 /*
4304  * Parse TCP options and place in tcpopt.
4305  */
4306         struct tcpcb *tp;
4307         u_char *cp;
4308         int cnt;
4309         struct tcphdr *th;
4310         struct tcpopt *to;
4311         unsigned int input_ifscope;
4312 {
4313         u_short mss = 0;
4314         int opt, optlen;
4315
4316         for (; cnt > 0; cnt -= optlen, cp += optlen) {
4317                 opt = cp[0];
4318                 if (opt == TCPOPT_EOL)
4319                         break;
4320                 if (opt == TCPOPT_NOP)
4321                         optlen = 1;
4322                 else {
4323                         if (cnt < 2)
4324                                 break;
4325                         optlen = cp[1];
4326                         if (optlen < 2 || optlen > cnt)
4327                                 break;
4328                 }
4329                 switch (opt) {
4330
4331                 default:
4332                         continue;
4333
4334                 case TCPOPT_MAXSEG:
4335                         if (optlen != TCPOLEN_MAXSEG)
4336                                 continue;
4337                         if (!(th->th_flags & TH_SYN))
4338                                 continue;
4339                         bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
4340                         NTOHS(mss);
4341                         break;
4342
4343                 case TCPOPT_WINDOW:
4344                         if (optlen != TCPOLEN_WINDOW)
4345                                 continue;
4346                         if (!(th->th_flags & TH_SYN))
4347                                 continue;
4348                         to->to_flags |= TOF_SCALE;
4349                         tp->t_flags |= TF_RCVD_SCALE;
4350                         tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
4351                         break;
4352
4353                 case TCPOPT_TIMESTAMP:
4354                         if (optlen != TCPOLEN_TIMESTAMP)
4355                                 continue;
4356                         to->to_flags |= TOF_TS;
4357                         bcopy((char *)cp + 2,
4358                             (char *)&to->to_tsval, sizeof(to->to_tsval));
4359                         NTOHL(to->to_tsval);
4360                         bcopy((char *)cp + 6,
4361                             (char *)&to->to_tsecr, sizeof(to->to_tsecr));
4362                         NTOHL(to->to_tsecr);
4363                         /*
4364                          * A timestamp received in a SYN makes
4365                          * it ok to send timestamp requests and replies.
4366                          */
4367                         if (th->th_flags & TH_SYN) {
4368                                 tp->t_flags |= TF_RCVD_TSTMP;
4369                                 tp->ts_recent = to->to_tsval;
4370                                 tp->ts_recent_age = tcp_now;
4371                         }
4372                         break;
4373                 case TCPOPT_SACK_PERMITTED:
4374                         if (!tcp_do_sack ||
4375                             optlen != TCPOLEN_SACK_PERMITTED)
4376                                 continue;
4377                         if (th->th_flags & TH_SYN)
4378                                 to->to_flags |= TOF_SACK;
4379                         break;
4380                 case TCPOPT_SACK:
4381                         if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
4382                                 continue;
4383                         to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
4384                         to->to_sacks = cp + 2;
4385                         tcpstat.tcps_sack_rcv_blocks++;
4386
4387                         break;
4388
4389 #if MPTCP
4390                 case TCPOPT_MULTIPATH:
4391                         tcp_do_mptcp_options(tp, cp, th, to, optlen);
4392                         break;
4393 #endif /* MPTCP */
4394                 }
4395         }
4396         if (th->th_flags & TH_SYN)
4397                 tcp_mss(tp, mss, input_ifscope);        /* sets t_maxseg */
4398 }
4399
4400 /*
4401  * Pull out of band byte out of a segment so
4402  * it doesn't appear in the user's data queue.
4403  * It is still reflected in the segment length for
4404  * sequencing purposes.
4405  */
4406 static void
4407 tcp_pulloutofband(so, th, m, off)
4408         struct socket *so;
4409         struct tcphdr *th;
4410         register struct mbuf *m;
4411         int off;                /* delayed to be droped hdrlen */
4412 {
4413         int cnt = off + th->th_urp - 1;
4414
4415         while (cnt >= 0) {
4416                 if (m->m_len > cnt) {
4417                         char *cp = mtod(m, caddr_t) + cnt;
4418                         struct tcpcb *tp = sototcpcb(so);
4419
4420                         tp->t_iobc = *cp;
4421                         tp->t_oobflags |= TCPOOB_HAVEDATA;
4422                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
4423                         m->m_len--;
4424                         if (m->m_flags & M_PKTHDR)
4425                                 m->m_pkthdr.len--;
4426                         return;
4427                 }
4428                 cnt -= m->m_len;
4429                 m = m->m_next;
4430                 if (m == 0)
4431                         break;
4432         }
4433         panic("tcp_pulloutofband");
4434 }
4435
4436 uint32_t
4437 get_base_rtt(struct tcpcb *tp)
4438 {
4439         uint32_t base_rtt = 0, i;
4440         for (i = 0; i < N_RTT_BASE; ++i) {
4441                 if (tp->rtt_hist[i] != 0 &&
4442                         (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
4443                         base_rtt = tp->rtt_hist[i];
4444         }
4445         return base_rtt;
4446 }
4447
4448 /* Each value of RTT base represents the minimum RTT seen in a minute.
4449  * We keep upto N_RTT_BASE minutes worth of history.
4450  */
4451 void
4452 update_base_rtt(struct tcpcb *tp, uint32_t rtt)
4453 {
4454         int32_t i, qdelay;
4455         u_int32_t base_rtt;
4456
4457         if (++tp->rtt_count >= rtt_samples_per_slot) {
4458 #if TRAFFIC_MGT
4459                 /*
4460                  * If the recv side is being throttled, check if the
4461                  * current RTT is closer to the base RTT seen in
4462                  * first (recent) two slots. If so, unthrottle the stream.
4463                  */
4464                 if (tp->t_flagsext & TF_RECV_THROTTLE) {
4465                         base_rtt = min(tp->rtt_hist[0], tp->rtt_hist[1]);
4466                         qdelay = tp->t_rttcur - base_rtt;
4467                         if (qdelay < target_qdelay)
4468                                 tp->t_flagsext &= ~(TF_RECV_THROTTLE);
4469                 }
4470 #endif /* TRAFFIC_MGT */
4471
4472                 for (i = (N_RTT_BASE-1); i > 0; --i) {
4473                         tp->rtt_hist[i] = tp->rtt_hist[i-1];
4474                 }
4475                 tp->rtt_hist[0] = rtt;
4476                 tp->rtt_count = 0;
4477         } else {
4478                 tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
4479         }
4480 }
4481
4482 /*
4483  * If we have a timestamp reply, update smoothed RTT. If no timestamp is
4484  * present but transmit timer is running and timed sequence number was
4485  * acked, update smoothed RTT.
4486  *
4487  * If timestamps are supported, a receiver can update RTT even if
4488  * there is no outstanding data.
4489  *
4490  * Some boxes send broken timestamp replies during the SYN+ACK phase,
4491  * ignore timestamps of 0or we could calculate a huge RTT and blow up
4492  * the retransmit timer.
4493  */
4494 static void
4495 tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4496 {
4497         VERIFY(to != NULL && th != NULL);
4498         if (((to->to_flags & TOF_TS) != 0) &&
4499                 (to->to_tsecr != 0) &&
4500                 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
4501                 tcp_xmit_timer(tp, tcp_now - to->to_tsecr,
4502                         to->to_tsecr, th->th_ack);
4503         } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
4504                 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime, 0,
4505                         th->th_ack);
4506         }
4507 }
4508
4509 /*
4510  * Collect new round-trip time estimate
4511  * and update averages and current timeout.
4512  */
4513 static void
4514 tcp_xmit_timer(register struct tcpcb *tp, int rtt,
4515         u_int32_t tsecr, tcp_seq th_ack)
4516 {
4517         register int delta;
4518
4519         if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
4520                 if (SEQ_GT(th_ack, tp->snd_una) &&
4521                     SEQ_LEQ(th_ack, tp->snd_max) &&
4522                     (tsecr == 0 ||
4523                     TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
4524                         /*
4525                          * We received a new ACk after a
4526                          * spurious timeout. Adapt retransmission
4527                          * timer as described in rfc 4015.
4528                          */
4529                         tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
4530                         tp->t_badrexmt_time = 0;
4531                         tp->t_srtt = max(tp->t_srtt_prev, rtt);
4532                         tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
4533                         tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
4534                         tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
4535
4536                         if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
4537                                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
4538
4539                         goto compute_rto;
4540                 } else {
4541                         return;
4542                 }
4543         }
4544
4545         tcpstat.tcps_rttupdated++;
4546         tp->t_rttupdated++;
4547
4548         if (rtt > 0) {
4549                 tp->t_rttcur = rtt;
4550                 update_base_rtt(tp, rtt);
4551         }
4552
4553         if (tp->t_srtt != 0) {
4554                 /*
4555                  * srtt is stored as fixed point with 5 bits after the
4556                  * binary point (i.e., scaled by 32).  The following magic
4557                  * is equivalent to the smoothing algorithm in rfc793 with
4558                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
4559                  * point).
4560                  *
4561                  * Freebsd adjusts rtt to origin 0 by subtracting 1
4562                  * from the provided rtt value. This was required because
4563                  * of the way t_rtttime was initiailised to 1 before.
4564                  * Since we changed t_rtttime to be based on
4565                  * tcp_now, this extra adjustment is not needed.
4566                  */
4567                 delta = (rtt << TCP_DELTA_SHIFT)
4568                         - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
4569
4570                 if ((tp->t_srtt += delta) <= 0)
4571                         tp->t_srtt = 1;
4572
4573                 /*
4574                  * We accumulate a smoothed rtt variance (actually, a
4575                  * smoothed mean difference), then set the retransmit
4576                  * timer to smoothed rtt + 4 times the smoothed variance.
4577                  * rttvar is stored as fixed point with 4 bits after the
4578                  * binary point (scaled by 16).  The following is
4579                  * equivalent to rfc793 smoothing with an alpha of .75
4580                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
4581                  * rfc793's wired-in beta.
4582                  */
4583                 if (delta < 0)
4584                         delta = -delta;
4585                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
4586                 if ((tp->t_rttvar += delta) <= 0)
4587                         tp->t_rttvar = 1;
4588                 if (tp->t_rttbest == 0  ||
4589                         tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
4590                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
4591         } else {
4592                 /*
4593                  * No rtt measurement yet - use the unsmoothed rtt.
4594                  * Set the variance to half the rtt (so our first
4595                  * retransmit happens at 3*rtt).
4596                  */
4597                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
4598                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
4599         }
4600
4601 compute_rto:
4602         nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
4603                 tp->t_rttvar);
4604         tp->t_rtttime = 0;
4605         tp->t_rxtshift = 0;
4606         tp->t_rxtstart = 0;
4607
4608         /*
4609          * the retransmit should happen at rtt + 4 * rttvar.
4610          * Because of the way we do the smoothing, srtt and rttvar
4611          * will each average +1/2 tick of bias.  When we compute
4612          * the retransmit timer, we want 1/2 tick of rounding and
4613          * 1 extra tick because of +-1/2 tick uncertainty in the
4614          * firing of the timer.  The bias will give us exactly the
4615          * 1.5 tick we need.  But, because the bias is
4616          * statistical, we have to test that we don't drop below
4617          * the minimum feasible timer (which is 2 ticks).
4618          */
4619         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
4620                 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
4621                 TCP_ADD_REXMTSLOP(tp));
4622
4623         /*
4624          * We received an ack for a packet that wasn't retransmitted;
4625          * it is probably safe to discard any error indications we've
4626          * received recently.  This isn't quite right, but close enough
4627          * for now (a route might have failed after we sent a segment,
4628          * and the return path might not be symmetrical).
4629          */
4630         tp->t_softerror = 0;
4631 }
4632
4633 static inline unsigned int
4634 tcp_maxmtu(struct rtentry *rt)
4635 {
4636         unsigned int maxmtu;
4637
4638         RT_LOCK_ASSERT_HELD(rt);
4639         if (rt->rt_rmx.rmx_mtu == 0)
4640                 maxmtu = rt->rt_ifp->if_mtu;
4641         else
4642                 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
4643
4644         return (maxmtu);
4645 }
4646
4647 #if INET6
4648 static inline unsigned int
4649 tcp_maxmtu6(struct rtentry *rt)
4650 {
4651         unsigned int maxmtu;
4652         struct nd_ifinfo *ndi;
4653
4654         RT_LOCK_ASSERT_HELD(rt);
4655         lck_rw_lock_shared(nd_if_rwlock);
4656         if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized)
4657                 ndi = NULL;
4658         if (ndi != NULL)
4659                 lck_mtx_lock(&ndi->lock);
4660         if (rt->rt_rmx.rmx_mtu == 0)
4661                 maxmtu = IN6_LINKMTU(rt->rt_ifp);
4662         else
4663                 maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
4664         if (ndi != NULL)
4665                 lck_mtx_unlock(&ndi->lock);
4666         lck_rw_done(nd_if_rwlock);
4667
4668         return (maxmtu);
4669 }
4670 #endif
4671
4672 /*
4673  * Determine a reasonable value for maxseg size.
4674  * If the route is known, check route for mtu.
4675  * If none, use an mss that can be handled on the outgoing
4676  * interface without forcing IP to fragment; if bigger than
4677  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
4678  * to utilize large mbufs.  If no route is found, route has no mtu,
4679  * or the destination isn't local, use a default, hopefully conservative
4680  * size (usually 512 or the default IP max size, but no more than the mtu
4681  * of the interface), as we can't discover anything about intervening
4682  * gateways or networks.  We also initialize the congestion/slow start
4683  * window to be a single segment if the destination isn't local.
4684  * While looking at the routing entry, we also initialize other path-dependent
4685  * parameters from pre-set or cached values in the routing entry.
4686  *
4687  * Also take into account the space needed for options that we
4688  * send regularly.  Make maxseg shorter by that amount to assure
4689  * that we can send maxseg amount of data even when the options
4690  * are present.  Store the upper limit of the length of options plus
4691  * data in maxopd.
4692  *
4693  * NOTE that this routine is only called when we process an incoming
4694  * segment, for outgoing segments only tcp_mssopt is called.
4695  *
4696  */
4697 void
4698 tcp_mss(tp, offer, input_ifscope)
4699         struct tcpcb *tp;
4700         int offer;
4701         unsigned int input_ifscope;
4702 {
4703         register struct rtentry *rt;
4704         struct ifnet *ifp;
4705         register int rtt, mss;
4706         u_int32_t bufsize;
4707         struct inpcb *inp;
4708         struct socket *so;
4709         struct rmxp_tao *taop;
4710         int origoffer = offer;
4711         u_int32_t sb_max_corrected;
4712         int isnetlocal = 0;
4713 #if INET6
4714         int isipv6;
4715         int min_protoh;
4716 #endif
4717
4718         inp = tp->t_inpcb;
4719 #if INET6
4720         isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
4721         min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
4722                             : sizeof (struct tcpiphdr);
4723 #else
4724 #define min_protoh  (sizeof (struct tcpiphdr))
4725 #endif
4726
4727 #if INET6
4728         if (isipv6) {
4729                 rt = tcp_rtlookup6(inp, input_ifscope);
4730         }
4731         else
4732 #endif /* INET6 */
4733         {
4734                 rt = tcp_rtlookup(inp, input_ifscope);
4735         }
4736         isnetlocal = (tp->t_flags & TF_LOCAL);
4737
4738         if (rt == NULL) {
4739                 tp->t_maxopd = tp->t_maxseg =
4740 #if INET6
4741                 isipv6 ? tcp_v6mssdflt :
4742 #endif /* INET6 */
4743                 tcp_mssdflt;
4744                 return;
4745         }
4746         ifp = rt->rt_ifp;
4747         /*
4748          * Slower link window correction:
4749          * If a value is specificied for slowlink_wsize use it for PPP links
4750          * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
4751          * it is the default value adversized by pseudo-devices over ppp.
4752          */
4753         if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
4754             ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
4755                 tp->t_flags |= TF_SLOWLINK;
4756         }
4757         so = inp->inp_socket;
4758
4759         taop = rmx_taop(rt->rt_rmx);
4760         /*
4761          * Offer == -1 means that we didn't receive SYN yet,
4762          * use cached value in that case;
4763          */
4764         if (offer == -1)
4765                 offer = taop->tao_mssopt;
4766         /*
4767          * Offer == 0 means that there was no MSS on the SYN segment,
4768          * in this case we use tcp_mssdflt.
4769          */
4770         if (offer == 0)
4771                 offer =
4772 #if INET6
4773                         isipv6 ? tcp_v6mssdflt :
4774 #endif /* INET6 */
4775                         tcp_mssdflt;
4776         else {
4777                 /*
4778                  * Prevent DoS attack with too small MSS. Round up
4779                  * to at least minmss.
4780                  */
4781                 offer = max(offer, tcp_minmss);
4782                 /*
4783                  * Sanity check: make sure that maxopd will be large
4784                  * enough to allow some data on segments even is the
4785                  * all the option space is used (40bytes).  Otherwise
4786                  * funny things may happen in tcp_output.
4787                  */
4788                 offer = max(offer, 64);
4789         }
4790         taop->tao_mssopt = offer;
4791
4792         /*
4793          * While we're here, check if there's an initial rtt
4794          * or rttvar.  Convert from the route-table units
4795          * to scaled multiples of the slow timeout timer.
4796          */
4797         if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
4798                 tcp_getrt_rtt(tp, rt);
4799         } else {
4800                 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
4801         }
4802
4803 #if INET6
4804         mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
4805 #else
4806         mss = tcp_maxmtu(rt);
4807 #endif
4808         mss -= min_protoh;
4809
4810         if (rt->rt_rmx.rmx_mtu == 0) {
4811 #if INET6
4812                 if (isipv6) {
4813                         if (!isnetlocal)
4814                                 mss = min(mss, tcp_v6mssdflt);
4815                 } else
4816 #endif /* INET6 */
4817                 if (!isnetlocal)
4818                         mss = min(mss, tcp_mssdflt);
4819         }
4820
4821         mss = min(mss, offer);
4822         /*
4823          * maxopd stores the maximum length of data AND options
4824          * in a segment; maxseg is the amount of data in a normal
4825          * segment.  We need to store this value (maxopd) apart
4826          * from maxseg, because now every segment carries options
4827          * and thus we normally have somewhat less data in segments.
4828          */
4829         tp->t_maxopd = mss;
4830
4831         /*
4832          * origoffer==-1 indicates, that no segments were received yet.
4833          * In this case we just guess.
4834          */
4835         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
4836             (origoffer == -1 ||
4837              (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
4838                 mss -= TCPOLEN_TSTAMP_APPA;
4839
4840 #if MPTCP
4841         mss -= mptcp_adj_mss(tp, FALSE);
4842 #endif /* MPTCP */
4843         tp->t_maxseg = mss;
4844
4845         /*
4846          * Calculate corrected value for sb_max; ensure to upgrade the
4847          * numerator for large sb_max values else it will overflow.
4848          */
4849         sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
4850
4851         /*
4852          * If there's a pipesize (ie loopback), change the socket
4853          * buffer to that size only if it's bigger than the current
4854          * sockbuf size.  Make the socket buffers an integral
4855          * number of mss units; if the mss is larger than
4856          * the socket buffer, decrease the mss.
4857          */
4858 #if RTV_SPIPE
4859         bufsize = rt->rt_rmx.rmx_sendpipe;
4860         if (bufsize < so->so_snd.sb_hiwat)
4861 #endif
4862                 bufsize = so->so_snd.sb_hiwat;
4863         if (bufsize < mss)
4864                 mss = bufsize;
4865         else {
4866                 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
4867                 if (bufsize > sb_max_corrected)
4868                         bufsize = sb_max_corrected;
4869                 (void)sbreserve(&so->so_snd, bufsize);
4870         }
4871         tp->t_maxseg = mss;
4872
4873 #if RTV_RPIPE
4874         bufsize = rt->rt_rmx.rmx_recvpipe;
4875         if (bufsize < so->so_rcv.sb_hiwat)
4876 #endif
4877                 bufsize = so->so_rcv.sb_hiwat;
4878         if (bufsize > mss) {
4879                 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
4880                 if (bufsize > sb_max_corrected)
4881                         bufsize = sb_max_corrected;
4882                 (void)sbreserve(&so->so_rcv, bufsize);
4883         }
4884
4885         set_tcp_stream_priority(so);
4886
4887         if (rt->rt_rmx.rmx_ssthresh) {
4888                 /*
4889                  * There's some sort of gateway or interface
4890                  * buffer limit on the path.  Use this to set
4891                  * the slow start threshhold, but set the
4892                  * threshold to no less than 2*mss.
4893                  */
4894                 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
4895                 tcpstat.tcps_usedssthresh++;
4896         } else {
4897                 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
4898         }
4899
4900
4901         /*
4902          * Set the slow-start flight size depending on whether this
4903          * is a local network or not.
4904          */
4905         if (CC_ALGO(tp)->cwnd_init != NULL)
4906                 CC_ALGO(tp)->cwnd_init(tp);
4907
4908         DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, struct tcpcb *, tp,
4909                 struct tcphdr *, NULL, int32_t, TCP_CC_CWND_INIT);
4910
4911         /* Route locked during lookup above */
4912         RT_UNLOCK(rt);
4913 }
4914
4915 /*
4916  * Determine the MSS option to send on an outgoing SYN.
4917  */
4918 int
4919 tcp_mssopt(tp)
4920         struct tcpcb *tp;
4921 {
4922         struct rtentry *rt;
4923         int mss;
4924 #if INET6
4925         int isipv6;
4926         int min_protoh;
4927 #endif
4928
4929 #if INET6
4930         isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
4931         min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
4932                             : sizeof (struct tcpiphdr);
4933 #else
4934 #define min_protoh  (sizeof (struct tcpiphdr))
4935 #endif
4936
4937 #if INET6
4938         if (isipv6)
4939                 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
4940         else
4941 #endif /* INET6 */
4942         rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
4943         if (rt == NULL) {
4944                 return (
4945 #if INET6
4946                         isipv6 ? tcp_v6mssdflt :
4947 #endif /* INET6 */
4948                         tcp_mssdflt);
4949         }
4950         /*
4951          * Slower link window correction:
4952          * If a value is specificied for slowlink_wsize use it for PPP links
4953          * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
4954          * it is the default value adversized by pseudo-devices over ppp.
4955          */
4956         if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
4957             rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
4958                 tp->t_flags |= TF_SLOWLINK;
4959         }
4960
4961 #if INET6
4962         mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
4963 #else
4964         mss = tcp_maxmtu(rt);
4965 #endif
4966         /* Route locked during lookup above */
4967         RT_UNLOCK(rt);
4968         return (mss - min_protoh);
4969 }
4970
4971 /*
4972  * On a partial ack arrives, force the retransmission of the
4973  * next unacknowledged segment.  Do not clear tp->t_dupacks.
4974  * By setting snd_nxt to th_ack, this forces retransmission timer to
4975  * be started again.
4976  */
4977 static void
4978 tcp_newreno_partial_ack(tp, th)
4979         struct tcpcb *tp;
4980         struct tcphdr *th;
4981 {
4982                 tcp_seq onxt = tp->snd_nxt;
4983                 u_int32_t  ocwnd = tp->snd_cwnd;
4984                 tp->t_timer[TCPT_REXMT] = 0;
4985                 tp->t_rtttime = 0;
4986                 tp->snd_nxt = th->th_ack;
4987                 /*
4988                  * Set snd_cwnd to one segment beyond acknowledged offset
4989                  * (tp->snd_una has not yet been updated when this function
4990                  *  is called)
4991                  */
4992                 tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
4993                 tp->t_flags |= TF_ACKNOW;
4994                 (void) tcp_output(tp);
4995                 tp->snd_cwnd = ocwnd;
4996                 if (SEQ_GT(onxt, tp->snd_nxt))
4997                         tp->snd_nxt = onxt;
4998                 /*
4999                  * Partial window deflation.  Relies on fact that tp->snd_una
5000                  * not updated yet.
5001                  */
5002                 if (tp->snd_cwnd > BYTES_ACKED(th, tp))
5003                         tp->snd_cwnd -= BYTES_ACKED(th, tp);
5004                 else
5005                         tp->snd_cwnd = 0;
5006                 tp->snd_cwnd += tp->t_maxseg;
5007
5008 }
5009
5010 /*
5011  * Drop a random TCP connection that hasn't been serviced yet and
5012  * is eligible for discard.  There is roughly a one in qlen chance
5013  * that we will return 0, saying that there are no droppable
5014  * requests.  In this case, the protocol specific code should drop
5015  * the new request.  This ensures fairness.
5016  *
5017  * The listening TCP socket "head" must be locked
      * by the caller.  On the path that actually drops a victim the head
      * lock is released and then re-acquired before returning.
      *
      * Returns 1 if a connection was removed from head's incomplete queue
      * and closed, 0 if nothing was dropped.
5018  */
5019 static int
5020 tcp_dropdropablreq(struct socket *head)
5021 {
5022         struct socket *so, *sonext;
5023         unsigned int i, j, qlen;
             /* State of the pseudo-random generator, kept across calls. */
5024         static u_int32_t rnd = 0;
             /*
              * Call-rate bookkeeping kept across calls: cur_cnt counts the
              * calls since old_runtime, old_cnt is the per-second rate
              * computed for the previous interval.
              */
5025         static u_int64_t old_runtime;
5026         static unsigned int cur_cnt, old_cnt;
5027         u_int64_t now_sec;
5028         struct inpcb *inp = NULL;
5029         struct tcpcb *tp;
5030
             /* Only listening sockets with pending connections qualify. */
5031         if ((head->so_options & SO_ACCEPTCONN) == 0)
5032                 return (0);
5033
5034         if (TAILQ_EMPTY(&head->so_incomp))
5035                 return (0);
5036
5037         /*
5038          * Check if there is any socket in the incomp queue
5039          * that is closed because of a reset from the peer and is
5040          * waiting to be garbage collected. If so, pick that as
5041          * the victim
5042          */
5043         TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
5044                 inp = sotoinpcb(so);
5045                 tp = intotcpcb(inp);
5046                 if (tp != NULL && tp->t_state == TCPS_CLOSED &&
5047                     so->so_head != NULL &&
5048                     (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
5049                     (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) {
5050                         /*
5051                          * The listen socket is already locked but we
5052                          * can lock this socket here without lock ordering
5053                          * issues because it is in the incomp queue and
5054                          * is not visible to others.
5055                          */
5056                         if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
                                     /* Reference held for this function. */
5057                                 so->so_usecount++;
5058                                 goto found_victim;
5059                         } else {
5060                                 continue;
5061                         }
5062                 }
5063         }
5064
5065         so = TAILQ_FIRST(&head->so_incomp);
5066
             /*
              * When at least one second went by since the last update, turn
              * the number of calls seen meanwhile into a per-second rate and
              * restart the count.
              */
5067         now_sec = net_uptime();
5068         if ((i = (now_sec - old_runtime)) != 0) {
5069                 old_runtime = now_sec;
5070                 old_cnt = cur_cnt / i;
5071                 cur_cnt = 0;
5072         }
5073
5074
5075         qlen = head->so_incqlen;
5076         if (rnd == 0)
5077                 rnd = RandomULong();
5078
             /*
              * When drops are requested more often than the queue is long,
              * start the search from a pseudo-random position instead of the
              * head of the queue: rnd is kept to 16 bits, so j falls in
              * [0, qlen].  Walking off the end of the queue leaves so NULL
              * and nothing is dropped.
              */
5079         if (++cur_cnt > qlen || old_cnt > qlen) {
5080                 rnd = (314159 * rnd + 66329) & 0xffff;
5081                 j = ((qlen + 1) * rnd) >> 16;
5082
5083                 while (j-- && so)
5084                         so = TAILQ_NEXT(so, so_list);
5085         }
5086         /* Find a connection that is not already closing (or being served) */
5087         while (so) {
5088                 inp = (struct inpcb *)so->so_pcb;
5089
5090                 sonext = TAILQ_NEXT(so, so_list);
5091
5092                 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0)
5093                         != WNT_STOPUSING) {
5094                         /*
5095                          * Avoid the issue of a socket being accepted
5096                          * by one input thread and being dropped by
5097                          * another input thread. If we can't get a hold
5098                          * on this mutex, then grab the next socket in
5099                          * line.
5100                          */
5101                         if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5102                                 so->so_usecount++;
                                     /*
                                      * A use count of exactly 2 means only the
                                      * incomp queue and this function hold a
                                      * reference.
                                      */
5103                                 if ((so->so_usecount == 2) &&
5104                                     (so->so_state & SS_INCOMP) &&
5105                                     !(so->so_flags & SOF_INCOMP_INPROGRESS))  {
5106                                         break;
5107                                 } else {
5108                                         /*
5109                                          * don't use if being accepted or
5110                                          * used in any other way
5111                                          */
5112                                         in_pcb_checkstate(inp, WNT_RELEASE, 1);
5113                                         tcp_unlock(so, 1, 0);
5114                                 }
5115                         } else {
5116                                 /*
5117                                  * do not try to lock the inp in
5118                                  * in_pcb_checkstate because the lock
5119                                  * is already held in some other thread.
5120                                  * Only drop the inp_wntcnt reference.
5121                                  */
5122                                 in_pcb_checkstate(inp, WNT_RELEASE, 1);
5123                         }
5124                 }
5125                 so = sonext;
5126
5127         }
             /* Ran off the end of the queue: no candidate found. */
5128         if (so == NULL) {
5129                 return (0);
5130         }
5131
5132         /* Makes sure socket is still in the right state to be discarded */
5133
5134         if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
5135                 tcp_unlock(so, 1, 0);
5136                 return (0);
5137         }
5138
             /*
              * From here on the victim "so" is locked and carries the extra
              * use count taken above.
              */
5139 found_victim:
5140         if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
5141                 /* do not discard: that socket is being accepted */
5142                 tcp_unlock(so, 1, 0);
5143                 return (0);
5144         }
5145
             /* Detach the victim from the listener, then let go of the head. */
5146         TAILQ_REMOVE(&head->so_incomp, so, so_list);
5147         tcp_unlock(head, 0, 0);
5148
5149         lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
5150         tp = sototcpcb(so);
5151         so->so_flags |= SOF_OVERFLOW;
5152         so->so_head = NULL;
5153
5154         tcp_close(tp);
5155         if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
5156                 /*
5157                  * Some one has a wantcnt on this pcb. Since WNT_ACQUIRE
5158                  * doesn't require a lock, it could have happened while
5159                  * we are holding the lock. This pcb will have to
5160                  * be garbage collected later.
5161                  * Release the reference held for so_incomp queue
5162                  */
5163                 so->so_usecount--;
5164                 tcp_unlock(so, 1, 0);
5165         } else {
5166                 /*
5167                  * Unlock this socket and leave the reference on.
5168                  * We need to acquire the pcbinfo lock in order to
5169                  * fully dispose it off
5170                  */
5171                 tcp_unlock(so, 0, 0);
5172
5173                 lck_rw_lock_exclusive(tcbinfo.ipi_lock);
5174
5175                 tcp_lock(so, 0, 0);
5176                 /* Release the reference held for so_incomp queue */
5177                 so->so_usecount--;
5178
5179                 if (so->so_usecount != 1 ||
5180                     (inp->inp_wantcnt > 0 &&
5181                     inp->inp_wantcnt != WNT_STOPUSING)) {
5182                         /*
5183                          * There is an extra wantcount or usecount
5184                          * that must have been added when the socket
5185                          * was unlocked. This socket will have to be
5186                          * garbage collected later
5187                          */
5188                         tcp_unlock(so, 1, 0);
5189                 } else {
5190
5191                         /* Drop the reference held for this function */
5192                         so->so_usecount--;
5193
5194                         in_pcbdispose(inp);
5195                 }
5196                 lck_rw_done(tcbinfo.ipi_lock);
5197         }
5198         tcpstat.tcps_drops++;
5199
             /* Re-take the head lock and account for the removed connection. */
5200         tcp_lock(head, 0, 0);
5201         head->so_incqlen--;
5202         head->so_qlen--;
5203         return(1);
5204 }
5205
5206 /*
      * Set background congestion control on a socket: switch the
      * connection behind "so" to the algorithm registered at
      * TCP_CC_ALGO_BACKGROUND_INDEX (see tcp_set_new_cc()).
      */
5207 void
5208 tcp_set_background_cc(struct socket *so)
5209 {
5210         tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
5211 }
5212
5213 /*
      * Set foreground congestion control on a socket: switch the
      * connection behind "so" to the algorithm registered at
      * TCP_CC_ALGO_NEWRENO_INDEX (see tcp_set_new_cc()).
      */
5214 void
5215 tcp_set_foreground_cc(struct socket *so)
5216 {
5217         tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
5218 }
5219
5220 static void
5221 tcp_set_new_cc(struct socket *so, uint16_t cc_index)
5222 {
5223         struct inpcb *inp = sotoinpcb(so);
5224         struct tcpcb *tp = intotcpcb(inp);
5225         u_char old_cc_index = 0;
5226         if (tp->tcp_cc_index != cc_index) {
5227
5228                 old_cc_index = tp->tcp_cc_index;
5229
5230                 if (CC_ALGO(tp)->cleanup != NULL)
5231                         CC_ALGO(tp)->cleanup(tp);
5232                 tp->tcp_cc_index = cc_index;
5233
5234                 /* Decide if the connection is just starting or if
5235                  * we have sent some packets on it.
5236                  */
5237                 if (tp->snd_nxt > tp->iss) {
5238                         /* Already sent some packets */
5239                         if (CC_ALGO(tp)->switch_to != NULL)
5240                                 CC_ALGO(tp)->switch_to(tp, old_cc_index);
5241                 } else {
5242                         if (CC_ALGO(tp)->init != NULL)
5243                                 CC_ALGO(tp)->init(tp);
5244                 }
5245                 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
5246                         struct tcpcb *, tp, struct tcphdr *, NULL,
5247                         int32_t, TCP_CC_CHANGE_ALGO);
5248         }
5249 }
5250
/*
 * Mark the socket as a background receiver: set TRAFFIC_MGT_TCP_RECVBG
 * in so_traffic_mgt_flags unless IS_TCP_RECV_BG() already holds, then
 * update the socket's LRO setting for the SO_TC_BK traffic class.
 */
5251 void
5252 tcp_set_recv_bg(struct socket *so)
5253 {
5254         if (!IS_TCP_RECV_BG(so))
5255                 so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG;
5256
5257         /* Unset Large Receive Offload on background sockets */
5258         so_set_lro(so, SO_TC_BK);
5259 }
5260
/*
 * Undo tcp_set_recv_bg(): clear TRAFFIC_MGT_TCP_RECVBG from
 * so_traffic_mgt_flags when IS_TCP_RECV_BG() holds, then update the
 * socket's LRO setting for its own traffic class.
 */
5261 void
5262 tcp_clear_recv_bg(struct socket *so)
5263 {
5264         if (IS_TCP_RECV_BG(so))
5265                 so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
5266
5267         /*
5268          * Set/unset use of Large Receive Offload depending on
5269          * the traffic class
5270          */
5271         so_set_lro(so, so->so_traffic_class);
5272 }
5273
/*
 * Restart output on the TCP connection attached to "inp".  Per the
 * comments below, this is meant for the case where sending was held up
 * by flow control on the outgoing interface.
 *
 * The congestion control algorithm's pre_fr hook (if any) is run, the
 * congestion window is set to the slow-start threshold, the ABC byte
 * count and the retransmit backoff shift are reset, and tcp_output()
 * is called.
 *
 * NOTE(review): inp->inp_ppcb is dereferenced without a NULL check;
 * presumably callers guarantee that a tcpcb is attached -- confirm.
 */
5274 void
5275 inp_fc_unthrottle_tcp(struct inpcb *inp)
5276 {
5277         struct tcpcb *tp = inp->inp_ppcb;
5278         /*
5279          * Back off the slow-start threshold and enter
5280          * congestion avoidance phase
5281          */
5282         if (CC_ALGO(tp)->pre_fr != NULL)
5283                 CC_ALGO(tp)->pre_fr(tp);
5284
5285         tp->snd_cwnd = tp->snd_ssthresh;
5286
5287         /*
5288          * Restart counting for ABC as we changed the
5289          * congestion window just now.
5290          */
5291         tp->t_bytes_acked = 0;
5292
5293         /* Reset retransmit shift as we know that the reason
5294          * for delay in sending a packet is due to flow
5295          * control on the outgoing interface. There is no need
5296          * to backoff retransmit timer.
5297          */
5298         tp->t_rxtshift = 0;
5299
5300         /*
5301          * Start the output stream again. Since we are
5302          * not retransmitting data, do not reset the
5303          * retransmit timer or rtt calculation.
5304          */
5305         tcp_output(tp);
5306 }
5307
5308 static int
5309 tcp_getstat SYSCTL_HANDLER_ARGS
5310 {
5311 #pragma unused(oidp, arg1, arg2)
5312
5313         int error;
5314
5315         proc_t caller = PROC_NULL;
5316         proc_t caller_parent = PROC_NULL;
5317         char command_name[MAXCOMLEN + 1] = "";
5318         char parent_name[MAXCOMLEN + 1] = "";
5319
5320         if ((caller = proc_self()) != PROC_NULL) {
5321                 /* get process name */
5322                 strlcpy(command_name, caller->p_comm, sizeof(command_name));
5323
5324                 /* get parent process name if possible */
5325                 if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
5326                         strlcpy(parent_name, caller_parent->p_comm,
5327                             sizeof(parent_name));
5328                         proc_rele(caller_parent);
5329                 }
5330
5331                 if ((escape_str(command_name, strlen(command_name),
5332                     sizeof(command_name)) == 0) &&
5333                     (escape_str(parent_name, strlen(parent_name),
5334                     sizeof(parent_name)) == 0)) {
5335                         kern_asl_msg(LOG_DEBUG, "messagetracer",
5336                             5,
5337                             "com.apple.message.domain",
5338                             "com.apple.kernel.tcpstat", /* 1 */
5339                             "com.apple.message.signature",
5340                             "tcpstat", /* 2 */
5341                             "com.apple.message.signature2", command_name, /* 3 */
5342                             "com.apple.message.signature3", parent_name, /* 4 */
5343                             "com.apple.message.summarize", "YES", /* 5 */
5344                             NULL);
5345                 }
5346         }
5347         if (caller != PROC_NULL)
5348                 proc_rele(caller);
5349
5350         if (req->oldptr == 0) {
5351                 req->oldlen= (size_t)sizeof(struct tcpstat);
5352         }
5353
5354         error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
5355
5356         return (error);
5357
5358 }
5359
5360 /*
5361  * Checksum extended TCP header and data.
5362  */
5363 int
5364 tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
5365 {
5366         struct ifnet *ifp = m->m_pkthdr.rcvif;
5367
5368         switch (af) {
5369         case AF_INET: {
5370                 struct ip *ip = mtod(m, struct ip *);
5371                 struct ipovly *ipov = (struct ipovly *)ip;
5372
5373                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
5374                         return (0);
5375
5376                 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
5377                     (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
5378                     (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
5379                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5380                                 th->th_sum = m->m_pkthdr.csum_rx_val;
5381                         } else {
5382                                 uint16_t sum = m->m_pkthdr.csum_rx_val;
5383                                 uint16_t start = m->m_pkthdr.csum_rx_start;
5384
5385                                 /*
5386                                  * Perform 1's complement adjustment of octets
5387                                  * that got included/excluded in the hardware-
5388                                  * calculated checksum value.  Ignore cases
5389                                  * where the value includes or excludes the IP
5390                                  * header span, as the sum for those octets
5391                                  * would already be 0xffff and thus no-op.
5392                                  */
5393                                 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
5394                                     start != 0 && (off - start) != off) {
5395 #if BYTE_ORDER != BIG_ENDIAN
5396                                         if (start < off) {
5397                                                 HTONS(ip->ip_len);
5398                                                 HTONS(ip->ip_off);
5399                                         }
5400 #endif
5401                                         /* callee folds in sum */
5402                                         sum = m_adj_sum16(m, start, off, sum);
5403 #if BYTE_ORDER != BIG_ENDIAN
5404                                         if (start < off) {
5405                                                 NTOHS(ip->ip_off);
5406                                                 NTOHS(ip->ip_len);
5407                                         }
5408 #endif
5409                                 }
5410
5411                                 /* callee folds in sum */
5412                                 th->th_sum = in_pseudo(ip->ip_src.s_addr,
5413                                     ip->ip_dst.s_addr,
5414                                     sum + htonl(tlen + IPPROTO_TCP));
5415                         }
5416                         th->th_sum ^= 0xffff;
5417                 } else {
5418                         uint16_t ip_sum;
5419                         int len;
5420                         char b[9];
5421
5422                         bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
5423                         bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
5424                         ip_sum = ipov->ih_len;
5425                         ipov->ih_len = (u_short)tlen;
5426 #if BYTE_ORDER != BIG_ENDIAN
5427                         HTONS(ipov->ih_len);
5428 #endif
5429                         len = sizeof (struct ip) + tlen;
5430                         th->th_sum = in_cksum(m, len);
5431                         bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
5432                         ipov->ih_len = ip_sum;
5433
5434                         tcp_in_cksum_stats(len);
5435                 }
5436                 break;
5437         }
5438 #if INET6
5439         case AF_INET6: {
5440                 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
5441
5442                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
5443                         return (0);
5444
5445                 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
5446                     (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
5447                     (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
5448                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5449                                 th->th_sum = m->m_pkthdr.csum_rx_val;
5450                         } else {
5451                                 uint16_t sum = m->m_pkthdr.csum_rx_val;
5452                                 uint16_t start = m->m_pkthdr.csum_rx_start;
5453
5454                                 /*
5455                                  * Perform 1's complement adjustment of octets
5456                                  * that got included/excluded in the hardware-
5457                                  * calculated checksum value.
5458                                  */
5459                                 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
5460                                     start != off) {
5461                                         uint16_t s, d;
5462
5463                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
5464                                                 s = ip6->ip6_src.s6_addr16[1];
5465                                                 ip6->ip6_src.s6_addr16[1] = 0 ;
5466                                         }
5467                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
5468                                                 d = ip6->ip6_dst.s6_addr16[1];
5469                                                 ip6->ip6_dst.s6_addr16[1] = 0;
5470                                         }
5471
5472                                         /* callee folds in sum */
5473                                         sum = m_adj_sum16(m, start, off, sum);
5474
5475                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
5476                                                 ip6->ip6_src.s6_addr16[1] = s;
5477                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
5478                                                 ip6->ip6_dst.s6_addr16[1] = d;
5479                                 }
5480
5481                                 th->th_sum = in6_pseudo(
5482                                     &ip6->ip6_src, &ip6->ip6_dst,
5483                                     sum + htonl(tlen + IPPROTO_TCP));
5484                         }
5485                         th->th_sum ^= 0xffff;
5486                 } else {
5487                         tcp_in6_cksum_stats(tlen);
5488                         th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
5489                 }
5490                 break;
5491         }
5492 #endif /* INET6 */
5493         default:
5494                 VERIFY(0);
5495                 /* NOTREACHED */
5496         }
5497
5498         if (th->th_sum != 0) {
5499                 tcpstat.tcps_rcvbadsum++;
5500                 IF_TCP_STATINC(ifp, badformat);
5501                 return (-1);
5502         }
5503
5504         return (0);
5505 }
5506
5507 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
5508     tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
5509
5510 static int
5511 sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
5512 {
5513 #pragma unused(arg1, arg2)
5514
5515         int error, val = tcprexmtthresh;
5516
5517         error = sysctl_handle_int(oidp, &val, 0, req);
5518         if (error || !req->newptr)
5519                 return (error);
5520
5521         /*
5522          * Constrain the number of duplicate ACKs
5523          * to consider for TCP fast retransmit
5524          * to either 2 or 3
5525          */
5526
5527         if (val < 2 || val > 3)
5528                 return (EINVAL);
5529
5530          tcprexmtthresh = val;
5531
5532         return (0);
5533 }
5534
5535 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
5536         &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit");