/*
 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *    The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *    @(#)tcp_input.c    8.12 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>           /* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>

#include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */

#include <machine/endian.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/ntstat.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>    /* for ICMP_BANDLIM */
#include <netinet/in_var.h>
#include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <mach/sdt.h>
#if INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#include <kern/zalloc.h>
#if INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
u_char tcp_saveipgen[40]; /* the size must be that of the maximum IP header, currently IPv6 */
struct tcphdr tcp_savetcp;
#endif /* TCPDEBUG */

#if IPSEC
#include <netinet6/ipsec.h>
#if INET6
#include <netinet6/ipsec6.h>
#endif
#include <netkey/key.h>
#endif /*IPSEC*/

#if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */

#include <sys/kdebug.h>

#define DBG_LAYER_BEG          NETDBG_CODE(DBG_NETTCP, 0)
#define DBG_LAYER_END          NETDBG_CODE(DBG_NETTCP, 2)
#define DBG_FNC_TCP_INPUT      NETDBG_CODE(DBG_NETTCP, (3 << 8))
#define DBG_FNC_TCP_NEWCONN    NETDBG_CODE(DBG_NETTCP, (7 << 8))

static int tcprexmtthresh = 3;
tcp_cc tcp_ccgen;

#if IPSEC
extern int ipsec_bypass;
#endif

struct tcpstat tcpstat;

static int log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED,
    &log_in_vain, 0, "Log all incoming TCP connections");

static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED,
    &blackhole, 0, "Do not send RST when dropping refused connections");

int tcp_delack_enabled = 3;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_delack_enabled, 0,
    "Delay ACK to try and piggyback it onto a data packet");

int tcp_lq_overflow = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_lq_overflow, 0,
    "Listen Queue Overflow");

int tcp_recv_bg = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_recv_bg, 0,
    "Receive background");

#if TCP_DROP_SYNFIN
static int drop_synfin = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED,
    &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
#endif

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
    "TCP Segment Reassembly Queue");

__private_extern__ int tcp_reass_maxseg = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_reass_maxseg, 0,
    "Global maximum number of TCP Segments in Reassembly Queue");

__private_extern__ int tcp_reass_qsize = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcp_reass_qsize, 0,
    "Global number of TCP Segments currently in Reassembly Queue");

static int tcp_reass_overflows = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcp_reass_overflows, 0,
    "Global number of TCP Segment Reassembly Queue Overflows");


__private_extern__ int slowlink_wsize = 8192;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
    &slowlink_wsize, 0, "Maximum advertised window size for slowlink");

int maxseg_unacked = 8;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED,
    &maxseg_unacked, 0, "Maximum number of outstanding segments left unacked");

int tcp_do_rfc3465 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_do_rfc3465, 0, "");

int tcp_do_rfc3465_lim2 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS");

int rtt_samples_per_slot = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED,
    &rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history");

int tcp_allowed_iaj = ALLOWED_IAJ;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_allowed_iaj, 0, "Allowed inter-packet arrival jitter");

int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ");

#if CONFIG_IFEF_NOWINDOWSCALE
int tcp_obey_ifef_nowindowscale = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_obey_ifef_nowindowscale, 0, "");
#endif

extern int tcp_TCPTV_MIN;
extern int tcp_acc_iaj_high;
extern int tcp_acc_iaj_react_limit;
extern struct zone *tcp_reass_zone;


u_int32_t tcp_now;
struct timeval tcp_uptime;      /* uptime when tcp_now was last updated */
lck_spin_t *tcp_uptime_lock;    /* Used to synchronize updates to tcp_now */

struct inpcbhead tcb;
#define tcb6 tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;

static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
    struct tcpopt *, unsigned int);
static void tcp_pulloutofband(struct socket *,
    struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
    struct mbuf *);
static void tcp_xmit_timer(struct tcpcb *, int);
static inline unsigned int tcp_maxmtu(struct rtentry *);
static inline int tcp_stretch_ack_enable(struct tcpcb *tp);

#if TRAFFIC_MGT
static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen, int reset_size);
void compute_iaj(struct tcpcb *tp);
static inline void clear_iaj_state(struct tcpcb *tp);
#endif /* TRAFFIC_MGT */

#if INET6
static inline unsigned int tcp_maxmtu6(struct rtentry *);
#endif

/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#if INET6
#define ND6_HINT(tp) \
do { \
    if ((tp) && (tp)->t_inpcb && \
        ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
        (tp)->t_inpcb->in6p_route.ro_rt) \
            nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif

extern void add_to_time_wait(struct tcpcb *, uint32_t delay);
extern void postevent(struct socket *, struct sockbuf *, int);

extern void ipfwsyslog( int level, const char *format,...);
extern int ChkAddressOK( __uint32_t dstaddr, __uint32_t srcaddr );
extern int fw_verbose;
__private_extern__ int tcp_sockthreshold;
__private_extern__ int tcp_win_scale;

#if IPFIREWALL
#define log_in_vain_log( a ) { \
    if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \
        ipfwsyslog a ; \
    } \
    else log a ; \
}
#else
#define log_in_vain_log( a ) { log a; }
#endif

int tcp_rcvunackwin = TCPTV_UNACKWIN;
int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;

#define DELAY_ACK(tp, th) (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
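
/*
 * Editor's note: DELAY_ACK delegates the delayed-ACK decision to the
 * congestion-control module selected for this connection.  The macro itself
 * implies a hook of type int (*delay_ack)(struct tcpcb *, struct tcphdr *)
 * in the CC_ALGO vector.  Below is a minimal, hypothetical sketch of such a
 * hook; it only illustrates the calling convention, not the policy actually
 * implemented by the modules behind tcp_cc.h.
 */
#if 0   /* illustrative only */
static int
example_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
    /* Delay pure in-order ACKs while stretch-ACKs are enabled. */
    if ((tp->t_flags & TF_STRETCHACK) != 0 &&
        (th->th_flags & (TH_FIN | TH_URG)) == 0)
        return (1);     /* non-zero: delay this ACK */
    return (0);         /* zero: ACK immediately */
}
#endif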

static int tcp_dropdropablreq(struct socket *head);
static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);

static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
uint32_t get_base_rtt(struct tcpcb *tp);
void tcp_set_background_cc(struct socket *so);
void tcp_set_foreground_cc(struct socket *so);
static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);

#if TRAFFIC_MGT
void
reset_acc_iaj(struct tcpcb *tp)
{
    tp->acc_iaj = 0;
    tp->iaj_rwintop = 0;
    clear_iaj_state(tp);
}

static inline void
update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
{
    if (rst_size > 0)
        tp->iaj_size = 0;
    if (tp->iaj_size == 0 || size >= tp->iaj_size) {
        tp->iaj_size = size;
        tp->iaj_rcv_ts = tcp_now;
        tp->iaj_small_pkt = 0;
    }
}

static inline void
clear_iaj_state(struct tcpcb *tp)
{
    tp->iaj_rcv_ts = 0;
}

/* For any 32-bit unsigned integer (v), this function finds the largest
 * integer n such that n*n <= v.  It takes at most 16 iterations
 * irrespective of the value of v and does not involve multiplications.
 */
static inline int
isqrt(unsigned int val) {
    unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
    unsigned int temp, g = 0, b = 0x8000, bshft = 15;
    if (val <= 100) {
        for (g = 0; g <= 10; ++g) {
            if (sqrt_cache[g] > val) {
                g--;
                break;
            } else if (sqrt_cache[g] == val) {
                break;
            }
        }
    } else {
        do {
            temp = (((g << 1) + b) << (bshft--));
            if (val >= temp) {
                g += b;
                val -= temp;
            }
            b >>= 1;
        } while (b > 0 && val > 0);
    }
    return(g);
}
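
/*
 * Editor's note: a quick sanity sketch for the shift-and-subtract method
 * above (illustrative only, not part of the original file).  Each of the
 * 16 iterations decides one bit of the 16-bit root, which is why the loop
 * bound holds for any 32-bit input.
 */
#if 0   /* illustrative only */
static void
isqrt_example(void)
{
    assert(isqrt(0) == 0);
    assert(isqrt(99) == 9);           /* cached-table path (val <= 100) */
    assert(isqrt(101) == 10);         /* iterative path */
    assert(isqrt(0xffffffff) == 65535);
}
#endif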

void
compute_iaj(struct tcpcb *tp)
{
    /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, throttle the
     * receive window to a minimum of MIN_IAJ_WIN packets
     */
#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)

    uint32_t allowed_iaj, acc_iaj = 0;
    uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;

    uint32_t mean, temp;
    int32_t cur_iaj_dev;
    cur_iaj_dev = (cur_iaj - tp->avg_iaj);

    /* Allow a jitter of "allowed_iaj" milliseconds.  Some connections may
     * have a constant jitter more than that.  We detect this by using the
     * standard deviation.
     */
    allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
    if (allowed_iaj < tcp_allowed_iaj)
        allowed_iaj = tcp_allowed_iaj;

    /* Initially, when the connection starts, the sender's congestion window
     * is small.  During this period we avoid throttling a connection because
     * we do not have a good starting point for allowed_iaj.  IAJ_IGNORE_PKTCNT
     * is used to quietly gloss over the first few packets.
     */
    if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
        if (cur_iaj <= allowed_iaj) {
            if (tp->acc_iaj >= 2)
                acc_iaj = tp->acc_iaj - 2;
            else
                acc_iaj = 0;
        } else {
            acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
        }

        if (acc_iaj > MAX_ACC_IAJ)
            acc_iaj = MAX_ACC_IAJ;
        tp->acc_iaj = acc_iaj;
    }

    /* Compute weighted average where the history has a weight of
     * 15 out of 16 and the current value has a weight of 1 out of 16.
     * This smooths the average, so a single short-term fluctuation
     * carries little weight.
     */
    tp->avg_iaj = (((tp->avg_iaj << 4) - tp->avg_iaj) + cur_iaj) >> 4;

    /* Compute root-mean-square of the deviation, where the mean is a
     * weighted average as described above
     */
    temp = tp->std_dev_iaj * tp->std_dev_iaj;
    mean = (((temp << 4) - temp) + (cur_iaj_dev * cur_iaj_dev)) >> 4;

    tp->std_dev_iaj = isqrt(mean);

    DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, uint32_t, allowed_iaj);

    return;
}
#endif /* TRAFFIC_MGT */
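
/*
 * Editor's note: the shift arithmetic above is an exponentially weighted
 * moving average, avg' = (15*avg + cur) / 16, computed without a multiply
 * since (avg << 4) - avg == 15*avg.  A standalone sketch (illustrative
 * only, not part of the original file):
 */
#if 0   /* illustrative only */
static uint32_t
ewma_15_16(uint32_t avg, uint32_t cur)
{
    /* identical to: (15 * avg + cur) / 16 */
    return ((((avg << 4) - avg) + cur) >> 4);
}
#endif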

static int
tcp_reass(tp, th, tlenp, m)
    register struct tcpcb *tp;
    register struct tcphdr *th;
    int *tlenp;
    struct mbuf *m;
{
    struct tseg_qent *q;
    struct tseg_qent *p = NULL;
    struct tseg_qent *nq;
    struct tseg_qent *te = NULL;
    struct socket *so = tp->t_inpcb->inp_socket;
    int flags;
    int dowakeup = 0;

    /*
     * Call with th==NULL after becoming established to
     * force any queued pre-ESTABLISHED data up to the user socket.
     */
    if (th == NULL)
        goto present;

    /* If the reassembly queue already has entries or if we are going to add
     * a new one, then the connection has reached a loss state.
     * Reset the stretch-ack algorithm at this point.
     */
    if ((tp->t_flags & TF_STRETCHACK) != 0)
        tcp_reset_stretch_ack(tp);

    /* When the connection reaches a loss state, we need to send more acks
     * for a period of time so that the sender's congestion window will
     * open.  Wait until we see some packets on the connection before
     * stretching acks again.
     */
    tp->t_flagsext |= TF_RCVUNACK_WAITSS;
    tp->rcv_waitforss = 0;


#if TRAFFIC_MGT
    if (tp->acc_iaj > 0)
        reset_acc_iaj(tp);
#endif /* TRAFFIC_MGT */

    /*
     * Limit the number of segments in the reassembly queue to prevent
     * holding on to too many segments (and thus running out of mbufs).
     * Make sure to let through the missing segment that caused this
     * queue to form.  Always keep one global queue entry spare to be
     * able to process that missing segment.
     */
    if (th->th_seq != tp->rcv_nxt &&
        tcp_reass_qsize + 1 >= tcp_reass_maxseg) {
        tcp_reass_overflows++;
        tcpstat.tcps_rcvmemdrop++;
        m_freem(m);
        *tlenp = 0;
        return (0);
    }

    /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
    te = (struct tseg_qent *) zalloc_noblock(tcp_reass_zone);
    if (te == NULL) {
        tcpstat.tcps_rcvmemdrop++;
        m_freem(m);
        return (0);
    }
    tcp_reass_qsize++;

    /*
     * Find a segment which begins after this one does.
     */
    LIST_FOREACH(q, &tp->t_segq, tqe_q) {
        if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
            break;
        p = q;
    }

    /*
     * If there is a preceding segment, it may provide some of
     * our data already.  If so, drop the data from the incoming
     * segment.  If it provides all of our data, drop us.
     */
    if (p != NULL) {
        register int i;
        /* conversion to int (in i) handles seq wraparound */
        i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
        if (i > 0) {
            if (i >= *tlenp) {
                tcpstat.tcps_rcvduppack++;
                tcpstat.tcps_rcvdupbyte += *tlenp;
                if (nstat_collect) {
                    nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_DUPLICATE);
                    locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1);
                    locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp);
                    tp->t_stat.rxduplicatebytes += *tlenp;
                }
                m_freem(m);
                zfree(tcp_reass_zone, te);
                tcp_reass_qsize--;
                /*
                 * Try to present any queued data
                 * at the left window edge to the user.
                 * This is needed after the 3-WHS
                 * completes.
                 */
                goto present;   /* ??? */
            }
            m_adj(m, i);
            *tlenp -= i;
            th->th_seq += i;
        }
    }
    tcpstat.tcps_rcvoopack++;
    tcpstat.tcps_rcvoobyte += *tlenp;
    if (nstat_collect) {
        nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_OUT_OF_ORDER);
        locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1);
        locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp);
        tp->t_stat.rxoutoforderbytes += *tlenp;
    }

    /*
     * While we overlap succeeding segments trim them or,
     * if they are completely covered, dequeue them.
     */
    while (q) {
        register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
        if (i <= 0)
            break;
        if (i < q->tqe_len) {
            q->tqe_th->th_seq += i;
            q->tqe_len -= i;
            m_adj(q->tqe_m, i);
            break;
        }

        nq = LIST_NEXT(q, tqe_q);
        LIST_REMOVE(q, tqe_q);
        m_freem(q->tqe_m);
        zfree(tcp_reass_zone, q);
        tcp_reass_qsize--;
        q = nq;
    }

    /* Insert the new segment queue entry into place. */
    te->tqe_m = m;
    te->tqe_th = th;
    te->tqe_len = *tlenp;

    if (p == NULL) {
        LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
    } else {
        LIST_INSERT_AFTER(p, te, tqe_q);
    }

present:
    /*
     * Present data to user, advancing rcv_nxt through
     * completed sequence space.
     */
    if (!TCPS_HAVEESTABLISHED(tp->t_state))
        return (0);
    q = LIST_FIRST(&tp->t_segq);
    if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
        return (0);
    do {
        tp->rcv_nxt += q->tqe_len;
        flags = q->tqe_th->th_flags & TH_FIN;
        nq = LIST_NEXT(q, tqe_q);
        LIST_REMOVE(q, tqe_q);
        if (so->so_state & SS_CANTRCVMORE)
            m_freem(q->tqe_m);
        else {
            so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
            if (sbappendstream(&so->so_rcv, q->tqe_m))
                dowakeup = 1;
        }
        zfree(tcp_reass_zone, q);
        tcp_reass_qsize--;
        q = nq;
    } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
    ND6_HINT(tp);

#if INET6
    if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {

        KERNEL_DEBUG(DBG_LAYER_BEG,
            ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
            (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
            (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
            0, 0, 0);
    }
    else
#endif
    {
        KERNEL_DEBUG(DBG_LAYER_BEG,
            ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
            (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
            (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
            0, 0, 0);
    }
    if (dowakeup)
        sorwakeup(so); /* done with socket lock held */
    return (flags);

}

/*
 * Reduce congestion window.
 */
static void
tcp_reduce_congestion_window(
    struct tcpcb *tp, struct tcphdr *th)
{
    /*
     * If the current tcp cc module has
     * defined a hook for tasks to run
     * before entering FR, call it
     */
    if (CC_ALGO(tp)->pre_fr != NULL)
        CC_ALGO(tp)->pre_fr(tp, th);
    ENTER_FASTRECOVERY(tp);
    tp->snd_recover = tp->snd_max;
    tp->t_timer[TCPT_REXMT] = 0;
    tp->t_rtttime = 0;
    tp->ecn_flags |= TE_SENDCWR;
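    /*
     * Editor's note: inflating cwnd to ssthresh plus tcprexmtthresh
     * segments accounts for the segments presumed to have left the
     * network (the duplicate ACKs that triggered fast recovery), in
     * the style of NewReno fast recovery (RFC 3782).
     */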
    tp->snd_cwnd = tp->snd_ssthresh +
        tp->t_maxseg * tcprexmtthresh;
}


/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
#if INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
#pragma unused(proto)
    register struct mbuf *m = *mp;
    struct in6_ifaddr *ia6;

    IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);

    /*
     * draft-itojun-ipv6-tcp-to-anycast
     * better place to put this in?
     */
    ia6 = ip6_getdstifaddr(m);
    if (ia6 != NULL) {
        IFA_LOCK_SPIN(&ia6->ia_ifa);
        if (ia6->ia6_flags & IN6_IFF_ANYCAST) {
            struct ip6_hdr *ip6;

            IFA_UNLOCK(&ia6->ia_ifa);
            IFA_REMREF(&ia6->ia_ifa);
            ip6 = mtod(m, struct ip6_hdr *);
            icmp6_error(m, ICMP6_DST_UNREACH,
                ICMP6_DST_UNREACH_ADDR,
                (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
            return (IPPROTO_DONE);
        }
        IFA_UNLOCK(&ia6->ia_ifa);
        IFA_REMREF(&ia6->ia_ifa);
    }

    tcp_input(m, *offp);
    return (IPPROTO_DONE);
}
#endif

/* A receiver will evaluate the flow of packets on a connection
 * to see if it can reduce ack traffic.  The receiver will start
 * stretching acks if all of the following conditions are met:
 * 1. tcp_delack_enabled is set to 3.
 * 2. The bytes received in the last 100 ms exceed a threshold
 *    defined by maxseg_unacked.
 * 3. The connection has not been idle for the tcp_maxrcvidle period.
 * 4. The connection has seen enough packets to let slow-start
 *    finish after connection establishment or after some packet loss.
 *
 * The receiver will stop stretching acks if there is congestion/reordering
 * as indicated by packets on the reassembly queue or an ECN mark.  If the
 * delayed-ack timer fires while stretching acks, it means that the packet
 * flow has gone below the threshold defined by maxseg_unacked and the
 * receiver will stop stretching acks.  The receiver gets no indication when
 * slow-start is completed or when the connection reaches an idle state.
 * That is why we use tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle
 * to identify the idle state.
 */
static inline int
tcp_stretch_ack_enable(struct tcpcb *tp) {
    if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
        TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
        (((tp->t_flagsext & TF_RCVUNACK_WAITSS) == 0) ||
        (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
            return(1);
    }
    return(0);
}
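
/*
 * Editor's note: with the default maxseg_unacked of 8 above, a connection
 * with, say, t_maxseg = 1448 must receive more than 8 * 1448 = 11584 bytes
 * inside one unack window (tcp_rcvunackwin, the 100 ms window mentioned in
 * the comment above) before condition 2 holds.  The 1448-byte MSS is just
 * an example value; illustrative only:
 */
#if 0   /* illustrative only */
static int
stretch_ack_byte_threshold(struct tcpcb *tp)
{
    return (maxseg_unacked * tp->t_maxseg); /* e.g. 8 * 1448 = 11584 */
}
#endif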

/* Reset the state related to the stretch-ack algorithm.  This will make
 * the receiver generate an ack every other packet.  The receiver
 * will start re-evaluating the rate at which packets come to decide
 * if it can benefit by lowering the ack traffic.
 */
void
tcp_reset_stretch_ack(struct tcpcb *tp)
{
    tp->t_flags &= ~(TF_STRETCHACK);
    tp->rcv_by_unackwin = 0;
    tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
}

void
tcp_input(m, off0)
    struct mbuf *m;
    int off0;
{
    register struct tcphdr *th;
    register struct ip *ip = NULL;
    register struct ipovly *ipov;
    register struct inpcb *inp;
    u_char *optp = NULL;
    int optlen = 0;
    int len, tlen, off;
    int drop_hdrlen;
    register struct tcpcb *tp = 0;
    register int thflags;
    struct socket *so = 0;
    int todrop, acked, ourfinisacked, needoutput = 0;
    struct in_addr laddr;
#if INET6
    struct in6_addr laddr6;
#endif
    int dropsocket = 0;
    int iss = 0;
    int nosock = 0;
    u_int32_t tiwin;
    struct tcpopt to;           /* options in this segment */
    struct sockaddr_in *next_hop = NULL;
#if TCPDEBUG
    short ostate = 0;
#endif
    struct m_tag *fwd_tag;
    u_char ip_ecn = IPTOS_ECN_NOTECT;
    unsigned int ifscope, nocell = 0;
    uint8_t isconnected, isdisconnected;

    /*
     * Record the interface on which this segment arrived; this does not
     * affect normal data output (for non-detached TCP) as it provides a
     * hint about which route and interface to use for sending in the
     * absence of a PCB, when scoped routing (and thus source interface
     * selection) are enabled.
     */
    if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL)
        ifscope = m->m_pkthdr.rcvif->if_index;
    else
        ifscope = IFSCOPE_NONE;

    /* Since this is an entry point for input processing of tcp packets, we
     * can update the tcp clock here.
     */
    calculate_tcp_clock();

    /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
    if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
        fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
            KERNEL_TAG_TYPE_IPFORWARD, NULL);
    } else {
        fwd_tag = NULL;
    }
    if (fwd_tag != NULL) {
        struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);

        next_hop = ipfwd_tag->next_hop;
        m_tag_delete(m, fwd_tag);
    }

#if INET6
    struct ip6_hdr *ip6 = NULL;
    int isipv6;
#endif /* INET6 */
    int rstreason; /* For badport_bandlim accounting purposes */
    struct proc *proc0 = current_proc();

    KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

#if INET6
    isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
    bzero((char *)&to, sizeof(to));

    tcpstat.tcps_rcvtotal++;


#if INET6
    if (isipv6) {
        /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
        ip6 = mtod(m, struct ip6_hdr *);
        tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
        th = (struct tcphdr *)((caddr_t)ip6 + off0);

        if ((apple_hwcksum_rx != 0) && (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
            if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
                th->th_sum = m->m_pkthdr.csum_data;
            else
                th->th_sum = in6_cksum_phdr(&ip6->ip6_src,
                    &ip6->ip6_dst, htonl(sizeof(struct tcphdr)),
                    htonl(IPPROTO_TCP));

            th->th_sum ^= 0xffff;
            if (th->th_sum) {
                tcpstat.tcps_rcvbadsum++;
                goto dropnosock;
            }
        }
        else {
            if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
                tcpstat.tcps_rcvbadsum++;
                goto dropnosock;
            }
        }

        KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
            (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
            th->th_seq, th->th_ack, th->th_win);
        /*
         * Be proactive about unspecified IPv6 address in source.
         * As we use all-zero to indicate an unbound/unconnected pcb,
         * an unspecified IPv6 address can be used to confuse us.
         *
         * Note that packets with an unspecified IPv6 destination are
         * already dropped in ip6_input.
         */
        if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
            /* XXX stat */
            goto dropnosock;
        }
        DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
            struct ip6_hdr *, ip6, struct tcpcb *, NULL,
            struct tcphdr *, th);

        ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
    } else
#endif /* INET6 */
    {
        /*
         * Get IP and TCP header together in first mbuf.
         * Note: IP leaves IP header in first mbuf.
         */
        if (off0 > sizeof (struct ip)) {
            ip_stripoptions(m, (struct mbuf *)0);
            off0 = sizeof(struct ip);
            if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)
                m->m_pkthdr.csum_flags = 0; /* invalidate hwcksuming */

        }
        if (m->m_len < sizeof (struct tcpiphdr)) {
            if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
                tcpstat.tcps_rcvshort++;
                return;
            }
        }
        ip = mtod(m, struct ip *);
        ipov = (struct ipovly *)ip;
        th = (struct tcphdr *)((caddr_t)ip + off0);
        tlen = ip->ip_len;

        DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
            struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);

        KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
            (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
            th->th_seq, th->th_ack, th->th_win);

        if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
            if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) {
                u_short pseudo;
                char b[9];
                *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
                *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
                *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];

                bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
                ipov->ih_len = (u_short)tlen;

#if BYTE_ORDER != BIG_ENDIAN
                HTONS(ipov->ih_len);
#endif

                pseudo = in_cksum(m, sizeof (struct ip));

                *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
                *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
                *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];

                th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
            } else {
                if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
                    th->th_sum = m->m_pkthdr.csum_data;
                else
                    th->th_sum = in_pseudo(ip->ip_src.s_addr,
                        ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
                        ip->ip_len + IPPROTO_TCP));
            }
            th->th_sum ^= 0xffff;
        } else {
            char b[9];
            /*
             * Checksum extended TCP header and data.
             */
            *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
            *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
            *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];

            len = sizeof (struct ip) + tlen;
            bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
            ipov->ih_len = (u_short)tlen;

#if BYTE_ORDER != BIG_ENDIAN
            HTONS(ipov->ih_len);
#endif

            th->th_sum = in_cksum(m, len);

            *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
            *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
            *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];

            tcp_in_cksum_stats(len);
        }
        if (th->th_sum) {
            tcpstat.tcps_rcvbadsum++;
            goto dropnosock;
        }
#if INET6
        /* Re-initialization for later version check */
        ip->ip_v = IPVERSION;
#endif
        ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
    }

    /*
     * Check that TCP offset makes sense,
     * pull out TCP options and adjust length.        XXX
     */
    off = th->th_off << 2;
    if (off < sizeof (struct tcphdr) || off > tlen) {
        tcpstat.tcps_rcvbadoff++;
        goto dropnosock;
    }
    tlen -= off;    /* tlen is used instead of ti->ti_len */
    if (off > sizeof (struct tcphdr)) {
#if INET6
        if (isipv6) {
            IP6_EXTHDR_CHECK(m, off0, off, return);
            ip6 = mtod(m, struct ip6_hdr *);
            th = (struct tcphdr *)((caddr_t)ip6 + off0);
        } else
#endif /* INET6 */
        {
            if (m->m_len < sizeof(struct ip) + off) {
                if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
                    tcpstat.tcps_rcvshort++;
                    return;
                }
                ip = mtod(m, struct ip *);
                ipov = (struct ipovly *)ip;
                th = (struct tcphdr *)((caddr_t)ip + off0);
            }
        }
        optlen = off - sizeof (struct tcphdr);
        optp = (u_char *)(th + 1);
        /*
         * Do quick retrieval of timestamp options ("options
         * prediction?").  If timestamp is the only option and it's
         * formatted as recommended in RFC 1323 appendix A, we
         * quickly get the values now and not bother calling
         * tcp_dooptions(), etc.
         */
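        /*
         * Editor's note: the RFC 1323 appendix A layout checked for
         * below is the 12-byte option block NOP, NOP, TSTAMP, 10,
         * followed by the 4-byte TSval and 4-byte TSecr, i.e. bytes
         * 01 01 08 0a <TSval> <TSecr>; TCPOPT_TSTAMP_HDR encodes the
         * first four bytes as a single 32-bit word.
         */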
        if ((optlen == TCPOLEN_TSTAMP_APPA ||
            (optlen > TCPOLEN_TSTAMP_APPA &&
            optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
            *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
            (th->th_flags & TH_SYN) == 0) {
            to.to_flags |= TOF_TS;
            to.to_tsval = ntohl(*(u_int32_t *)(optp + 4));
            to.to_tsecr = ntohl(*(u_int32_t *)(optp + 8));
            optp = NULL;    /* we've parsed the options */
        }
    }
    thflags = th->th_flags;

#if TCP_DROP_SYNFIN
    /*
     * If the drop_synfin option is enabled, drop all packets with
     * both the SYN and FIN bits set.  This prevents e.g. nmap from
     * identifying the TCP/IP stack.
     *
     * This is a violation of the TCP specification.
     */
    if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
        goto dropnosock;
#endif

    /*
     * Convert TCP protocol specific fields to host format.
     */

#if BYTE_ORDER != BIG_ENDIAN
    NTOHL(th->th_seq);
    NTOHL(th->th_ack);
    NTOHS(th->th_win);
    NTOHS(th->th_urp);
#endif

    /*
     * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
     * until after ip6_savecontrol() is called and before other functions
     * which don't want those proto headers.
     * Because ip6_savecontrol() is going to parse the mbuf to
     * search for data to be passed up to user-land, it wants mbuf
     * parameters to be unchanged.
     */
    drop_hdrlen = off0 + off;

    /*
     * Locate pcb for segment.
     */
findpcb:

    isconnected = FALSE;
    isdisconnected = FALSE;

#if IPFIREWALL_FORWARD
    if (next_hop != NULL
#if INET6
        && isipv6 == 0 /* IPv6 support is not yet */
#endif /* INET6 */
        ) {
        /*
         * Diverted.  Pretend to be the destination.
         * already got one like this?
         */
        inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
            ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
        if (!inp) {
            /*
             * No, then it's new.  Try to find the ambushing socket.
             */
            if (!next_hop->sin_port) {
                inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
                    th->th_sport, next_hop->sin_addr,
                    th->th_dport, 1, m->m_pkthdr.rcvif);
            } else {
                inp = in_pcblookup_hash(&tcbinfo,
                    ip->ip_src, th->th_sport,
                    next_hop->sin_addr,
                    ntohs(next_hop->sin_port), 1,
                    m->m_pkthdr.rcvif);
            }
        }
    } else
#endif /* IPFIREWALL_FORWARD */
    {
#if INET6
        if (isipv6)
            inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
                &ip6->ip6_dst, th->th_dport, 1,
                m->m_pkthdr.rcvif);
        else
#endif /* INET6 */
            inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
                ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
    }

    /*
     * Use the interface scope information from the PCB for outbound
     * segments.  If the PCB isn't present and if scoped routing is
     * enabled, tcp_respond will use the scope of the interface where
     * the segment arrived on.
     */
    if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
        ifscope = inp->inp_boundif;
    /*
     * If the PCB is present and the socket isn't allowed to use
     * the cellular interface, indicate it as such for tcp_respond.
     */
    if (inp != NULL && (inp->inp_flags & INP_NO_IFT_CELLULAR))
        nocell = 1;

#if IPSEC
    if (ipsec_bypass == 0) {
#if INET6
        if (isipv6) {
            if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
                IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio);
                if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
                    inp = NULL; // pretend we didn't find it
                goto dropnosock;
            }
        } else
#endif /* INET6 */
        if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
            IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
            if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
                inp = NULL; // pretend we didn't find it
            goto dropnosock;
        }
    }
#endif /*IPSEC*/

    /*
     * If the state is CLOSED (i.e., TCB does not exist) then
     * all data in the incoming segment is discarded.
     * If the TCB exists but is in CLOSED state, it is embryonic,
     * but should either do a listen or a connect soon.
     */
    if (inp == NULL) {
        if (log_in_vain) {
#if INET6
            char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
#else /* INET6 */
            char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
#endif /* INET6 */

#if INET6
            if (isipv6) {
                inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
                inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
            } else
#endif
            {
                inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
                inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
            }
            switch (log_in_vain) {
            case 1:
                if (thflags & TH_SYN)
                    log(LOG_INFO,
                        "Connection attempt to TCP %s:%d from %s:%d\n",
                        dbuf, ntohs(th->th_dport),
                        sbuf,
                        ntohs(th->th_sport));
                break;
            case 2:
                log(LOG_INFO,
                    "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
                    dbuf, ntohs(th->th_dport), sbuf,
                    ntohs(th->th_sport), thflags);
                break;
            case 3:
                if ((thflags & TH_SYN) &&
                    !(m->m_flags & (M_BCAST | M_MCAST)) &&
#if INET6
                    ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
                    (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
#else
                    ip->ip_dst.s_addr != ip->ip_src.s_addr
#endif
                    )
                    log_in_vain_log((LOG_INFO,
                        "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
                        dbuf, ntohs(th->th_dport),
                        sbuf,
                        ntohs(th->th_sport)));
                break;
            default:
                break;
            }
        }
        if (blackhole) {
            if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
                switch (blackhole) {
                case 1:
                    if (thflags & TH_SYN)
                        goto dropnosock;
                    break;
                case 2:
                    goto dropnosock;
                default:
                    goto dropnosock;
                }
        }
        rstreason = BANDLIM_RST_CLOSEDPORT;
        goto dropwithresetnosock;
    }
    so = inp->inp_socket;
    if (so == NULL) {
        /* This case shouldn't happen: the socket should not be NULL
         * unless inp_state is set to INPCB_STATE_DEAD.  Just in case,
         * we pretend we didn't find the socket, since this is no cause
         * for a panic (though the socket may be leaked).
         */
        inp = NULL;
#if TEMPDEBUG
        printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
#endif
        goto dropnosock;
    }

    tcp_lock(so, 1, 0);
    if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
        tcp_unlock(so, 1, (void *)2);
        inp = NULL; // pretend we didn't find it
        goto dropnosock;
    }

    tp = intotcpcb(inp);
    if (tp == 0) {
        rstreason = BANDLIM_RST_CLOSEDPORT;
        goto dropwithreset;
    }
    if (tp->t_state == TCPS_CLOSED)
        goto drop;

    /* Unscale the window into a 32-bit value. */
    if ((thflags & TH_SYN) == 0)
        tiwin = th->th_win << tp->snd_scale;
    else
        tiwin = th->th_win;

#if CONFIG_MACF_NET
    if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
        goto drop;
#endif

    /* Radar 7377561: Avoid processing packets while closing a listen socket */
    if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN) == 0)
        goto drop;

    if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
#if TCPDEBUG
        if (so->so_options & SO_DEBUG) {
            ostate = tp->t_state;
#if INET6
            if (isipv6)
                bcopy((char *)ip6, (char *)tcp_saveipgen,
                    sizeof(*ip6));
            else
#endif /* INET6 */
                bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
            tcp_savetcp = *th;
        }
#endif
        if (so->so_options & SO_ACCEPTCONN) {
            register struct tcpcb *tp0 = tp;
            struct socket *so2;
            struct socket *oso;
            struct sockaddr_storage from;
#if INET6
            struct inpcb *oinp = sotoinpcb(so);
#endif /* INET6 */
            unsigned int head_ifscope;
            unsigned int head_nocell;

            /* Get listener's bound-to-interface, if any */
            head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
                inp->inp_boundif : IFSCOPE_NONE;
            /* Get listener's no-cellular information, if any */
            head_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;

            /*
             * If the state is LISTEN then ignore segment if it contains an RST.
             * If the segment contains an ACK then it is bad and send a RST.
             * If it does not contain a SYN then it is not interesting; drop it.
             * If it is from this socket, drop it, it must be forged.
             */
            if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
                if (thflags & TH_RST) {
                    goto drop;
                }
                if (thflags & TH_ACK) {
                    tp = NULL;
                    tcpstat.tcps_badsyn++;
                    rstreason = BANDLIM_RST_OPENPORT;
                    goto dropwithreset;
                }

                /* We come here if there is no SYN set */
                tcpstat.tcps_badsyn++;
                goto drop;
            }
            KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0);
            if (th->th_dport == th->th_sport) {
#if INET6
                if (isipv6) {
                    if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
                        &ip6->ip6_src))
                        goto drop;
                } else
#endif /* INET6 */
                if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
                    goto drop;
            }
            /*
             * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
             * in_broadcast() should never return true on a received
             * packet with M_BCAST not set.
             *
             * Packets with a multicast source address should also
             * be discarded.
             */
            if (m->m_flags & (M_BCAST|M_MCAST))
                goto drop;
#if INET6
            if (isipv6) {
                if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
                    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
                    goto drop;
            } else
#endif
            if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
                IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
                ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
                in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
                goto drop;


#if INET6
            /*
             * If deprecated address is forbidden,
             * we do not accept SYN to deprecated interface
             * address to prevent any new inbound connection from
             * getting established.
             * When we do not accept SYN, we send a TCP RST,
             * with deprecated source address (instead of dropping
             * it).  We compromise it as it is much better for peer
             * to send a RST, and RST will be the final packet
             * for the exchange.
             *
             * If we do not forbid deprecated addresses, we accept
             * the SYN packet.  RFC2462 does not suggest dropping
             * SYN in this case.
             * If we decipher RFC2462 5.5.4, it says like this:
             * 1. use of deprecated addr with existing
             *    communication is okay - "SHOULD continue to be
             *    used"
             * 2. use of it with new communication:
             *   (2a) "SHOULD NOT be used if alternate address
             *        with sufficient scope is available"
             *   (2b) nothing mentioned otherwise.
             * Here we fall into (2b) case as we have no choice in
             * our source address selection - we must obey the peer.
             *
             * The wording in RFC2462 is confusing, and there are
             * multiple description text for deprecated address
             * handling - worse, they are not exactly the same.
             * I believe 5.5.4 is the best one, so we follow 5.5.4.
             */
            if (isipv6 && !ip6_use_deprecated) {
                struct in6_ifaddr *ia6;

                ia6 = ip6_getdstifaddr(m);
                if (ia6 != NULL) {
                    IFA_LOCK_SPIN(&ia6->ia_ifa);
                    if (ia6->ia6_flags & IN6_IFF_DEPRECATED) {
                        IFA_UNLOCK(&ia6->ia_ifa);
                        IFA_REMREF(&ia6->ia_ifa);
                        tp = NULL;
                        rstreason = BANDLIM_RST_OPENPORT;
                        goto dropwithreset;
                    }
                    IFA_UNLOCK(&ia6->ia_ifa);
                    IFA_REMREF(&ia6->ia_ifa);
                }
            }
#endif
            if (so->so_filt) {
#if INET6
                if (isipv6) {
                    struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from;

                    sin6->sin6_len = sizeof(*sin6);
                    sin6->sin6_family = AF_INET6;
                    sin6->sin6_port = th->th_sport;
                    sin6->sin6_flowinfo = 0;
                    sin6->sin6_addr = ip6->ip6_src;
                    sin6->sin6_scope_id = 0;
                }
                else
#endif
                {
                    struct sockaddr_in *sin = (struct sockaddr_in*)&from;

                    sin->sin_len = sizeof(*sin);
                    sin->sin_family = AF_INET;
                    sin->sin_port = th->th_sport;
                    sin->sin_addr = ip->ip_src;
                }
                so2 = sonewconn(so, 0, (struct sockaddr*)&from);
            } else {
                so2 = sonewconn(so, 0, NULL);
            }
            if (so2 == 0) {
                tcpstat.tcps_listendrop++;
                if (tcp_dropdropablreq(so)) {
                    if (so->so_filt)
                        so2 = sonewconn(so, 0, (struct sockaddr*)&from);
                    else
                        so2 = sonewconn(so, 0, NULL);
                }
                if (!so2)
                    goto drop;
            }

            /* Point "inp" and "tp" in tandem to new socket */
            inp = (struct inpcb *)so2->so_pcb;
            tp = intotcpcb(inp);

            oso = so;
            tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */

            so = so2;
            tcp_lock(so, 1, 0);
            /*
             * Mark socket as temporary until we're
             * committed to keeping it.  The code at
             * ``drop'' and ``dropwithreset'' check the
             * flag dropsocket to see if the temporary
             * socket created here should be discarded.
             * We mark the socket as discardable until
             * we're committed to it below in TCPS_LISTEN.
             * There are some error conditions in which we
             * have to drop the temporary socket.
             */
            dropsocket++;
            /*
             * Inherit INP_BOUND_IF from listener; testing if
             * head_ifscope is non-zero is sufficient, since it
             * can only be set to a non-zero value earlier if
             * the listener has such a flag set.
             */
            if (head_ifscope != IFSCOPE_NONE) {
                inp->inp_flags |= INP_BOUND_IF;
                inp->inp_boundif = head_ifscope;
            }
            /*
             * Inherit INP_NO_IFT_CELLULAR from listener.
             */
            if (head_nocell) {
                inp->inp_flags |= INP_NO_IFT_CELLULAR;
            }
#if INET6
            if (isipv6)
                inp->in6p_laddr = ip6->ip6_dst;
            else {
                inp->inp_vflag &= ~INP_IPV6;
                inp->inp_vflag |= INP_IPV4;
#endif /* INET6 */
                inp->inp_laddr = ip->ip_dst;
#if INET6
            }
#endif /* INET6 */
            inp->inp_lport = th->th_dport;
            if (in_pcbinshash(inp, 0) != 0) {
                /*
                 * Undo the assignments above if we failed to
                 * put the PCB on the hash lists.
                 */
#if INET6
                if (isipv6)
                    inp->in6p_laddr = in6addr_any;
                else
#endif /* INET6 */
                    inp->inp_laddr.s_addr = INADDR_ANY;
                inp->inp_lport = 0;
                tcp_lock(oso, 0, 0);    /* release ref on parent */
                tcp_unlock(oso, 1, 0);
                goto drop;
            }
#if INET6
            if (isipv6) {
                /*
                 * Inherit socket options from the listening
                 * socket.
                 * Note that in6p_inputopts are not (even
                 * should not be) copied, since it stores
                 * previously received options and is used to
                 * detect if each new option is different than
                 * the previous one and hence should be passed
                 * to a user.
                 * If we copied in6p_inputopts, a user would
                 * not be able to receive options just after
                 * calling the accept system call.
                 */
                inp->inp_flags |=
                    oinp->inp_flags & INP_CONTROLOPTS;
                if (oinp->in6p_outputopts)
                    inp->in6p_outputopts =
                        ip6_copypktopts(oinp->in6p_outputopts,
                            M_NOWAIT);
            } else
#endif /* INET6 */
                inp->inp_options = ip_srcroute();
            tcp_lock(oso, 0, 0);
#if IPSEC
            /* copy old policy into new socket's */
            if (sotoinpcb(oso)->inp_sp)
            {
                int error = 0;
                /* Is it a security hole here to silently fail to copy the policy? */
                if (inp->inp_sp != NULL)
                    error = ipsec_init_policy(so, &inp->inp_sp);
                if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
                    printf("tcp_input: could not copy policy\n");
            }
#endif
            /* inherit states from the listener */
            DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
                struct tcpcb *, tp, int32_t, TCPS_LISTEN);
            tp->t_state = TCPS_LISTEN;
            tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
            tp->t_flagsext |= (tp0->t_flagsext & TF_RXTFINDROP);
            tp->t_keepinit = tp0->t_keepinit;
            tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;

            /* now drop the reference on the listener */
            tcp_unlock(oso, 1, 0);

            /* Compute proper scaling value from buffer space */
            if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) {
                tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
                so->so_rcv.sb_hiwat = imin(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES);
            }
            else {
                while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
                    TCP_MAXWIN << tp->request_r_scale <
                    so->so_rcv.sb_hiwat)
                    tp->request_r_scale++;
            }

            KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0);
        }
    }
    lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

    /*
     * Radar 3529618
     * This is the second part of the MSS DoS prevention code (after
     * minmss on the sending side) and it deals with too many too small
     * tcp packets in a too short timeframe (1 second).
     *
     * For every full second we count the number of received packets
     * and bytes.  If we get a lot of packets per second for this connection
     * (tcp_minmssoverload) we take a closer look at it and compute the
     * average packet size for the past second.  If that is less than
     * tcp_minmss, we are getting too many packets with a very small
     * payload, which is not good and burdens our system (and every
     * packet generates a wakeup to the process connected to our
     * socket).  We can reasonably expect this to be a small-packet DoS
     * attack intended to exhaust our CPU cycles.
     *
     * Care has to be taken for the minimum packet overload value.  This
     * value defines the minimum number of packets per second before we
     * start to worry.  This must not be too low to avoid killing for
     * example interactive connections with many small packets like
     * telnet or SSH.
     *
     * Setting either tcp_minmssoverload or tcp_minmss to "0" disables
     * this check.
     *
     * Account for packet if payload packet, skip over ACK, etc.
     */
    if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
        if (TSTMP_GT(tp->rcv_reset, tcp_now)) {
            tp->rcv_pps++;
            tp->rcv_byps += tlen + off;
            if (tp->rcv_byps > tp->rcv_maxbyps)
                tp->rcv_maxbyps = tp->rcv_byps;
            /*
             * Setting either tcp_minmssoverload or tcp_minmss to "0" disables
             * the check.
             */
            if (tcp_minmss && tcp_minmssoverload && tp->rcv_pps > tcp_minmssoverload) {
                if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
                    char ipstrbuf[MAX_IPv6_STR_LEN];
                    printf("too many small tcp packets from "
                        "%s:%u, av. %ubyte/packet, "
                        "dropping connection\n",
#if INET6
                        isipv6 ?
                        inet_ntop(AF_INET6, &inp->in6p_faddr, ipstrbuf,
                            sizeof(ipstrbuf)) :
#endif
                        inet_ntop(AF_INET, &inp->inp_faddr, ipstrbuf,
                            sizeof(ipstrbuf)),
                        inp->inp_fport,
                        tp->rcv_byps / tp->rcv_pps);
                    tp = tcp_drop(tp, ECONNRESET);
                    /* tcpstat.tcps_minmssdrops++; */
                    goto drop;
                }
            }
        } else {
            tp->rcv_reset = tcp_now + TCP_RETRANSHZ;
            tp->rcv_pps = 1;
            tp->rcv_byps = tlen + off;
        }

        /* Evaluate the rate of arrival of packets to see if the
         * receiver can reduce the ack traffic.  The algorithm to
         * stretch acks will be enabled if the connection meets
         * certain criteria defined in the tcp_stretch_ack_enable function.
         */
        if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
            tp->rcv_waitforss++;
        }
        if (tcp_stretch_ack_enable(tp)) {
            tp->t_flags |= TF_STRETCHACK;
            tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
            tp->rcv_waitforss = 0;
        } else {
            tp->t_flags &= ~(TF_STRETCHACK);
        }
        if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
            tp->rcv_by_unackwin += (tlen + off);
        } else {
            tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
            tp->rcv_by_unackwin = tlen + off;
        }
    }

    /*
     Explicit Congestion Notification - Flag that we need to send ECE if
        + The IP Congestion experienced flag was set.
        + Socket is in established state
        + We negotiated ECN in the TCP setup
        + This isn't a pure ack (tlen > 0)
        + The data is in the valid window

     TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
     */
    if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
        (tp->ecn_flags & (TE_SETUPSENT | TE_SETUPRECEIVED)) ==
        (TE_SETUPSENT | TE_SETUPRECEIVED) && tlen > 0 &&
        SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
        SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
        tp->ecn_flags |= TE_SENDECE;
    }

    /*
     Clear TE_SENDECE if TH_CWR is set.  This is harmless, so we don't
     bother doing extensive checks for state and whatnot.
     */
    if ((thflags & TH_CWR) == TH_CWR) {
        tp->ecn_flags &= ~TE_SENDECE;
    }

    /* If we received an explicit notification of congestion in
     * the IP TOS ECN bits or by the CWR bit in the TCP header flags,
     * reset the ack-stretching state.
     */
    if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_STRETCHACK) != 0 &&
        ((ip_ecn == IPTOS_ECN_CE) || ((thflags & TH_CWR) == TH_CWR)))
        tcp_reset_stretch_ack(tp);

    /*
     * Segment received on connection.
     * Reset idle time and keep-alive timer.
     */
    tp->t_rcvtime = tcp_now;
    if (TCPS_HAVEESTABLISHED(tp->t_state))
        tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));

    /*
     * Process options if not in LISTEN state,
     * else do it below (after getting remote address).
     */
    if (tp->t_state != TCPS_LISTEN && optp)
        tcp_dooptions(tp, optp, optlen, th, &to, ifscope);

    if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
        if (to.to_flags & TOF_SCALE) {
            tp->t_flags |= TF_RCVD_SCALE;
            tp->requested_s_scale = to.to_requested_s_scale;
            tp->snd_wnd = th->th_win << tp->snd_scale;
            tiwin = tp->snd_wnd;
        }
        if (to.to_flags & TOF_TS) {
            tp->t_flags |= TF_RCVD_TSTMP;
            tp->ts_recent = to.to_tsval;
            tp->ts_recent_age = tcp_now;
        }
        if (to.to_flags & TOF_MSS)
            tcp_mss(tp, to.to_mss, ifscope);
        if (tp->sack_enable) {
            if (!(to.to_flags & TOF_SACK))
                tp->sack_enable = 0;
            else
                tp->t_flags |= TF_SACK_PERMIT;
        }
    }

#if TRAFFIC_MGT
    /* Compute inter-packet arrival jitter.  According to RFC 3550, inter-packet
     * arrival jitter is defined as the difference in packet spacing at the
     * receiver compared to the sender for a pair of packets.  When two packets
     * of maximum segment size come one after the other with consecutive
     * sequence numbers, we consider them as packets sent together at the
     * sender and use them as a pair to compute inter-packet arrival jitter.
     * This metric indicates the delay induced by the network components due
     * to queuing in edge/access routers.
     */
    if (tp->t_state == TCPS_ESTABLISHED &&
        (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
        ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
        ((to.to_flags & TOF_TS) == 0 ||
        TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
        th->th_seq == tp->rcv_nxt &&
        LIST_EMPTY(&tp->t_segq)) {
        if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
            tp->iaj_pktcnt++;
        }

        if (tp->iaj_size == 0 || tlen > tp->iaj_size ||
            (tlen == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
            /* State related to inter-arrival jitter is uninitialized
             * or we are trying to find a good first packet to start
             * computing the metric
             */
            update_iaj_state(tp, tlen, 0);
        } else {
            if (tlen == tp->iaj_size) {
                /* Compute inter-arrival jitter taking this packet
                 * as the second packet
                 */
                compute_iaj(tp);
            }
            if (tlen < tp->iaj_size) {
1770 /* There is a smaller packet in the stream.
1771 * Sometimes the maximum size supported on a path can
1772 * change if there is a new link with smaller MTU.
1773 * The receiver will not know about this change.
1774 * If there are too many packets smaller than iaj_size,
1775 * we try to learn the iaj_size again.
1776 */
1777 tp->iaj_small_pkt++;
1778 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
1779 update_iaj_state(tp, tlen, 1);
1780 } else {
1781 clear_iaj_state(tp);
1782 }
1783 } else {
1784 update_iaj_state(tp, tlen, 0);
1785 }
1786 }
1787 } else {
1788 clear_iaj_state(tp);
1789 }
1790 #endif /* TRAFFIC_MGT */
1791
1792 /*
1793 * Header prediction: check for the two common cases
1794 * of a uni-directional data xfer. If the packet has
1795 * no control flags, is in-sequence, the window didn't
1796 * change and we're not retransmitting, it's a
1797 * candidate. If the length is zero and the ack moved
1798 * forward, we're the sender side of the xfer. Just
1799 * free the data acked & wake any higher level process
1800 * that was blocked waiting for space. If the length
1801 * is non-zero and the ack didn't move, we're the
1802 * receiver side. If we're getting packets in-order
1803 * (the reassembly queue is empty), add the data to
1804 * the socket buffer and note that we need a delayed ack.
1805 * Make sure that the hidden state-flags are also off.
1806 * Since we check for TCPS_ESTABLISHED above, it can only
1807 * be TH_NEEDSYN.
1808 */
1809 if (tp->t_state == TCPS_ESTABLISHED &&
1810 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK &&
1811 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1812 ((to.to_flags & TOF_TS) == 0 ||
1813 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1814 th->th_seq == tp->rcv_nxt &&
1815 tiwin && tiwin == tp->snd_wnd &&
1816 tp->snd_nxt == tp->snd_max) {
1817
1818 /*
1819 * If last ACK falls within this segment's sequence numbers,
1820 * record the timestamp.
1821 * NOTE that the test is modified according to the latest
1822 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1823 */
1824 if ((to.to_flags & TOF_TS) != 0 &&
1825 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1826 tp->ts_recent_age = tcp_now;
1827 tp->ts_recent = to.to_tsval;
1828 }
1829
1830 /* Force acknowledgment if we received a FIN */
1831
1832 if (thflags & TH_FIN)
1833 tp->t_flags |= TF_ACKNOW;
1834
1835 if (tlen == 0) {
1836 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1837 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1838 tp->snd_cwnd >= tp->snd_ssthresh &&
1839 (!IN_FASTRECOVERY(tp) &&
1840 ((!tp->sack_enable && tp->t_dupacks < tcprexmtthresh) ||
1841 (tp->sack_enable && to.to_nsacks == 0 &&
1842 TAILQ_EMPTY(&tp->snd_holes))))) {
1843 /*
1844 * this is a pure ack for outstanding data.
1845 */
1846 ++tcpstat.tcps_predack;
1847 /*
1848 * "bad retransmit" recovery
1849 */
1850 if (tp->t_rxtshift == 1 &&
1851 TSTMP_LT(tcp_now, tp->t_badrxtwin)) {
1852 ++tcpstat.tcps_sndrexmitbad;
1853 tp->snd_cwnd = tp->snd_cwnd_prev;
1854 tp->snd_ssthresh =
1855 tp->snd_ssthresh_prev;
1856 tp->snd_recover = tp->snd_recover_prev;
1857 if (tp->t_flags & TF_WASFRECOVERY)
1858 ENTER_FASTRECOVERY(tp);
1859 tp->snd_nxt = tp->snd_max;
1860 tp->t_badrxtwin = 0;
1861 tp->t_rxtshift = 0;
1862 tp->rxt_start = 0;
1863 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
1864 struct tcpcb *, tp, struct tcphdr *, th,
1865 int32_t, TCP_CC_BAD_REXMT_RECOVERY);
1866 }
1867 /*
1868 * Recalculate the transmit timer / rtt.
1869 *
1870 * Some boxes send broken timestamp replies
1871 * during the SYN+ACK phase, ignore
1872 * timestamps of 0 or we could calculate a
1873 * huge RTT and blow up the retransmit timer.
1874 */
1875 if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) &&
1876 TSTMP_GEQ(tcp_now, to.to_tsecr)) {
1877 tcp_xmit_timer(tp,
1878 tcp_now - to.to_tsecr);
1879 } else if (tp->t_rtttime &&
1880 SEQ_GT(th->th_ack, tp->t_rtseq)) {
1881 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1882 }
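/*
 * Sequence arithmetic is modulo 2^32, so this difference is the
 * number of newly acked bytes even across sequence wraparound
 * (the SEQ_* comparisons rely on the same signed subtraction).
 */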
1883 acked = th->th_ack - tp->snd_una;
1884 tcpstat.tcps_rcvackpack++;
1885 tcpstat.tcps_rcvackbyte += acked;
1886
1887 /* Handle an ack that is in sequence during congestion
1888 * avoidance phase. The calculations in this function
1889 * assume that snd_una is not updated yet.
1890 */
1891 if (CC_ALGO(tp)->inseq_ack_rcvd != NULL)
1892 CC_ALGO(tp)->inseq_ack_rcvd(tp, th);
1893
1894 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
1895 struct tcpcb *, tp, struct tcphdr *, th,
1896 int32_t, TCP_CC_INSEQ_ACK_RCVD);
1897
1898 sbdrop(&so->so_snd, acked);
1899 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
1900 SEQ_LEQ(th->th_ack, tp->snd_recover))
1901 tp->snd_recover = th->th_ack - 1;
1902 tp->snd_una = th->th_ack;
1903 /*
1904 * pull snd_wl2 up to prevent seq wrap relative
1905 * to th_ack.
1906 */
1907 tp->snd_wl2 = th->th_ack;
1908 tp->t_dupacks = 0;
1909 m_freem(m);
1910 ND6_HINT(tp); /* some progress has been done */
1911
1912 /*
1913 * If all outstanding data are acked, stop
1914 * retransmit timer, otherwise restart timer
1915 * using current (possibly backed-off) value.
1916 * If process is waiting for space,
1917 * wakeup/selwakeup/signal. If data
1918 * are ready to send, let tcp_output
1919 * decide between more output or persist.
1920 */
1921 if (tp->snd_una == tp->snd_max)
1922 tp->t_timer[TCPT_REXMT] = 0;
1923 else if (tp->t_timer[TCPT_PERSIST] == 0)
1924 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
1925
1926 sowwakeup(so); /* has to be done with socket lock held */
1927 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
1928 (void) tcp_output(tp);
1929 }
1930
1931 tcp_check_timer_state(tp);
1932 tcp_unlock(so, 1, 0);
1933 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1934 return;
1935 }
1936 } else if (th->th_ack == tp->snd_una &&
1937 LIST_EMPTY(&tp->t_segq) &&
1938 tlen <= tcp_sbspace(tp)) {
1939 /*
1940 * this is a pure, in-sequence data packet
1941 * with nothing on the reassembly queue and
1942 * we have enough buffer space to take it.
1943 */
1944 /* Clean receiver SACK report if present */
1945 if (tp->sack_enable && tp->rcv_numsacks)
1946 tcp_clean_sackreport(tp);
1947 ++tcpstat.tcps_preddat;
1948 tp->rcv_nxt += tlen;
1949 /*
1950 * Pull snd_wl1 up to prevent seq wrap relative to
1951 * th_seq.
1952 */
1953 tp->snd_wl1 = th->th_seq;
1954 /*
1955 * Pull rcv_up up to prevent seq wrap relative to
1956 * rcv_nxt.
1957 */
1958 tp->rcv_up = tp->rcv_nxt;
1959 tcpstat.tcps_rcvpack++;
1960 tcpstat.tcps_rcvbyte += tlen;
1961 if (nstat_collect) {
1962 locked_add_64(&inp->inp_stat->rxpackets, 1);
1963 locked_add_64(&inp->inp_stat->rxbytes, tlen);
1964 }
1965 ND6_HINT(tp); /* some progress has been done */
1966 /*
1967 * Add data to socket buffer.
1968 */
1969 so_recv_data_stat(so, m, 0);
1970 m_adj(m, drop_hdrlen); /* delayed header drop */
1971 if (sbappendstream(&so->so_rcv, m))
1972 sorwakeup(so);
1973 #if INET6
1974 if (isipv6) {
1975 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1976 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1977 th->th_seq, th->th_ack, th->th_win);
1978 }
1979 else
1980 #endif
1981 {
1982 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1983 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1984 th->th_seq, th->th_ack, th->th_win);
1985 }
1986 if (DELAY_ACK(tp, th)) {
1987 if ((tp->t_flags & TF_DELACK) == 0) {
1988 tp->t_flags |= TF_DELACK;
1989 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
1990 }
1991 tp->t_unacksegs++;
1992 } else {
1993 tp->t_flags |= TF_ACKNOW;
1994 tcp_output(tp);
1995 }
1996 tcp_check_timer_state(tp);
1997 tcp_unlock(so, 1, 0);
1998 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1999 return;
2000 }
2001 }
2002
2003 /*
2004 * Calculate amount of space in receive window,
2005 * and then do TCP input processing.
2006 * Receive window is the amount of space in the rcv queue, but not
2007 * less than the window previously advertised (it is never shrunk).
2008 */
2009 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2010
2011 { int win;
2012
2013 win = tcp_sbspace(tp);
2014
2015 if (win < 0)
2016 win = 0;
2017 else { /* clip rcv window to 4K for modems */
2018 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2019 win = min(win, slowlink_wsize);
2020 }
2021 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2022 }
2023
2024 switch (tp->t_state) {
2025
2026 /*
2027 * Initialize tp->rcv_nxt, and tp->irs, select an initial
2028 * tp->iss, and send a segment:
2029 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2030 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
2031 * Fill in remote peer address fields if not previously specified.
2032 * Enter SYN_RECEIVED state, and process any other fields of this
2033 * segment in this state.
2034 */
2035 case TCPS_LISTEN: {
2036 register struct sockaddr_in *sin;
2037 #if INET6
2038 register struct sockaddr_in6 *sin6;
2039 #endif
2040
2041 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2042 #if INET6
2043 if (isipv6) {
2044 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
2045 M_SONAME, M_NOWAIT);
2046 if (sin6 == NULL)
2047 goto drop;
2048 bzero(sin6, sizeof(*sin6));
2049 sin6->sin6_family = AF_INET6;
2050 sin6->sin6_len = sizeof(*sin6);
2051 sin6->sin6_addr = ip6->ip6_src;
2052 sin6->sin6_port = th->th_sport;
2053 laddr6 = inp->in6p_laddr;
2054 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
2055 inp->in6p_laddr = ip6->ip6_dst;
2056 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
2057 proc0)) {
2058 inp->in6p_laddr = laddr6;
2059 FREE(sin6, M_SONAME);
2060 goto drop;
2061 }
2062 FREE(sin6, M_SONAME);
2063 } else
2064 #endif
2065 {
2066 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2067 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
2068 M_NOWAIT);
2069 if (sin == NULL)
2070 goto drop;
2071 sin->sin_family = AF_INET;
2072 sin->sin_len = sizeof(*sin);
2073 sin->sin_addr = ip->ip_src;
2074 sin->sin_port = th->th_sport;
2075 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
2076 laddr = inp->inp_laddr;
2077 if (inp->inp_laddr.s_addr == INADDR_ANY)
2078 inp->inp_laddr = ip->ip_dst;
2079 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0, NULL)) {
2080 inp->inp_laddr = laddr;
2081 FREE(sin, M_SONAME);
2082 goto drop;
2083 }
2084 FREE(sin, M_SONAME);
2085 }
2086
2087 tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
2088
2089 if (tp->sack_enable) {
2090 if (!(to.to_flags & TOF_SACK))
2091 tp->sack_enable = 0;
2092 else
2093 tp->t_flags |= TF_SACK_PERMIT;
2094 }
2095
2096 if (iss)
2097 tp->iss = iss;
2098 else {
2099 tp->iss = tcp_new_isn(tp);
2100 }
2101 tp->irs = th->th_seq;
2102 tcp_sendseqinit(tp);
2103 tcp_rcvseqinit(tp);
2104 tp->snd_recover = tp->snd_una;
2105 /*
2106 * Initialization of the tcpcb for transaction;
2107 * set SND.WND = SEG.WND,
2108 * initialize CCsend and CCrecv.
2109 */
2110 tp->snd_wnd = tiwin; /* initial send-window */
2111 tp->t_flags |= TF_ACKNOW;
2112 tp->t_unacksegs = 0;
2113 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2114 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
2115 tp->t_state = TCPS_SYN_RECEIVED;
2116 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2117 tp->t_keepinit ? tp->t_keepinit : tcp_keepinit);
2118 dropsocket = 0; /* committed to socket */
2119
2120 /* reset the incomp processing flag */
2121 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
2122 tcpstat.tcps_accepts++;
2123 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
2124 /* ECN-setup SYN */
2125 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
2126 }
2127 #if CONFIG_IFEF_NOWINDOWSCALE
2128 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
2129 (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
2130 /* Window scaling is not enabled on this interface */
2131 tp->t_flags &= ~TF_REQ_SCALE;
2132 }
2133 #endif
2134 goto trimthenstep6;
2135 }
2136
2137 /*
2138 * If the state is SYN_RECEIVED:
2139 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
2140 */
2141 case TCPS_SYN_RECEIVED:
2142 if ((thflags & TH_ACK) &&
2143 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
2144 SEQ_GT(th->th_ack, tp->snd_max))) {
2145 rstreason = BANDLIM_RST_OPENPORT;
2146 goto dropwithreset;
2147 }
2148 break;
2149
2150 /*
2151 * If the state is SYN_SENT:
2152 * if seg contains an ACK, but not for our SYN, drop the input.
2153 * if seg contains a RST, then drop the connection.
2154 * if seg does not contain SYN, then drop it.
2155 * Otherwise this is an acceptable SYN segment
2156 * initialize tp->rcv_nxt and tp->irs
2157 * if seg contains ack then advance tp->snd_una
2158 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2159 * arrange for segment to be acked (eventually)
2160 * continue processing rest of data/controls, beginning with URG
2161 */
2162 case TCPS_SYN_SENT:
2163 if ((thflags & TH_ACK) &&
2164 (SEQ_LEQ(th->th_ack, tp->iss) ||
2165 SEQ_GT(th->th_ack, tp->snd_max))) {
2166 rstreason = BANDLIM_UNLIMITED;
2167 goto dropwithreset;
2168 }
2169 if (thflags & TH_RST) {
2170 if ((thflags & TH_ACK) != 0) {
2171 tp = tcp_drop(tp, ECONNREFUSED);
2172 postevent(so, 0, EV_RESET);
2173 }
2174 goto drop;
2175 }
2176 if ((thflags & TH_SYN) == 0)
2177 goto drop;
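/* A window carried in a SYN is never scaled (RFC 1323), so take it as is */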
2178 tp->snd_wnd = th->th_win; /* initial send window */
2179
2180 tp->irs = th->th_seq;
2181 tcp_rcvseqinit(tp);
2182 if (thflags & TH_ACK) {
2183 tcpstat.tcps_connects++;
2184
2185 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
2186 /* ECN-setup SYN-ACK */
2187 tp->ecn_flags |= TE_SETUPRECEIVED;
2188 }
2189 else {
2190 /* non-ECN-setup SYN-ACK */
2191 tp->ecn_flags &= ~TE_SENDIPECT;
2192 }
2193
2194 #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
2195 /* XXXMAC: recursive lock: SOCK_LOCK(so); */
2196 mac_socketpeer_label_associate_mbuf(m, so);
2197 /* XXXMAC: SOCK_UNLOCK(so); */
2198 #endif
2199 /* Do window scaling on this connection? */
2200 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2201 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2202 tp->snd_scale = tp->requested_s_scale;
2203 tp->rcv_scale = tp->request_r_scale;
2204 }
2205 tp->rcv_adv += tp->rcv_wnd;
2206 tp->snd_una++; /* SYN is acked */
2207 /*
2208 * If there's data, delay ACK; if there's also a FIN
2209 * ACKNOW will be turned on later.
2210 */
2211 if (DELAY_ACK(tp, th) && tlen != 0) {
2212 if ((tp->t_flags & TF_DELACK) == 0) {
2213 tp->t_flags |= TF_DELACK;
2214 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2215 }
2216 tp->t_unacksegs++;
2217 }
2218 else {
2219 tp->t_flags |= TF_ACKNOW;
2220 }
2221 /*
2222 * Received <SYN,ACK> in SYN_SENT[*] state.
2223 * Transitions:
2224 * SYN_SENT --> ESTABLISHED
2225 * SYN_SENT* --> FIN_WAIT_1
2226 */
2227 tp->t_starttime = tcp_now;
2228 if (tp->t_flags & TF_NEEDFIN) {
2229 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2230 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
2231 tp->t_state = TCPS_FIN_WAIT_1;
2232 tp->t_flags &= ~TF_NEEDFIN;
2233 thflags &= ~TH_SYN;
2234 } else {
2235 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2236 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
2237 tp->t_state = TCPS_ESTABLISHED;
2238 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
2239 if (nstat_collect)
2240 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
2241 }
2242 isconnected = TRUE;
2243 } else {
2244 /*
2245 * Received initial SYN in SYN-SENT[*] state => simul-
2246 * taneous open. If segment contains CC option and there is
2247 * a cached CC, apply TAO test; if it succeeds, connection is
2248 * half-synchronized. Otherwise, do 3-way handshake:
2249 * SYN-SENT -> SYN-RECEIVED
2250 * SYN-SENT* -> SYN-RECEIVED*
2251 */
2252 tp->t_flags |= TF_ACKNOW;
2253 tp->t_timer[TCPT_REXMT] = 0;
2254 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2255 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
2256 tp->t_state = TCPS_SYN_RECEIVED;
2257
2258 }
2259
2260 trimthenstep6:
2261 /*
2262 * Advance th->th_seq to correspond to first data byte.
2263 * If data, trim to stay within window,
2264 * dropping FIN if necessary.
2265 */
2266 th->th_seq++;
2267 if (tlen > tp->rcv_wnd) {
2268 todrop = tlen - tp->rcv_wnd;
2269 m_adj(m, -todrop);
2270 tlen = tp->rcv_wnd;
2271 thflags &= ~TH_FIN;
2272 tcpstat.tcps_rcvpackafterwin++;
2273 tcpstat.tcps_rcvbyteafterwin += todrop;
2274 }
2275 tp->snd_wl1 = th->th_seq - 1;
2276 tp->rcv_up = th->th_seq;
2277 /*
2278 * Client side of transaction: already sent SYN and data.
2279 * If the remote host used T/TCP to validate the SYN,
2280 * our data will be ACK'd; if so, enter normal data segment
2281 * processing in the middle of step 5, ack processing.
2282 * Otherwise, goto step 6.
2283 */
2284 if (thflags & TH_ACK)
2285 goto process_ACK;
2286 goto step6;
2287 /*
2288 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
2289 * do normal processing.
2290 *
2291 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
2292 */
2293 case TCPS_LAST_ACK:
2294 case TCPS_CLOSING:
2295 case TCPS_TIME_WAIT:
2296 break; /* continue normal processing */
2297
2298 /* Received a SYN while connection is already established.
2299 * This is a "half open connection and other anomalies" described
2300 * in RFC 793 page 34; send an ACK so the remote side can reset the
2301 * connection or recover by adjusting its sequence numbering.
2302 */
2303 case TCPS_ESTABLISHED:
2304 if (thflags & TH_SYN)
2305 goto dropafterack;
2306 break;
2307 }
2308
2309 /*
2310 * States other than LISTEN or SYN_SENT.
2311 * First check the RST flag and sequence number since reset segments
2312 * are exempt from the timestamp and connection count tests. This
2313 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
2314 * below which allowed reset segments in half the sequence space
2315 * to fall though and be processed (which gives forged reset
2316 * segments with a random sequence number a 50 percent chance of
2317 * killing a connection).
2318 * Then check timestamp, if present.
2319 * Then check the connection count, if present.
2320 * Then check that at least some bytes of segment are within
2321 * receive window. If segment begins before rcv_nxt,
2322 * drop leading data (and SYN); if nothing left, just ack.
2323 *
2324 *
2325 * If the RST bit is set, check the sequence number to see
2326 * if this is a valid reset segment.
2327 * RFC 793 page 37:
2328 * In all states except SYN-SENT, all reset (RST) segments
2329 * are validated by checking their SEQ-fields. A reset is
2330 * valid if its sequence number is in the window.
2331 * Note: this does not take into account delayed ACKs, so
2332 * we should test against last_ack_sent instead of rcv_nxt.
2333 * The sequence number in the reset segment is normally an
2334 * echo of our outgoing acknowledgement numbers, but some hosts
2335 * send a reset with the sequence number at the rightmost edge
2336 * of our receive window, and we have to handle this case.
2337 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
2338 * that brute force RST attacks are possible. To combat this,
2339 * we use a much stricter check while in the ESTABLISHED state,
2340 * only accepting RSTs where the sequence number is equal to
2341 * last_ack_sent. In all other states (the states in which a
2342 * RST is more likely), the more permissive check is used.
2343 * If we have multiple segments in flight, the initial reset
2344 * segment sequence numbers will be to the left of last_ack_sent,
2345 * but they will eventually catch up.
2346 * In any case, it never made sense to trim reset segments to
2347 * fit the receive window since RFC 1122 says:
2348 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
2349 *
2350 * A TCP SHOULD allow a received RST segment to include data.
2351 *
2352 * DISCUSSION
2353 * It has been suggested that a RST segment could contain
2354 * ASCII text that encoded and explained the cause of the
2355 * RST. No standard has yet been established for such
2356 * data.
2357 *
2358 * If the reset segment passes the sequence number test examine
2359 * the state:
2360 * SYN_RECEIVED STATE:
2361 * If passive open, return to LISTEN state.
2362 * If active open, inform user that connection was refused.
2363 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
2364 * Inform user that connection was reset, and close tcb.
2365 * CLOSING, LAST_ACK STATES:
2366 * Close the tcb.
2367 * TIME_WAIT STATE:
2368 * Drop the segment - see Stevens, vol. 2, p. 964 and
2369 * RFC 1337.
2370 *
2371 * Radar 4803931: Allows for the case where we ACKed the FIN but
2372 * there is already a RST in flight from the peer.
2373 * In that case, accept the RST for non-established
2374 * state if it's one off from last_ack_sent.
2375 *
2376 */
2377 if (thflags & TH_RST) {
2378 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2379 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
2380 (tp->rcv_wnd == 0 &&
2381 ((tp->last_ack_sent == th->th_seq) || ((tp->last_ack_sent -1) == th->th_seq)))) {
2382 switch (tp->t_state) {
2383
2384 case TCPS_SYN_RECEIVED:
2385 so->so_error = ECONNREFUSED;
2386 goto close;
2387
2388 case TCPS_ESTABLISHED:
2389 if (tp->last_ack_sent != th->th_seq) {
2390 tcpstat.tcps_badrst++;
2391 goto drop;
2392 }
2393 case TCPS_FIN_WAIT_1:
2394 case TCPS_CLOSE_WAIT:
2395 /*
2396 * Drop through ...
2397 */
2398 case TCPS_FIN_WAIT_2:
2399 so->so_error = ECONNRESET;
2400 close:
2401 postevent(so, 0, EV_RESET);
2402 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2403 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
2404 tp->t_state = TCPS_CLOSED;
2405 tcpstat.tcps_drops++;
2406 tp = tcp_close(tp);
2407 break;
2408
2409 case TCPS_CLOSING:
2410 case TCPS_LAST_ACK:
2411 tp = tcp_close(tp);
2412 break;
2413
2414 case TCPS_TIME_WAIT:
2415 break;
2416 }
2417 }
2418 goto drop;
2419 }
2420
2421 /*
2422 * RFC 1323 PAWS: If we have a timestamp reply on this segment
2423 * and it's less than ts_recent, drop it.
2424 */
2425 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
2426 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
2427
2428 /* Check to see if ts_recent is over 24 days old. */
2429 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
2430 /*
2431 * Invalidate ts_recent. If this segment updates
2432 * ts_recent, the age will be reset later and ts_recent
2433 * will get a valid value. If it does not, setting
2434 * ts_recent to zero will at least satisfy the
2435 * requirement that zero be placed in the timestamp
2436 * echo reply when ts_recent isn't valid. The
2437 * age isn't reset until we get a valid ts_recent
2438 * because we don't want out-of-order segments to be
2439 * dropped when ts_recent is old.
2440 */
2441 tp->ts_recent = 0;
2442 } else {
2443 tcpstat.tcps_rcvduppack++;
2444 tcpstat.tcps_rcvdupbyte += tlen;
2445 tcpstat.tcps_pawsdrop++;
2446 if (nstat_collect) {
2447 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, tlen, NSTAT_RX_FLAG_DUPLICATE);
2448 locked_add_64(&inp->inp_stat->rxpackets, 1);
2449 locked_add_64(&inp->inp_stat->rxbytes, tlen);
2450 tp->t_stat.rxduplicatebytes += tlen;
2451 }
2452 if (tlen)
2453 goto dropafterack;
2454 goto drop;
2455 }
2456 }
2457
2458 /*
2459 * In the SYN-RECEIVED state, validate that the packet belongs to
2460 * this connection before trimming the data to fit the receive
2461 * window. Check the sequence number versus IRS since we know
2462 * the sequence numbers haven't wrapped. This is a partial fix
2463 * for the "LAND" DoS attack.
2464 */
2465 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
2466 rstreason = BANDLIM_RST_OPENPORT;
2467 goto dropwithreset;
2468 }
2469
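/*
 * todrop is the number of bytes at the front of this segment that
 * duplicate data we have already received; trim the overlap before
 * queueing.
 */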
2470 todrop = tp->rcv_nxt - th->th_seq;
2471 if (todrop > 0) {
2472 if (thflags & TH_SYN) {
2473 thflags &= ~TH_SYN;
2474 th->th_seq++;
2475 if (th->th_urp > 1)
2476 th->th_urp--;
2477 else
2478 thflags &= ~TH_URG;
2479 todrop--;
2480 }
2481 /*
2482 * Following if statement from Stevens, vol. 2, p. 960.
2483 */
2484 if (todrop > tlen
2485 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
2486 /*
2487 * Any valid FIN must be to the left of the window.
2488 * At this point the FIN must be a duplicate or out
2489 * of sequence; drop it.
2490 */
2491 thflags &= ~TH_FIN;
2492
2493 /*
2494 * Send an ACK to resynchronize and drop any data.
2495 * But keep on processing for RST or ACK.
2496 */
2497 tp->t_flags |= TF_ACKNOW;
2498 todrop = tlen;
2499 tcpstat.tcps_rcvduppack++;
2500 tcpstat.tcps_rcvdupbyte += todrop;
2501 } else {
2502 tcpstat.tcps_rcvpartduppack++;
2503 tcpstat.tcps_rcvpartdupbyte += todrop;
2504 }
2505 if (nstat_collect) {
2506 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, todrop, NSTAT_RX_FLAG_DUPLICATE);
2507 locked_add_64(&inp->inp_stat->rxpackets, 1);
2508 locked_add_64(&inp->inp_stat->rxbytes, todrop);
2509 tp->t_stat.rxduplicatebytes += todrop;
2510 }
2511 drop_hdrlen += todrop; /* drop from the top afterwards */
2512 th->th_seq += todrop;
2513 tlen -= todrop;
2514 if (th->th_urp > todrop)
2515 th->th_urp -= todrop;
2516 else {
2517 thflags &= ~TH_URG;
2518 th->th_urp = 0;
2519 }
2520 }
2521
2522 /*
2523 * If new data are received on a connection after the
2524 * user processes are gone, then RST the other end.
2525 */
2526 if ((so->so_state & SS_NOFDREF) &&
2527 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
2528 tp = tcp_close(tp);
2529 tcpstat.tcps_rcvafterclose++;
2530 rstreason = BANDLIM_UNLIMITED;
2531 goto dropwithreset;
2532 }
2533
2534 /*
2535 * If segment ends after window, drop trailing data
2536 * (and PUSH and FIN); if nothing left, just ACK.
2537 */
2538 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
2539 if (todrop > 0) {
2540 tcpstat.tcps_rcvpackafterwin++;
2541 if (todrop >= tlen) {
2542 tcpstat.tcps_rcvbyteafterwin += tlen;
2543 /*
2544 * If a new connection request is received
2545 * while in TIME_WAIT, drop the old connection
2546 * and start over if the sequence numbers
2547 * are above the previous ones.
2548 */
2549 if (thflags & TH_SYN &&
2550 tp->t_state == TCPS_TIME_WAIT &&
2551 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2552 iss = tcp_new_isn(tp);
2553 tp = tcp_close(tp);
2554 tcp_unlock(so, 1, 0);
2555 goto findpcb;
2556 }
2557 /*
2558 * If window is closed can only take segments at
2559 * window edge, and have to drop data and PUSH from
2560 * incoming segments. Continue processing, but
2561 * remember to ack. Otherwise, drop segment
2562 * and ack.
2563 */
2564 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2565 tp->t_flags |= TF_ACKNOW;
2566 tcpstat.tcps_rcvwinprobe++;
2567 } else
2568 goto dropafterack;
2569 } else
2570 tcpstat.tcps_rcvbyteafterwin += todrop;
2571 m_adj(m, -todrop);
2572 tlen -= todrop;
2573 thflags &= ~(TH_PUSH|TH_FIN);
2574 }
2575
2576 /*
2577 * If last ACK falls within this segment's sequence numbers,
2578 * record its timestamp.
2579 * NOTE:
2580 * 1) That the test incorporates suggestions from the latest
2581 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2582 * 2) That updating only on newer timestamps interferes with
2583 * our earlier PAWS tests, so this check should be solely
2584 * predicated on the sequence space of this segment.
2585 * 3) That we modify the segment boundary check to be
2586 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
2587 * instead of RFC1323's
2588 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
2589 * This modified check allows us to overcome RFC1323's
2590 * limitations as described in Stevens TCP/IP Illustrated
2591 * Vol. 2 p.869. In such cases, we can still calculate the
2592 * RTT correctly when RCV.NXT == Last.ACK.Sent.
2593 */
2594 if ((to.to_flags & TOF_TS) != 0 &&
2595 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2596 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2597 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
2598 tp->ts_recent_age = tcp_now;
2599 tp->ts_recent = to.to_tsval;
2600 }
2601
2602 /*
2603 * If a SYN is in the window, then this is an
2604 * error and we send an RST and drop the connection.
2605 */
2606 if (thflags & TH_SYN) {
2607 tp = tcp_drop(tp, ECONNRESET);
2608 rstreason = BANDLIM_UNLIMITED;
2609 postevent(so, 0, EV_RESET);
2610 goto dropwithreset;
2611 }
2612
2613 /*
2614 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
2615 * flag is on (half-synchronized state), then queue data for
2616 * later processing; else drop segment and return.
2617 */
2618 if ((thflags & TH_ACK) == 0) {
2619 if (tp->t_state == TCPS_SYN_RECEIVED ||
2620 (tp->t_flags & TF_NEEDSYN))
2621 goto step6;
2622 else if (tp->t_flags & TF_ACKNOW)
2623 goto dropafterack;
2624 else
2625 goto drop;
2626 }
2627
2628 /*
2629 * Ack processing.
2630 */
2631 switch (tp->t_state) {
2632
2633 /*
2634 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
2635 * ESTABLISHED state and continue processing.
2636 * The ACK was checked above.
2637 */
2638 case TCPS_SYN_RECEIVED:
2639
2640 tcpstat.tcps_connects++;
2641
2642 /* Do window scaling? */
2643 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2644 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2645 tp->snd_scale = tp->requested_s_scale;
2646 tp->rcv_scale = tp->request_r_scale;
2647 tp->snd_wnd = th->th_win << tp->snd_scale;
2648 tiwin = tp->snd_wnd;
2649 }
2650 /*
2651 * Make transitions:
2652 * SYN-RECEIVED -> ESTABLISHED
2653 * SYN-RECEIVED* -> FIN-WAIT-1
2654 */
2655 tp->t_starttime = tcp_now;
2656 if (tp->t_flags & TF_NEEDFIN) {
2657 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2658 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
2659 tp->t_state = TCPS_FIN_WAIT_1;
2660 tp->t_flags &= ~TF_NEEDFIN;
2661 } else {
2662 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2663 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
2664 tp->t_state = TCPS_ESTABLISHED;
2665 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
2666 if (nstat_collect)
2667 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
2668 }
2669 /*
2670 * If segment contains data or ACK, will call tcp_reass()
2671 * later; if not, do so now to pass queued data to user.
2672 */
2673 if (tlen == 0 && (thflags & TH_FIN) == 0)
2674 (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
2675 (struct mbuf *)0);
2676 tp->snd_wl1 = th->th_seq - 1;
2677
2678 /* FALLTHROUGH */
2679
2680 isconnected = TRUE;
2681
2682 /*
2683 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2684 * ACKs. If the ack is in the range
2685 * tp->snd_una < th->th_ack <= tp->snd_max
2686 * then advance tp->snd_una to th->th_ack and drop
2687 * data from the retransmission queue. If this ACK reflects
2688 * more up to date window information we update our window information.
2689 */
2690 case TCPS_ESTABLISHED:
2691 case TCPS_FIN_WAIT_1:
2692 case TCPS_FIN_WAIT_2:
2693 case TCPS_CLOSE_WAIT:
2694 case TCPS_CLOSING:
2695 case TCPS_LAST_ACK:
2696 case TCPS_TIME_WAIT:
2697 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2698 tcpstat.tcps_rcvacktoomuch++;
2699 goto dropafterack;
2700 }
2701 if (tp->sack_enable &&
2702 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
2703 tcp_sack_doack(tp, &to, th->th_ack);
2704 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2705 if (tlen == 0 && tiwin == tp->snd_wnd) {
2706 tcpstat.tcps_rcvdupack++;
2707 /*
2708 * If we have outstanding data (other than
2709 * a window probe), this is a completely
2710 * duplicate ack (ie, window info didn't
2711 * change), the ack is the biggest we've
2712 * seen and we've seen exactly our rexmt
2713 * threshold of them, assume a packet
2714 * has been dropped and retransmit it.
2715 * Kludge snd_nxt & the congestion
2716 * window so we send only this one
2717 * packet.
2718 *
2719 * We know we're losing at the current
2720 * window size so do congestion avoidance
2721 * (set ssthresh to half the current window
2722 * and pull our congestion window back to
2723 * the new ssthresh).
2724 *
2725 * Dup acks mean that packets have left the
2726 * network (they're now cached at the receiver)
2727 * so bump cwnd by the amount in the receiver
2728 * to keep a constant cwnd packets in the
2729 * network.
2730 */
2731 if (tp->t_timer[TCPT_REXMT] == 0 ||
2732 th->th_ack != tp->snd_una)
2733 tp->t_dupacks = 0;
2734 else if (++tp->t_dupacks > tcprexmtthresh ||
2735 IN_FASTRECOVERY(tp)) {
2736 if (tp->sack_enable && IN_FASTRECOVERY(tp)) {
2737 int awnd;
2738
2739 /*
2740 * Compute the amount of data in flight first.
2741 * We can inject new data into the pipe iff
2742 * we have less than 1/2 the original window's
2743 * worth of data in flight.
2744 */
2745 awnd = (tp->snd_nxt - tp->snd_fack) +
2746 tp->sackhint.sack_bytes_rexmit;
2747 if (awnd < tp->snd_ssthresh) {
2748 tp->snd_cwnd += tp->t_maxseg;
2749 if (tp->snd_cwnd > tp->snd_ssthresh)
2750 tp->snd_cwnd = tp->snd_ssthresh;
2751 }
2752 } else
2753 tp->snd_cwnd += tp->t_maxseg;
2754
2755 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2756 struct tcpcb *, tp, struct tcphdr *, th,
2757 int32_t, TCP_CC_IN_FASTRECOVERY);
2758
2759 (void) tcp_output(tp);
2760 goto drop;
2761 } else if (tp->t_dupacks == tcprexmtthresh) {
2762 tcp_seq onxt = tp->snd_nxt;
2763
2764 /*
2765 * If we're doing sack, check to
2766 * see if we're already in sack
2767 * recovery. If we're not doing sack,
2768 * check to see if we're in newreno
2769 * recovery.
2770 */
2771 if (tp->sack_enable) {
2772 if (IN_FASTRECOVERY(tp)) {
2773 tp->t_dupacks = 0;
2774 break;
2775 }
2776 } else {
2777 if (SEQ_LEQ(th->th_ack,
2778 tp->snd_recover)) {
2779 tp->t_dupacks = 0;
2780 break;
2781 }
2782 }
2783
2784 /*
2785 * If the current tcp cc module has
2786 * defined a hook for tasks to run
2787 * before entering FR, call it
2788 */
2789 if (CC_ALGO(tp)->pre_fr != NULL)
2790 CC_ALGO(tp)->pre_fr(tp, th);
2791 ENTER_FASTRECOVERY(tp);
2792 tp->snd_recover = tp->snd_max;
2793 tp->t_timer[TCPT_REXMT] = 0;
2794 tp->t_rtttime = 0;
2795 tp->ecn_flags |= TE_SENDCWR;
2796 if (tp->sack_enable) {
2797 tcpstat.tcps_sack_recovery_episode++;
2798 tp->sack_newdata = tp->snd_nxt;
2799 tp->snd_cwnd = tp->t_maxseg;
2800
2801 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2802 struct tcpcb *, tp, struct tcphdr *, th,
2803 int32_t, TCP_CC_ENTER_FASTRECOVERY);
2804
2805 (void) tcp_output(tp);
2806 goto drop;
2807 }
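/*
 * NewReno fast retransmit: resend the missing segment by pulling
 * snd_nxt back to th_ack with a one-segment cwnd, then re-inflate
 * cwnd by one segment per duplicate ACK seen so far.
 */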
2808 tp->snd_nxt = th->th_ack;
2809 tp->snd_cwnd = tp->t_maxseg;
2810 (void) tcp_output(tp);
2811 tp->snd_cwnd = tp->snd_ssthresh +
2812 tp->t_maxseg * tp->t_dupacks;
2813 if (SEQ_GT(onxt, tp->snd_nxt))
2814 tp->snd_nxt = onxt;
2815 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2816 struct tcpcb *, tp, struct tcphdr *, th,
2817 int32_t, TCP_CC_ENTER_FASTRECOVERY);
2818 goto drop;
2819 }
2820 } else
2821 tp->t_dupacks = 0;
2822 break;
2823 }
2824 /*
2825 * If the congestion window was inflated to account
2826 * for the other side's cached packets, retract it.
2827 */
2828 if (IN_FASTRECOVERY(tp)) {
2829 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2830 if (tp->sack_enable)
2831 tcp_sack_partialack(tp, th);
2832 else
2833 tcp_newreno_partial_ack(tp, th);
2834
2835 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2836 struct tcpcb *, tp, struct tcphdr *, th,
2837 int32_t, TCP_CC_PARTIAL_ACK);
2838 } else {
2839 EXIT_FASTRECOVERY(tp);
2840 if (CC_ALGO(tp)->post_fr != NULL)
2841 CC_ALGO(tp)->post_fr(tp, th);
2842 tp->t_dupacks = 0;
2843
2844 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2845 struct tcpcb *, tp, struct tcphdr *, th,
2846 int32_t, TCP_CC_EXIT_FASTRECOVERY);
2847 }
2848 } else {
2849 /*
2850 * We were not in fast recovery. Reset the duplicate ack
2851 * counter.
2852 */
2853 tp->t_dupacks = 0;
2854 }
2855
2856
2857 /*
2858 * If we reach this point, ACK is not a duplicate,
2859 * i.e., it ACKs something we sent.
2860 */
2861 if (tp->t_flags & TF_NEEDSYN) {
2862 /*
2863 * T/TCP: Connection was half-synchronized, and our
2864 * SYN has been ACK'd (so connection is now fully
2865 * synchronized). Go to non-starred state,
2866 * increment snd_una for ACK of SYN, and check if
2867 * we can do window scaling.
2868 */
2869 tp->t_flags &= ~TF_NEEDSYN;
2870 tp->snd_una++;
2871 /* Do window scaling? */
2872 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2873 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2874 tp->snd_scale = tp->requested_s_scale;
2875 tp->rcv_scale = tp->request_r_scale;
2876 }
2877 }
2878
2879 process_ACK:
2880 acked = th->th_ack - tp->snd_una;
2881 tcpstat.tcps_rcvackpack++;
2882 tcpstat.tcps_rcvackbyte += acked;
2883
2884 /*
2885 * If we just performed our first retransmit, and the ACK
2886 * arrives within our recovery window, then it was a mistake
2887 * to do the retransmit in the first place. Recover our
2888 * original cwnd and ssthresh, and proceed to transmit where
2889 * we left off.
2890 */
2891 if (tp->t_rxtshift == 1 &&
2892 TSTMP_LT(tcp_now, tp->t_badrxtwin)) {
2893 ++tcpstat.tcps_sndrexmitbad;
2894 tp->snd_cwnd = tp->snd_cwnd_prev;
2895 tp->snd_ssthresh = tp->snd_ssthresh_prev;
2896 tp->snd_recover = tp->snd_recover_prev;
2897 if (tp->t_flags & TF_WASFRECOVERY)
2898 ENTER_FASTRECOVERY(tp);
2899 tp->snd_nxt = tp->snd_max;
2900 tp->t_badrxtwin = 0; /* XXX probably not required */
2901 tp->t_rxtshift = 0;
2902 tp->rxt_start = 0;
2903
2904 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2905 struct tcpcb *, tp, struct tcphdr *, th,
2906 int32_t, TCP_CC_BAD_REXMT_RECOVERY);
2907 }
2908
2909 /*
2910 * If we have a timestamp reply, update smoothed
2911 * round trip time. If no timestamp is present but
2912 * transmit timer is running and timed sequence
2913 * number was acked, update smoothed round trip time.
2914 * Since we now have an rtt measurement, cancel the
2915 * timer backoff (cf., Phil Karn's retransmit alg.).
2916 * Recompute the initial retransmit timer.
2917 * Also makes sure we have a valid time stamp in hand
2918 *
2919 * Some boxes send broken timestamp replies
2920 * during the SYN+ACK phase, ignore
2921 * timestamps of 0 or we could calculate a
2922 * huge RTT and blow up the retransmit timer.
2923 */
2924 if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) &&
2925 TSTMP_GEQ(tcp_now, to.to_tsecr)) {
2926 tcp_xmit_timer(tp, tcp_now - to.to_tsecr);
2927 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2928 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2929 }
2930
2931 /*
2932 * If all outstanding data is acked, stop retransmit
2933 * timer and remember to restart (more output or persist).
2934 * If there is more data to be acked, restart retransmit
2935 * timer, using current (possibly backed-off) value.
2936 */
2937 if (th->th_ack == tp->snd_max) {
2938 tp->t_timer[TCPT_REXMT] = 0;
2939 needoutput = 1;
2940 } else if (tp->t_timer[TCPT_PERSIST] == 0)
2941 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
2942
2943 /*
2944 * If no data (only SYN) was ACK'd,
2945 * skip rest of ACK processing.
2946 */
2947 if (acked == 0)
2948 goto step6;
2949
2950 if ((thflags & TH_ECE) != 0 &&
2951 (tp->ecn_flags & TE_SETUPSENT) != 0) {
2952 /*
2953 * Reduce the congestion window if we haven't done so.
2954 */
2955 if (!tp->sack_enable && !IN_FASTRECOVERY(tp) &&
2956 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
2957 tcp_reduce_congestion_window(tp, th);
2958 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2959 struct tcpcb *, tp, struct tcphdr *, th,
2960 int32_t, TCP_CC_ECN_RCVD);
2961 }
2962 }
2963
2964 /*
2965 * When new data is acked, open the congestion window.
2966 * The specifics of how this is achieved are up to the
2967 * congestion control algorithm in use for this connection.
2968 *
2969 * The calculations in this function assume that snd_una is
2970 * not updated yet.
2971 */
2972 if (!IN_FASTRECOVERY(tp)) {
2973 if (CC_ALGO(tp)->ack_rcvd != NULL)
2974 CC_ALGO(tp)->ack_rcvd(tp, th);
2975
2976 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2977 struct tcpcb *, tp, struct tcphdr *, th,
2978 int32_t, TCP_CC_ACK_RCVD);
2979 }
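/*
 * If the ACK covers more than the data left in the send buffer,
 * it must also cover our FIN, which occupies one sequence number
 * beyond the last data byte.
 */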
2980 if (acked > so->so_snd.sb_cc) {
2981 tp->snd_wnd -= so->so_snd.sb_cc;
2982 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2983 ourfinisacked = 1;
2984 } else {
2985 sbdrop(&so->so_snd, acked);
2986 tp->snd_wnd -= acked;
2987 ourfinisacked = 0;
2988 }
2989 /* detect una wraparound */
2990 if ( !IN_FASTRECOVERY(tp) &&
2991 SEQ_GT(tp->snd_una, tp->snd_recover) &&
2992 SEQ_LEQ(th->th_ack, tp->snd_recover))
2993 tp->snd_recover = th->th_ack - 1;
2994
2995 if (IN_FASTRECOVERY(tp) &&
2996 SEQ_GEQ(th->th_ack, tp->snd_recover))
2997 EXIT_FASTRECOVERY(tp);
2998
2999 tp->snd_una = th->th_ack;
3000 if (tp->sack_enable) {
3001 if (SEQ_GT(tp->snd_una, tp->snd_recover))
3002 tp->snd_recover = tp->snd_una;
3003 }
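/* A retransmit may have pulled snd_nxt behind the new snd_una */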
3004 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3005 tp->snd_nxt = tp->snd_una;
3006
3007 /*
3008 * sowwakeup must happen after snd_una, et al. are updated so that
3009 * the sequence numbers are in sync with so_snd
3010 */
3011 sowwakeup(so);
3012
3013 switch (tp->t_state) {
3014
3015 /*
3016 * In FIN_WAIT_1 STATE in addition to the processing
3017 * for the ESTABLISHED state if our FIN is now acknowledged
3018 * then enter FIN_WAIT_2.
3019 */
3020 case TCPS_FIN_WAIT_1:
3021 if (ourfinisacked) {
3022 /*
3023 * If we can't receive any more
3024 * data, then closing user can proceed.
3025 * Starting the timer is contrary to the
3026 * specification, but if we don't get a FIN
3027 * we'll hang forever.
3028 */
3029 if (so->so_state & SS_CANTRCVMORE) {
3030 add_to_time_wait(tp, tcp_maxidle);
3031 isconnected = FALSE;
3032 isdisconnected = TRUE;
3033 }
3034 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3035 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_2);
3036 tp->t_state = TCPS_FIN_WAIT_2;
3037 /* fall through and make sure we also recognize data ACKed with the FIN */
3038 }
3039 tp->t_flags |= TF_ACKNOW;
3040 break;
3041
3042 /*
3043 * In CLOSING STATE in addition to the processing for
3044 * the ESTABLISHED state if the ACK acknowledges our FIN
3045 * then enter the TIME-WAIT state, otherwise ignore
3046 * the segment.
3047 */
3048 case TCPS_CLOSING:
3049 if (ourfinisacked) {
3050 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3051 struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT);
3052 tp->t_state = TCPS_TIME_WAIT;
3053 tcp_canceltimers(tp);
3054 /* Shorten TIME_WAIT [RFC-1644, p.28] */
3055 if (tp->cc_recv != 0 &&
3056 ((int)(tcp_now - tp->t_starttime)) < tcp_msl)
3057 add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC);
3058 else
3059 add_to_time_wait(tp, 2 * tcp_msl);
3060 isconnected = FALSE;
3061 isdisconnected = TRUE;
3062 }
3063 tp->t_flags |= TF_ACKNOW;
3064 break;
3065
3066 /*
3067 * In LAST_ACK, we may still be waiting for data to drain
3068 * and/or to be acked, as well as for the ack of our FIN.
3069 * If our FIN is now acknowledged, delete the TCB,
3070 * enter the closed state and return.
3071 */
3072 case TCPS_LAST_ACK:
3073 if (ourfinisacked) {
3074 tp = tcp_close(tp);
3075 goto drop;
3076 }
3077 break;
3078
3079 /*
3080 * In TIME_WAIT state the only thing that should arrive
3081 * is a retransmission of the remote FIN. Acknowledge
3082 * it and restart the finack timer.
3083 */
3084 case TCPS_TIME_WAIT:
3085 add_to_time_wait(tp, 2 * tcp_msl);
3086 goto dropafterack;
3087 }
3088 }
3089
3090 step6:
3091 /*
3092 * Update window information.
3093 * Don't look at window if no ACK: TAC's send garbage on first SYN.
3094 */
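/*
 * snd_wl1/snd_wl2 record the seq/ack of the last segment used to
 * update the window; accept this segment's window only if it is
 * newer by that test (RFC 793's SND.WL1/SND.WL2 check).
 */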
3095 if ((thflags & TH_ACK) &&
3096 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
3097 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
3098 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
3099 /* keep track of pure window updates */
3100 if (tlen == 0 &&
3101 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
3102 tcpstat.tcps_rcvwinupd++;
3103 tp->snd_wnd = tiwin;
3104 tp->snd_wl1 = th->th_seq;
3105 tp->snd_wl2 = th->th_ack;
3106 if (tp->snd_wnd > tp->max_sndwnd)
3107 tp->max_sndwnd = tp->snd_wnd;
3108 needoutput = 1;
3109 }
3110
3111 /*
3112 * Process segments with URG.
3113 */
3114 if ((thflags & TH_URG) && th->th_urp &&
3115 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3116 /*
3117 * This is a kludge, but if we receive and accept
3118 * random urgent pointers, we'll crash in
3119 * soreceive. It's hard to imagine someone
3120 * actually wanting to send this much urgent data.
3121 */
3122 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
3123 th->th_urp = 0; /* XXX */
3124 thflags &= ~TH_URG; /* XXX */
3125 goto dodata; /* XXX */
3126 }
3127 /*
3128 * If this segment advances the known urgent pointer,
3129 * then mark the data stream. This should not happen
3130 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
3131 * a FIN has been received from the remote side.
3132 * In these states we ignore the URG.
3133 *
3134 * According to RFC961 (Assigned Protocols),
3135 * the urgent pointer points to the last octet
3136 * of urgent data. We continue, however,
3137 * to consider it to indicate the first octet
3138 * of data past the urgent section as the original
3139 * spec states (in one of two places).
3140 */
3141 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
3142 tp->rcv_up = th->th_seq + th->th_urp;
3143 so->so_oobmark = so->so_rcv.sb_cc +
3144 (tp->rcv_up - tp->rcv_nxt) - 1;
3145 if (so->so_oobmark == 0) {
3146 so->so_state |= SS_RCVATMARK;
3147 postevent(so, 0, EV_OOB);
3148 }
3149 sohasoutofband(so);
3150 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
3151 }
3152 /*
3153 * Remove out of band data so doesn't get presented to user.
3154 * This can happen independent of advancing the URG pointer,
3155 * but if two URG's are pending at once, some out-of-band
3156 * data may creep in... ick.
3157 */
3158 if (th->th_urp <= (u_int32_t)tlen
3159 #if SO_OOBINLINE
3160 && (so->so_options & SO_OOBINLINE) == 0
3161 #endif
3162 )
3163 tcp_pulloutofband(so, th, m,
3164 drop_hdrlen); /* hdr drop is delayed */
3165 } else {
3166 /*
3167 * If no out of band data is expected,
3168 * pull receive urgent pointer along
3169 * with the receive window.
3170 */
3171 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
3172 tp->rcv_up = tp->rcv_nxt;
3173 }
3174 dodata:
3175
3176 /* Set the socket's connect or disconnect state correctly before doing data.
3177 * The following might unlock the socket if there is an upcall or a socket
3178 * filter.
3179 */
3180 if (isconnected) {
3181 soisconnected(so);
3182 } else if (isdisconnected) {
3183 soisdisconnected(so);
3184 }
3185
3186 /* Check the state of the pcb to make sure that it did not get closed
3187 * while we were unlocked above
3188 */
3189 if (inp->inp_state == INPCB_STATE_DEAD) {
3190 /* Just drop the packet that we are processing and return */
3191 goto drop;
3192 }
3193
3194 /*
3195 * Process the segment text, merging it into the TCP sequencing queue,
3196 * and arranging for acknowledgment of receipt if necessary.
3197 * This process logically involves adjusting tp->rcv_wnd as data
3198 * is presented to the user (this happens in tcp_usrreq.c,
3199 * case PRU_RCVD). If a FIN has already been received on this
3200 * connection then we just ignore the text.
3201 */
3202 if ((tlen || (thflags & TH_FIN)) &&
3203 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3204 tcp_seq save_start = th->th_seq;
3205 tcp_seq save_end = th->th_seq + tlen;
3206 m_adj(m, drop_hdrlen); /* delayed header drop */
3207 /*
3208 * Insert segment which includes th into TCP reassembly queue
3209 * with control block tp. Set thflags to whether reassembly now
3210 * includes a segment with FIN. This handles the common case
3211 * inline (segment is the next to be received on an established
3212 * connection, and the queue is empty), avoiding linkage into
3213 * and removal from the queue and repetition of various
3214 * conversions.
3215 * Set DELACK for segments received in order, but ack
3216 * immediately when segments are out of order (so
3217 * fast retransmit can work).
3218 */
3219 if (th->th_seq == tp->rcv_nxt &&
3220 LIST_EMPTY(&tp->t_segq) &&
3221 TCPS_HAVEESTABLISHED(tp->t_state)) {
3222 if (DELAY_ACK(tp, th) && ((tp->t_flags & TF_ACKNOW) == 0)) {
3223 if ((tp->t_flags & TF_DELACK) == 0) {
3224 tp->t_flags |= TF_DELACK;
3225 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3226 }
3227 tp->t_unacksegs++;
3228 }
3229 else {
3230 tp->t_flags |= TF_ACKNOW;
3231 }
3232 tp->rcv_nxt += tlen;
3233 thflags = th->th_flags & TH_FIN;
3234 tcpstat.tcps_rcvpack++;
3235 tcpstat.tcps_rcvbyte += tlen;
3236 if (nstat_collect) {
3237 locked_add_64(&inp->inp_stat->rxpackets, 1);
3238 locked_add_64(&inp->inp_stat->rxbytes, tlen);
3239 }
3240 ND6_HINT(tp);
3241 so_recv_data_stat(so, m, drop_hdrlen);
3242 if (sbappendstream(&so->so_rcv, m))
3243 sorwakeup(so);
3244 } else {
3245 thflags = tcp_reass(tp, th, &tlen, m);
3246 tp->t_flags |= TF_ACKNOW;
3247 }
3248
3249 if (tlen > 0 && tp->sack_enable)
3250 tcp_update_sack_list(tp, save_start, save_end);
3251
3252 if (tp->t_flags & TF_DELACK)
3253 {
3254 #if INET6
3255 if (isipv6) {
3256 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3257 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
3258 th->th_seq, th->th_ack, th->th_win);
3259 }
3260 else
3261 #endif
3262 {
3263 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3264 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
3265 th->th_seq, th->th_ack, th->th_win);
3266 }
3267
3268 }
3269 /*
3270 * Note the amount of data that peer has sent into
3271 * our window, in order to estimate the sender's
3272 * buffer size.
3273 */
3274 len = (u_int)(so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt));
3275 if (len > so->so_rcv.sb_maxused)
3276 so->so_rcv.sb_maxused = len;
3277 } else {
3278 m_freem(m);
3279 thflags &= ~TH_FIN;
3280 }
3281
3282 /*
3283 * If FIN is received ACK the FIN and let the user know
3284 * that the connection is closing.
3285 */
3286 if (thflags & TH_FIN) {
3287 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3288 socantrcvmore(so);
3289 postevent(so, 0, EV_FIN);
3290 /*
3291 * If connection is half-synchronized
3292 * (ie NEEDSYN flag on) then delay ACK,
3295 * so it may be piggybacked when SYN is sent.
3296 * Otherwise, since we received a FIN then no
3297 * more input can be expected, send ACK now.
3298 */
3299 if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
3300 if ((tp->t_flags & TF_DELACK) == 0) {
3301 tp->t_flags |= TF_DELACK;
3302 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3303 }
3304 tp->t_unacksegs++;
3305 }
3306 else {
3307 tp->t_flags |= TF_ACKNOW;
3308 }
3309 tp->rcv_nxt++;
3310 }
3311 switch (tp->t_state) {
3312
3313 /*
3314 * In SYN_RECEIVED and ESTABLISHED STATES
3315 * enter the CLOSE_WAIT state.
3316 */
3317 case TCPS_SYN_RECEIVED:
3318 tp->t_starttime = tcp_now;
3319 case TCPS_ESTABLISHED:
3320 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3321 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
3322 tp->t_state = TCPS_CLOSE_WAIT;
3323 break;
3324
3325 /*
3326 * If still in FIN_WAIT_1 STATE FIN has not been acked so
3327 * enter the CLOSING state.
3328 */
3329 case TCPS_FIN_WAIT_1:
3330 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3331 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
3332 tp->t_state = TCPS_CLOSING;
3333 break;
3334
3335 /*
3336 * In FIN_WAIT_2 state enter the TIME_WAIT state,
3337 * starting the time-wait timer, turning off the other
3338 * standard timers.
3339 */
3340 case TCPS_FIN_WAIT_2:
3341 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3342 struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT);
3343 tp->t_state = TCPS_TIME_WAIT;
3344 tcp_canceltimers(tp);
3345 /* Shorten TIME_WAIT [RFC-1644, p.28] */
3346 if (tp->cc_recv != 0 &&
3347 ((int)(tcp_now - tp->t_starttime)) < tcp_msl) {
3348 add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC);
3349 /* For transaction client, force ACK now. */
3350 tp->t_flags |= TF_ACKNOW;
3351 tp->t_unacksegs = 0;
3352 }
3353 else
3354 add_to_time_wait(tp, 2 * tcp_msl);
3355 soisdisconnected(so);
3356 break;
3357
3358 /*
3359 * In TIME_WAIT state restart the 2 MSL time_wait timer.
3360 */
3361 case TCPS_TIME_WAIT:
3362 add_to_time_wait(tp, 2 * tcp_msl);
3363 break;
3364 }
3365 }
3366 #if TCPDEBUG
3367 if (so->so_options & SO_DEBUG)
3368 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
3369 &tcp_savetcp, 0);
3370 #endif
3371
3372 /*
3373 * Return any desired output.
3374 */
3375 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
3376 (void) tcp_output(tp);
3377 }
3378
3379 tcp_check_timer_state(tp);
3380
3381
3382 tcp_unlock(so, 1, 0);
3383 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3384 return;
3385
3386 dropafterack:
3387 /*
3388 * Generate an ACK dropping incoming segment if it occupies
3389 * sequence space, where the ACK reflects our state.
3390 *
3391 * We can now skip the test for the RST flag since all
3392 * paths to this code happen after packets containing
3393 * RST have been dropped.
3394 *
3395 * In the SYN-RECEIVED state, don't send an ACK unless the
3396 * segment we received passes the SYN-RECEIVED ACK test.
3397 * If it fails send a RST. This breaks the loop in the
3398 * "LAND" DoS attack, and also prevents an ACK storm
3399 * between two listening ports that have been sent forged
3400 * SYN segments, each with the source address of the other.
3401 */
3402 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
3403 (SEQ_GT(tp->snd_una, th->th_ack) ||
3404 SEQ_GT(th->th_ack, tp->snd_max)) ) {
3405 rstreason = BANDLIM_RST_OPENPORT;
3406 goto dropwithreset;
3407 }
3408 #if TCPDEBUG
3409 if (so->so_options & SO_DEBUG)
3410 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
3411 &tcp_savetcp, 0);
3412 #endif
3413 m_freem(m);
3414 tp->t_flags |= TF_ACKNOW;
3415 (void) tcp_output(tp);
3416
3417 /* Don't need to check timer state as we should have done it during tcp_output */
3418 tcp_unlock(so, 1, 0);
3419 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3420 return;
3421 dropwithresetnosock:
3422 nosock = 1;
3423 dropwithreset:
3424 /*
3425 * Generate a RST, dropping incoming segment.
3426 * Make ACK acceptable to originator of segment.
3427 * Don't bother to respond if destination was broadcast/multicast.
3428 */
3429 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
3430 goto drop;
3431 #if INET6
3432 if (isipv6) {
3433 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
3434 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
3435 goto drop;
3436 } else
3437 #endif /* INET6 */
3438 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
3439 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
3440 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
3441 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
3442 goto drop;
3443 /* IPv6 anycast check is done at tcp6_input() */
3444
3445 /*
3446 * Perform bandwidth limiting.
3447 */
3448 #if ICMP_BANDLIM
3449 if (badport_bandlim(rstreason) < 0)
3450 goto drop;
3451 #endif
3452
3453 #if TCPDEBUG
3454 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
3455 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
3456 &tcp_savetcp, 0);
3457 #endif
3458 if (thflags & TH_ACK)
3459 /* mtod() below is safe as long as hdr dropping is delayed */
3460 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
3461 TH_RST, ifscope, nocell);
3462 else {
3463 if (thflags & TH_SYN)
3464 tlen++;
3465 /* mtod() below is safe as long as hdr dropping is delayed */
3466 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
3467 (tcp_seq)0, TH_RST|TH_ACK, ifscope, nocell);
3468 }
3469 /* destroy temporarily created socket */
3470 if (dropsocket) {
3471 (void) soabort(so);
3472 tcp_unlock(so, 1, 0);
3473 }
3474 else if ((inp != NULL) && (nosock == 0)) {
3475 tcp_unlock(so, 1, 0);
3476 }
3477 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3478 return;
3479 dropnosock:
3480 nosock = 1;
3481 drop:
3482 /*
3483 * Drop space held by incoming segment and return.
3484 */
3485 #if TCPDEBUG
3486 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
3487 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
3488 &tcp_savetcp, 0);
3489 #endif
3490 m_freem(m);
3491 /* destroy temporarily created socket */
3492 if (dropsocket) {
3493 (void) soabort(so);
3494 tcp_unlock(so, 1, 0);
3495 }
3496 else if (nosock == 0) {
3497 tcp_unlock(so, 1, 0);
3498 }
3499 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3500 return;
3501 }
3502
3503 static void
3504 tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
3505 /*
3506 * Parse TCP options and place in tcpopt.
3507 */
3508 struct tcpcb *tp;
3509 u_char *cp;
3510 int cnt;
3511 struct tcphdr *th;
3512 struct tcpopt *to;
3513 unsigned int input_ifscope;
3514 {
3515 u_short mss = 0;
3516 int opt, optlen;
3517
3518 for (; cnt > 0; cnt -= optlen, cp += optlen) {
3519 opt = cp[0];
3520 if (opt == TCPOPT_EOL)
3521 break;
3522 if (opt == TCPOPT_NOP)
3523 optlen = 1;
3524 else {
3525 if (cnt < 2)
3526 break;
3527 optlen = cp[1];
3528 if (optlen < 2 || optlen > cnt)
3529 break;
3530 }
3531 switch (opt) {
3532
3533 default:
3534 continue;
3535
3536 case TCPOPT_MAXSEG:
3537 if (optlen != TCPOLEN_MAXSEG)
3538 continue;
3539 if (!(th->th_flags & TH_SYN))
3540 continue;
3541 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
3542
3543 #if BYTE_ORDER != BIG_ENDIAN
3544 NTOHS(mss);
3545 #endif
3546
3547 break;
3548
3549 case TCPOPT_WINDOW:
3550 if (optlen != TCPOLEN_WINDOW)
3551 continue;
3552 if (!(th->th_flags & TH_SYN))
3553 continue;
3554 tp->t_flags |= TF_RCVD_SCALE;
3555 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
3556 break;
3557
3558 case TCPOPT_TIMESTAMP:
3559 if (optlen != TCPOLEN_TIMESTAMP)
3560 continue;
3561 to->to_flags |= TOF_TS;
3562 bcopy((char *)cp + 2,
3563 (char *)&to->to_tsval, sizeof(to->to_tsval));
3564
3565 #if BYTE_ORDER != BIG_ENDIAN
3566 NTOHL(to->to_tsval);
3567 #endif
3568
3569 bcopy((char *)cp + 6,
3570 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
3571
3572 #if BYTE_ORDER != BIG_ENDIAN
3573 NTOHL(to->to_tsecr);
3574 #endif
3575
3576 /*
3577 * A timestamp received in a SYN makes
3578 * it ok to send timestamp requests and replies.
3579 */
3580 if (th->th_flags & TH_SYN) {
3581 tp->t_flags |= TF_RCVD_TSTMP;
3582 tp->ts_recent = to->to_tsval;
3583 tp->ts_recent_age = tcp_now;
3584 }
3585 break;
3586 case TCPOPT_SACK_PERMITTED:
3587 if (!tcp_do_sack ||
3588 optlen != TCPOLEN_SACK_PERMITTED)
3589 continue;
3590 if (th->th_flags & TH_SYN)
3591 to->to_flags |= TOF_SACK;
3592 break;
3593 case TCPOPT_SACK:
3594 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
3595 continue;
3596 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
3597 to->to_sacks = cp + 2;
3598 tcpstat.tcps_sack_rcv_blocks++;
3599
3600 break;
3601 }
3602 }
3603 if (th->th_flags & TH_SYN)
3604 tcp_mss(tp, mss, input_ifscope); /* sets t_maxseg */
3605 }
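
/*
 * Illustrative sketch (plain C, not kernel code): the same kind/length
 * option walk as tcp_dooptions() above, reduced to a standalone helper.
 * OPT_EOL, OPT_NOP and parse_opts() are local stand-ins, not kernel
 * symbols. A usage example follows the helper.
 */
#include <stdio.h>

#define OPT_EOL 0       /* end of option list */
#define OPT_NOP 1       /* single-byte padding */

static void
parse_opts(const unsigned char *cp, int cnt)
{
        int opt, optlen;

        for (; cnt > 0; cnt -= optlen, cp += optlen) {
                opt = cp[0];
                if (opt == OPT_EOL)
                        break;
                if (opt == OPT_NOP) {
                        optlen = 1;
                        continue;
                }
                if (cnt < 2)
                        break;                  /* truncated option */
                optlen = cp[1];
                if (optlen < 2 || optlen > cnt)
                        break;                  /* malformed length */
                printf("option kind %d, length %d\n", opt, optlen);
        }
}

/*
 * Example: an MSS option (kind 2, length 4, value 1460) followed by
 * two NOPs would be parsed as
 *      unsigned char opts[] = { 2, 4, 0x05, 0xb4, 1, 1 };
 *      parse_opts(opts, sizeof(opts));
 */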
3606
3607 /*
3608 * Pull out of band byte out of a segment so
3609 * it doesn't appear in the user's data queue.
3610 * It is still reflected in the segment length for
3611 * sequencing purposes.
3612 */
3613 static void
3614 tcp_pulloutofband(so, th, m, off)
3615 struct socket *so;
3616 struct tcphdr *th;
3617 register struct mbuf *m;
3618 int off; /* hdrlen, whose dropping was delayed */
3619 {
3620 int cnt = off + th->th_urp - 1;
3621
3622 while (cnt >= 0) {
3623 if (m->m_len > cnt) {
3624 char *cp = mtod(m, caddr_t) + cnt;
3625 struct tcpcb *tp = sototcpcb(so);
3626
3627 tp->t_iobc = *cp;
3628 tp->t_oobflags |= TCPOOB_HAVEDATA;
3629 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
3630 m->m_len--;
3631 if (m->m_flags & M_PKTHDR)
3632 m->m_pkthdr.len--;
3633 return;
3634 }
3635 cnt -= m->m_len;
3636 m = m->m_next;
3637 if (m == 0)
3638 break;
3639 }
3640 panic("tcp_pulloutofband");
3641 }
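
/*
 * Illustrative sketch (plain C, not kernel code) of the byte removal
 * performed above, on a flat buffer instead of an mbuf chain: the
 * urgent byte at index cnt is saved and the rest of the buffer is
 * shifted down over it. pull_byte() is a local stand-in.
 */
#include <string.h>

static unsigned char
pull_byte(unsigned char *buf, int len, int cnt)
{
        unsigned char oob = buf[cnt];

        /* close the gap; the buffer is now len - 1 bytes long */
        memmove(buf + cnt, buf + cnt + 1, (size_t)(len - cnt - 1));
        return (oob);
}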
3642
3643 uint32_t
3644 get_base_rtt(struct tcpcb *tp)
3645 {
3646 uint32_t base_rtt = 0, i;
3647 for (i = 0; i < N_RTT_BASE; ++i) {
3648 if (tp->rtt_hist[i] != 0 &&
3649 (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
3650 base_rtt = tp->rtt_hist[i];
3651 }
3652 return base_rtt;
3653 }
3654
3655 /* Each value of the RTT base represents the minimum RTT seen in a minute.
3656 * We keep up to N_RTT_BASE minutes' worth of history.
3657 */
3658 void
3659 update_base_rtt(struct tcpcb *tp, uint32_t rtt)
3660 {
3661 if (++tp->rtt_count >= rtt_samples_per_slot) {
3662 int i = 0;
3663 for (i = (N_RTT_BASE-1); i > 0; --i) {
3664 tp->rtt_hist[i] = tp->rtt_hist[i-1];
3665 }
3666 tp->rtt_hist[0] = rtt;
3667 tp->rtt_count = 0;
3668 } else {
3669 tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
3670 }
3671 }
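
/*
 * Illustrative sketch (plain C, not kernel code) of the sliding-minimum
 * history kept by update_base_rtt()/get_base_rtt() above: slot 0 tracks
 * the minimum of the current minute, and the array shifts once enough
 * samples accumulate. A zero slot is treated as "empty" (a slight
 * simplification of the kernel's min()). N_SLOTS, SAMPLES_PER_SLOT and
 * the helpers are local stand-ins.
 */
#include <stdint.h>

#define N_SLOTS 5               /* analogous to N_RTT_BASE */
#define SAMPLES_PER_SLOT 20     /* stand-in for rtt_samples_per_slot */

static uint32_t rtt_slots[N_SLOTS];
static int rtt_sample_count;

static void
push_rtt(uint32_t rtt)
{
        int i;

        if (++rtt_sample_count >= SAMPLES_PER_SLOT) {
                /* retire the current slot, start a new one with this sample */
                for (i = N_SLOTS - 1; i > 0; --i)
                        rtt_slots[i] = rtt_slots[i - 1];
                rtt_slots[0] = rtt;
                rtt_sample_count = 0;
        } else if (rtt_slots[0] == 0 || rtt < rtt_slots[0]) {
                rtt_slots[0] = rtt;     /* keep the slot minimum */
        }
}

static uint32_t
min_rtt(void)
{
        uint32_t base = 0;
        int i;

        for (i = 0; i < N_SLOTS; i++)
                if (rtt_slots[i] != 0 && (base == 0 || rtt_slots[i] < base))
                        base = rtt_slots[i];
        return (base);
}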
3672
3673 /*
3674 * Collect new round-trip time estimate
3675 * and update averages and current timeout.
3676 */
3677 static void
3678 tcp_xmit_timer(tp, rtt)
3679 register struct tcpcb *tp;
3680 int rtt;
3681 {
3682 register int delta;
3683
3684 tcpstat.tcps_rttupdated++;
3685 tp->t_rttupdated++;
3686
3687 if (rtt > 0) {
3688 tp->t_rttcur = rtt;
3689 update_base_rtt(tp, rtt);
3690 }
3691
3692 if (tp->t_srtt != 0) {
3693 /*
3694 * srtt is stored as fixed point with 5 bits after the
3695 * binary point (i.e., scaled by 32). The following magic
3696 * is equivalent to the smoothing algorithm in rfc793 with
3697 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
3698 * point).
3699 *
3700 * FreeBSD adjusts rtt to origin 0 by subtracting 1 from the provided
3701 * rtt value. This was required because of the way t_rtttime was
3702 * initialized to 1 before. Since we changed t_rtttime to be based on
3703 * tcp_now, this extra adjustment is not needed.
3704 */
3705 delta = (rtt << TCP_DELTA_SHIFT)
3706 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3707
3708 if ((tp->t_srtt += delta) <= 0)
3709 tp->t_srtt = 1;
3710
3711 /*
3712 * We accumulate a smoothed rtt variance (actually, a
3713 * smoothed mean difference), then set the retransmit
3714 * timer to smoothed rtt + 4 times the smoothed variance.
3715 * rttvar is stored as fixed point with 4 bits after the
3716 * binary point (scaled by 16). The following is
3717 * equivalent to rfc793 smoothing with an alpha of .75
3718 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
3719 * rfc793's wired-in beta.
3720 */
3721 if (delta < 0)
3722 delta = -delta;
3723 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3724 if ((tp->t_rttvar += delta) <= 0)
3725 tp->t_rttvar = 1;
3726 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3727 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3728 } else {
3729 /*
3730 * No rtt measurement yet - use the unsmoothed rtt.
3731 * Set the variance to half the rtt (so our first
3732 * retransmit happens at 3*rtt).
3733 */
3734 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3735 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3736 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3737 }
3738 nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, tp->t_rttvar);
3739 tp->t_rtttime = 0;
3740 tp->t_rxtshift = 0;
3741 tp->rxt_start = 0;
3742
3743 /*
3744 * the retransmit should happen at rtt + 4 * rttvar.
3745 * Because of the way we do the smoothing, srtt and rttvar
3746 * will each average +1/2 tick of bias. When we compute
3747 * the retransmit timer, we want 1/2 tick of rounding and
3748 * 1 extra tick because of +-1/2 tick uncertainty in the
3749 * firing of the timer. The bias will give us exactly the
3750 * 1.5 tick we need. But, because the bias is
3751 * statistical, we have to test that we don't drop below
3752 * the minimum feasible timer (which is 2 ticks).
3753 */
3754 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3755 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
3756 TCP_ADD_REXMTSLOP(tp));
3757
3758 /*
3759 * We received an ack for a packet that wasn't retransmitted;
3760 * it is probably safe to discard any error indications we've
3761 * received recently. This isn't quite right, but close enough
3762 * for now (a route might have failed after we sent a segment,
3763 * and the return path might not be symmetrical).
3764 */
3765 tp->t_softerror = 0;
3766 }
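
/*
 * Worked sketch (plain C, not kernel code) of the fixed-point smoothing
 * above: srtt keeps 5 fraction bits (scaled by 32) and rttvar keeps 4
 * (scaled by 16), so srtt_new = 7/8 srtt + 1/8 rtt and
 * rttvar_new = 3/4 rttvar + 1/4 |rtt - srtt|. The shift constants mirror
 * the TCP_*_SHIFT values used above but are local assumptions here.
 */
#include <stdio.h>

#define RTT_SHIFT       5       /* srtt scaled by 32 */
#define RTTVAR_SHIFT    4       /* rttvar scaled by 16 */
#define DELTA_SHIFT     2       /* extra precision carried in delta */

static void
smooth(int *srtt, int *rttvar, int rtt)
{
        int delta;

        if (*srtt == 0) {
                /* first sample: seed srtt, set the variance to rtt/2 */
                *srtt = rtt << RTT_SHIFT;
                *rttvar = rtt << (RTTVAR_SHIFT - 1);
                return;
        }
        /* srtt += (rtt - srtt) / 8, in scaled arithmetic */
        delta = (rtt << DELTA_SHIFT) - (*srtt >> (RTT_SHIFT - DELTA_SHIFT));
        if ((*srtt += delta) <= 0)
                *srtt = 1;
        /* rttvar = 3/4 rttvar + 1/4 |delta| */
        if (delta < 0)
                delta = -delta;
        delta -= *rttvar >> (RTTVAR_SHIFT - DELTA_SHIFT);
        if ((*rttvar += delta) <= 0)
                *rttvar = 1;
}

int
main(void)
{
        int srtt = 0, rttvar = 0, samples[] = { 100, 110, 90, 105 }, i;

        for (i = 0; i < 4; i++) {
                smooth(&srtt, &rttvar, samples[i]);
                /* real values are srtt/32 and rttvar/16 */
                printf("rtt=%d srtt=%d/32 rttvar=%d/16\n",
                    samples[i], srtt, rttvar);
        }
        return (0);
}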
3767
3768 static inline unsigned int
3769 tcp_maxmtu(struct rtentry *rt)
3770 {
3771 unsigned int maxmtu;
3772
3773 RT_LOCK_ASSERT_HELD(rt);
3774 if (rt->rt_rmx.rmx_mtu == 0)
3775 maxmtu = rt->rt_ifp->if_mtu;
3776 else
3777 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
3778
3779 return (maxmtu);
3780 }
3781
3782 #if INET6
3783 static inline unsigned int
3784 tcp_maxmtu6(struct rtentry *rt)
3785 {
3786 unsigned int maxmtu;
3787
3788 RT_LOCK_ASSERT_HELD(rt);
3789 lck_rw_lock_shared(nd_if_rwlock);
3790 if (rt->rt_rmx.rmx_mtu == 0)
3791 maxmtu = IN6_LINKMTU(rt->rt_ifp);
3792 else
3793 maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
3794 lck_rw_done(nd_if_rwlock);
3795
3796 return (maxmtu);
3797 }
3798 #endif
3799
3800 /*
3801 * Determine a reasonable value for maxseg size.
3802 * If the route is known, check route for mtu.
3803 * If none, use an mss that can be handled on the outgoing
3804 * interface without forcing IP to fragment; if bigger than
3805 * an mbuf cluster (MCLBYTES), round down to the nearest multiple of MCLBYTES
3806 * to utilize large mbufs. If no route is found, route has no mtu,
3807 * or the destination isn't local, use a default, hopefully conservative
3808 * size (usually 512 or the default IP max size, but no more than the mtu
3809 * of the interface), as we can't discover anything about intervening
3810 * gateways or networks. We also initialize the congestion/slow start
3811 * window to be a single segment if the destination isn't local.
3812 * While looking at the routing entry, we also initialize other path-dependent
3813 * parameters from pre-set or cached values in the routing entry.
3814 *
3815 * Also take into account the space needed for options that we
3816 * send regularly. Make maxseg shorter by that amount to assure
3817 * that we can send maxseg amount of data even when the options
3818 * are present. Store the upper limit of the length of options plus
3819 * data in maxopd.
3820 *
3821 * NOTE that this routine is only called when we process an incoming
3822 * segment; for outgoing segments, only tcp_mssopt is called.
3823 *
3824 */
3825 void
3826 tcp_mss(tp, offer, input_ifscope)
3827 struct tcpcb *tp;
3828 int offer;
3829 unsigned int input_ifscope;
3830 {
3831 register struct rtentry *rt;
3832 struct ifnet *ifp;
3833 register int rtt, mss;
3834 u_int32_t bufsize;
3835 struct inpcb *inp;
3836 struct socket *so;
3837 struct rmxp_tao *taop;
3838 int origoffer = offer;
3839 u_int32_t sb_max_corrected;
3840 int isnetlocal = 0;
3841 #if INET6
3842 int isipv6;
3843 int min_protoh;
3844 #endif
3845
3846 inp = tp->t_inpcb;
3847 #if INET6
3848 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3849 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
3850 : sizeof (struct tcpiphdr);
3851 #else
3852 #define min_protoh (sizeof (struct tcpiphdr))
3853 #endif
3854
3855 #if INET6
3856 if (isipv6) {
3857 rt = tcp_rtlookup6(inp, input_ifscope);
3858 if (rt != NULL &&
3859 (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
3860 IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
3861 rt->rt_gateway->sa_family == AF_LINK ||
3862 in6_localaddr(&inp->in6p_faddr))) {
3863 tp->t_flags |= TF_LOCAL;
3864 }
3865 }
3866 else
3867 #endif /* INET6 */
3868 {
3869 rt = tcp_rtlookup(inp, input_ifscope);
3870 if (rt != NULL &&
3871 (rt->rt_gateway->sa_family == AF_LINK ||
3872 rt->rt_ifp->if_flags & IFF_LOOPBACK ||
3873 in_localaddr(inp->inp_faddr))) {
3874 tp->t_flags |= TF_LOCAL;
3875 }
3876 }
3877 isnetlocal = (tp->t_flags & TF_LOCAL);
3878
3879 if (rt == NULL) {
3880 tp->t_maxopd = tp->t_maxseg =
3881 #if INET6
3882 isipv6 ? tcp_v6mssdflt :
3883 #endif /* INET6 */
3884 tcp_mssdflt;
3885 return;
3886 }
3887 ifp = rt->rt_ifp;
3888 /*
3889 * Slower link window correction:
3890 * If a value is specified for slowlink_wsize, use it for PPP links
3891 * believed to be on a serial modem (speed < 128Kbps). Exclude 9600bps, as
3892 * it is the default value advertised by pseudo-devices over PPP.
3893 */
3894 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
3895 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
3896 tp->t_flags |= TF_SLOWLINK;
3897 }
3898 so = inp->inp_socket;
3899
3900 taop = rmx_taop(rt->rt_rmx);
3901 /*
3902 * Offer == -1 means that we didn't receive the SYN yet;
3903 * use the cached value in that case.
3904 */
3905 if (offer == -1)
3906 offer = taop->tao_mssopt;
3907 /*
3908 * Offer == 0 means that there was no MSS on the SYN segment;
3909 * in this case we use tcp_mssdflt.
3910 */
3911 if (offer == 0)
3912 offer =
3913 #if INET6
3914 isipv6 ? tcp_v6mssdflt :
3915 #endif /* INET6 */
3916 tcp_mssdflt;
3917 else {
3918 /*
3919 * Prevent DoS attack with too small MSS. Round up
3920 * to at least minmss.
3921 */
3922 offer = max(offer, tcp_minmss);
3923 /*
3924 * Sanity check: make sure that maxopd will be large
3925 * enough to allow some data on segments even if all
3926 * the option space is used (40 bytes). Otherwise
3927 * funny things may happen in tcp_output.
3928 */
3929 offer = max(offer, 64);
3930 }
3931 taop->tao_mssopt = offer;
3932
3933 /*
3934 * While we're here, check if there's an initial rtt
3935 * or rttvar. Convert from the route-table units
3936 * to scaled multiples of the slow timeout timer.
3937 */
3938 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
3939 /*
3940 * XXX the lock bit for RTT indicates that the value
3941 * is also a minimum value; this is subject to time.
3942 */
3943 if (rt->rt_rmx.rmx_locks & RTV_RTT)
3944 tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
3945 else
3946 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
3947 tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
3948 tcpstat.tcps_usedrtt++;
3949 if (rt->rt_rmx.rmx_rttvar) {
3950 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
3951 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
3952 tcpstat.tcps_usedrttvar++;
3953 } else {
3954 /* default variation is +- 1 rtt */
3955 tp->t_rttvar =
3956 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
3957 }
3958 TCPT_RANGESET(tp->t_rxtcur,
3959 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
3960 tp->t_rttmin, TCPTV_REXMTMAX,
3961 TCP_ADD_REXMTSLOP(tp));
3962 }
3963 else
3964 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
3965
3966 #if INET6
3967 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
3968 #else
3969 mss = tcp_maxmtu(rt);
3970 #endif
3971 mss -= min_protoh;
3972
3973 if (rt->rt_rmx.rmx_mtu == 0) {
3974 #if INET6
3975 if (isipv6) {
3976 if (!isnetlocal)
3977 mss = min(mss, tcp_v6mssdflt);
3978 } else
3979 #endif /* INET6 */
3980 if (!isnetlocal)
3981 mss = min(mss, tcp_mssdflt);
3982 }
3983
3984 mss = min(mss, offer);
3985 /*
3986 * maxopd stores the maximum length of data AND options
3987 * in a segment; maxseg is the amount of data in a normal
3988 * segment. We need to store this value (maxopd) apart
3989 * from maxseg, because now every segment carries options
3990 * and thus we normally have somewhat less data in segments.
3991 */
3992 tp->t_maxopd = mss;
3993
3994 /*
3995 * origoffer == -1 indicates that no segments were received yet.
3996 * In this case we just guess.
3997 */
3998 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3999 (origoffer == -1 ||
4000 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
4001 mss -= TCPOLEN_TSTAMP_APPA;
4002 tp->t_maxseg = mss;
4003
4004 /*
4005 * Calculate the corrected value for sb_max; be sure to promote the
4006 * numerator to 64-bit, else it will overflow for large sb_max values.
4007 */
4008 sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
4009
4010 /*
4011 * If there's a pipesize (i.e., loopback), change the socket
4012 * buffer to that size only if it's bigger than the current
4013 * sockbuf size. Make the socket buffers an integral
4014 * number of mss units; if the mss is larger than
4015 * the socket buffer, decrease the mss.
4016 */
4017 #if RTV_SPIPE
4018 bufsize = rt->rt_rmx.rmx_sendpipe;
4019 if (bufsize < so->so_snd.sb_hiwat)
4020 #endif
4021 bufsize = so->so_snd.sb_hiwat;
4022 if (bufsize < mss)
4023 mss = bufsize;
4024 else {
4025 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
4026 if (bufsize > sb_max_corrected)
4027 bufsize = sb_max_corrected;
4028 (void)sbreserve(&so->so_snd, bufsize);
4029 }
4030 tp->t_maxseg = mss;
4031
4032 #if RTV_RPIPE
4033 bufsize = rt->rt_rmx.rmx_recvpipe;
4034 if (bufsize < so->so_rcv.sb_hiwat)
4035 #endif
4036 bufsize = so->so_rcv.sb_hiwat;
4037 if (bufsize > mss) {
4038 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
4039 if (bufsize > sb_max_corrected)
4040 bufsize = sb_max_corrected;
4041 (void)sbreserve(&so->so_rcv, bufsize);
4042 }
4043
4044 set_tcp_stream_priority(so);
4045
4046 if (rt->rt_rmx.rmx_ssthresh) {
4047 /*
4048 * There's some sort of gateway or interface
4049 * buffer limit on the path. Use this to set
4050 * the slow start threshold, but set the
4051 * threshold to no less than 2*mss.
4052 */
4053 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
4054 tcpstat.tcps_usedssthresh++;
4055 } else {
4056 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
4057 }
4058
4059
4060 /*
4061 * Set the slow-start flight size depending on whether this
4062 * is a local network or not.
4063 */
4064 if (CC_ALGO(tp)->cwnd_init != NULL)
4065 CC_ALGO(tp)->cwnd_init(tp);
4066
4067 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, struct tcpcb *, tp,
4068 struct tcphdr *, NULL, int32_t, TCP_CC_CWND_INIT);
4069
4070 /* Route locked during lookup above */
4071 RT_UNLOCK(rt);
4072 }
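
/*
 * Illustrative sketch (plain C, not kernel code) of the socket-buffer
 * rounding above: grow the buffer to a whole number of mss-sized
 * segments, capped at the corrected sb_max. round_bufsize() and its
 * parameters are local stand-ins for the kernel variables.
 */
#include <stdint.h>

static uint64_t
round_bufsize(uint64_t bufsize, uint64_t mss, uint64_t sb_max_corrected)
{
        /* round up to an integral number of mss units */
        bufsize = ((bufsize + mss - 1) / mss) * mss;
        if (bufsize > sb_max_corrected)
                bufsize = sb_max_corrected;
        return (bufsize);
}

/*
 * Example: round_bufsize(65536, 1448, 8 * 1024 * 1024) yields 66608,
 * i.e. 46 segments of 1448 bytes.
 */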
4073
4074 /*
4075 * Determine the MSS option to send on an outgoing SYN.
4076 */
4077 int
4078 tcp_mssopt(tp)
4079 struct tcpcb *tp;
4080 {
4081 struct rtentry *rt;
4082 int mss;
4083 #if INET6
4084 int isipv6;
4085 int min_protoh;
4086 #endif
4087
4088 #if INET6
4089 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
4090 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
4091 : sizeof (struct tcpiphdr);
4092 #else
4093 #define min_protoh (sizeof (struct tcpiphdr))
4094 #endif
4095
4096 #if INET6
4097 if (isipv6)
4098 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
4099 else
4100 #endif /* INET6 */
4101 rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
4102 if (rt == NULL) {
4103 return (
4104 #if INET6
4105 isipv6 ? tcp_v6mssdflt :
4106 #endif /* INET6 */
4107 tcp_mssdflt);
4108 }
4109 /*
4110 * Slower link window correction:
4111 * If a value is specified for slowlink_wsize, use it for PPP links
4112 * believed to be on a serial modem (speed < 128Kbps). Exclude 9600bps, as
4113 * it is the default value advertised by pseudo-devices over PPP.
4114 */
4115 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
4116 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
4117 tp->t_flags |= TF_SLOWLINK;
4118 }
4119
4120 #if INET6
4121 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
4122 #else
4123 mss = tcp_maxmtu(rt);
4124 #endif
4125 /* Route locked during lookup above */
4126 RT_UNLOCK(rt);
4127 return (mss - min_protoh);
4128 }
4129
4130 /*
4131 * When a partial ack arrives, force the retransmission of the
4132 * next unacknowledged segment. Do not clear tp->t_dupacks.
4133 * By setting snd_nxt to th_ack, this forces the retransmission timer
4134 * to be started again.
4135 */
4136 static void
4137 tcp_newreno_partial_ack(tp, th)
4138 struct tcpcb *tp;
4139 struct tcphdr *th;
4140 {
4141 tcp_seq onxt = tp->snd_nxt;
4142 u_int32_t ocwnd = tp->snd_cwnd;
4143 tp->t_timer[TCPT_REXMT] = 0;
4144 tp->t_rtttime = 0;
4145 tp->snd_nxt = th->th_ack;
4146 /*
4147 * Set snd_cwnd to one segment beyond acknowledged offset
4148 * (tp->snd_una has not yet been updated when this function
4149 * is called)
4150 */
4151 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
4152 tp->t_flags |= TF_ACKNOW;
4153 (void) tcp_output(tp);
4154 tp->snd_cwnd = ocwnd;
4155 if (SEQ_GT(onxt, tp->snd_nxt))
4156 tp->snd_nxt = onxt;
4157 /*
4158 * Partial window deflation. Relies on fact that tp->snd_una
4159 * not updated yet.
4160 */
4161 if (tp->snd_cwnd > th->th_ack - tp->snd_una)
4162 tp->snd_cwnd -= th->th_ack - tp->snd_una;
4163 else
4164 tp->snd_cwnd = 0;
4165 tp->snd_cwnd += tp->t_maxseg;
4166
4167 }
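
/*
 * Arithmetic sketch (plain C, not kernel code) of the partial window
 * deflation above, with sequence numbers reduced to plain integers:
 * cwnd shrinks by the bytes the partial ack covered, then one mss is
 * added back so a single new segment can go out. deflate_cwnd() is a
 * local stand-in.
 */
static unsigned int
deflate_cwnd(unsigned int cwnd, unsigned int acked, unsigned int mss)
{
        if (cwnd > acked)
                cwnd -= acked;          /* deflate by the newly acked bytes */
        else
                cwnd = 0;
        return (cwnd + mss);            /* allow one new segment out */
}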
4168
4169 /*
4170 * Drop a random TCP connection that hasn't been serviced yet and
4171 * is eligible for discard. There is a one in qlen chance that
4172 * we will return a null, saying that there are no droppable
4173 * requests. In this case, the protocol-specific code should drop
4174 * the new request. This ensures fairness.
4175 *
4176 * The listening TCP socket "head" must be locked
4177 */
4178 static int
4179 tcp_dropdropablreq(struct socket *head)
4180 {
4181 struct socket *so, *sonext;
4182 unsigned int i, j, qlen;
4183 static int rnd;
4184 static struct timeval old_runtime;
4185 static unsigned int cur_cnt, old_cnt;
4186 struct timeval tv;
4187 struct inpcb *inp = NULL;
4188 struct tcpcb *tp;
4189
4190 if ((head->so_options & SO_ACCEPTCONN) == 0)
4191 return 0;
4192
4193 so = TAILQ_FIRST(&head->so_incomp);
4194 if (!so)
4195 return 0;
4196
4197 microtime(&tv);
4198 if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
4199 old_runtime = tv;
4200 old_cnt = cur_cnt / i;
4201 cur_cnt = 0;
4202 }
4203
4204
4205 qlen = head->so_incqlen;
4206 if (++cur_cnt > qlen || old_cnt > qlen) {
4207 rnd = (314159 * rnd + 66329) & 0xffff;
4208 j = ((qlen + 1) * rnd) >> 16;
4209
4210 while (j-- && so)
4211 so = TAILQ_NEXT(so, so_list);
4212 }
4213 /* Find a connection that is not already closing (or being served) */
4214 while (so) {
4215 inp = (struct inpcb *)so->so_pcb;
4216
4217 sonext = TAILQ_NEXT(so, so_list);
4218
4219 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
4220 /* Avoid the issue of a socket being accepted by one input thread
4221 * and being dropped by another input thread.
4222 * If we can't get a hold on this mutex, then grab the next socket in line.
4223 */
4224 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
4225 so->so_usecount++;
4226 if ((so->so_usecount == 2) &&
4227 (so->so_state & SS_INCOMP) != 0 &&
4228 (so->so_flags & SOF_INCOMP_INPROGRESS) == 0)
4229 break;
4230 else {/* don't use if being accepted or used in any other way */
4231 in_pcb_checkstate(inp, WNT_RELEASE, 1);
4232 tcp_unlock(so, 1, 0);
4233 }
4234 }
4235 else {
4236 /* do not try to lock the inp in in_pcb_checkstate
4237 * because the lock is already held in some other thread.
4238 * Only drop the inp_wntcnt reference.
4239 */
4240 in_pcb_checkstate(inp, WNT_RELEASE, 1);
4241 }
4242 }
4243 so = sonext;
4244
4245 }
4246 if (!so)
4247 return 0;
4248
4249 /* Make sure the socket is still in the right state to be discarded */
4250
4251 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
4252 tcp_unlock(so, 1, 0);
4253 return 0;
4254 }
4255
4256 if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
4257 /* do not discard: that socket is being accepted */
4258 tcp_unlock(so, 1, 0);
4259 return 0;
4260 }
4261
4262 TAILQ_REMOVE(&head->so_incomp, so, so_list);
4263 tcp_unlock(head, 0, 0);
4264
4265 lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
4266 tp = sototcpcb(so);
4267 so->so_flags |= SOF_OVERFLOW;
4268 so->so_head = NULL;
4269
4270 tcp_close(tp);
4271 tp->t_unacksegs = 0;
4272
4273 if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
4274 /* Someone has a wantcnt on this pcb. Since WNT_ACQUIRE
4275 * doesn't require a lock, it could have happened while
4276 * we were holding the lock. This pcb will have to
4277 * be garbage collected later.
4278 * Release the reference held for the so_incomp queue.
4279 */
4280 so->so_usecount--;
4281
4282 tcp_unlock(so, 1, 0);
4283 } else {
4284 /* Unlock this socket and leave the reference on. We need to
4285 * acquire the pcbinfo lock in order to fully dispose of it.
4286 */
4287 tcp_unlock(so, 0, 0);
4288
4289 lck_rw_lock_exclusive(tcbinfo.mtx);
4290
4291 tcp_lock(so, 0, 0);
4292
4293 /* Release the reference held for so_incomp queue */
4294 so->so_usecount--;
4295
4296 if (so->so_usecount != 1 ||
4297 (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING)) {
4298 /* There is an extra wantcount or usecount that must
4299 * have been added when the socket was unlocked. This
4300 * socket will have to be garbage collected later
4301 */
4302 tcp_unlock(so, 1, 0);
4303 } else {
4304
4305 /* Drop the reference held for this function */
4306 so->so_usecount--;
4307
4308 in_pcbdispose(inp);
4309 }
4310 lck_rw_done(tcbinfo.mtx);
4311 }
4312 tcpstat.tcps_drops++;
4313
4314 tcp_lock(head, 0, 0);
4315 head->so_incqlen--;
4316 head->so_qlen--;
4317 return(1);
4318 }
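
/*
 * Illustrative sketch (plain C, not kernel code) of the victim
 * selection above: a 16-bit linear congruential step, then a
 * fixed-point multiply maps the result onto [0, qlen]. The constants
 * are the ones used in tcp_dropdropablreq(); pick_index() is a local
 * stand-in.
 */
static unsigned int
pick_index(unsigned int qlen, unsigned int *rnd)
{
        *rnd = (314159 * (*rnd) + 66329) & 0xffff;      /* 16-bit LCG step */
        return (((qlen + 1) * (*rnd)) >> 16);           /* scale into [0, qlen] */
}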
4319
4320 /* Set background congestion control on a socket */
4321 void
4322 tcp_set_background_cc(struct socket *so)
4323 {
4324 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
4325 }
4326
4327 /* Set foreground congestion control on a socket */
4328 void
4329 tcp_set_foreground_cc(struct socket *so)
4330 {
4331 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
4332 }
4333
4334 static void
4335 tcp_set_new_cc(struct socket *so, uint16_t cc_index)
4336 {
4337 struct inpcb *inp = sotoinpcb(so);
4338 struct tcpcb *tp = intotcpcb(inp);
4339 uint16_t old_cc_index = 0;
4340 if (tp->tcp_cc_index != cc_index) {
4341
4342 old_cc_index = tp->tcp_cc_index;
4343
4344 if (CC_ALGO(tp)->cleanup != NULL)
4345 CC_ALGO(tp)->cleanup(tp);
4346 tp->tcp_cc_index = cc_index;
4347
4348 /* Decide if the connection is just starting or if
4349 * we have sent some packets on it.
4350 */
4351 if (tp->snd_nxt > tp->iss) {
4352 /* Already sent some packets */
4353 if (CC_ALGO(tp)->switch_to != NULL)
4354 CC_ALGO(tp)->switch_to(tp, old_cc_index);
4355 } else {
4356 if (CC_ALGO(tp)->init != NULL)
4357 CC_ALGO(tp)->init(tp);
4358 }
4359 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
4360 struct tcpcb *, tp, struct tcphdr *, NULL,
4361 int32_t, TCP_CC_CHANGE_ALGO);
4362 }
4363 }
4364
4365 static int
4366 tcp_getstat SYSCTL_HANDLER_ARGS
4367 {
4368 #pragma unused(oidp, arg1, arg2)
4369
4370 int error;
4371
4372 if (req->oldptr == 0) {
4373 req->oldlen = (size_t)sizeof(struct tcpstat);
4374 }
4375
4376 error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
4377
4378 return (error);
4379
4380 }
4381
4382 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
4383 tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
4384
4385 static int
4386 sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
4387 {
4388 #pragma unused(arg1, arg2)
4389
4390 int error, val = tcprexmtthresh;
4391
4392 error = sysctl_handle_int(oidp, &val, 0, req);
4393 if (error || !req->newptr)
4394 return (error);
4395
4396 /*
4397 * Constrain the number of duplicate ACKs
4398 * to consider for TCP fast retransmit
4399 * to either 2 or 3
4400 */
4401
4402 if (val < 2 || val > 3)
4403 return (EINVAL);
4404
4405 tcprexmtthresh = val;
4406
4407 return (0);
4408 }
4409
4410 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
4411 &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit");