bsd/netinet/tcp_input.c

   1 /*
   2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/kernel.h>
  73 #include <sys/sysctl.h>
  74 #include <sys/malloc.h>
  75 #include <sys/mbuf.h>
  76 #include <sys/proc.h>           /* for proc0 declaration */
  77 #include <sys/protosw.h>
  78 #include <sys/socket.h>
  79 #include <sys/socketvar.h>
  80 #include <sys/syslog.h>
  81 #include <sys/mcache.h>
  82 #include <sys/kasl.h>
  83 #include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */
  84
  85 #include <machine/endian.h>
  86
  87 #include <net/if.h>
  88 #include <net/if_types.h>
  89 #include <net/route.h>
  90 #include <net/ntstat.h>
  91 #include <net/dlil.h>
  92
  93 #include <netinet/in.h>
  94 #include <netinet/in_systm.h>
  95 #include <netinet/ip.h>
  96 #include <netinet/ip_icmp.h>    /* for ICMP_BANDLIM             */
  97 #include <netinet/in_var.h>
  98 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM     */
  99 #include <netinet/in_pcb.h>
 100 #include <netinet/ip_var.h>
 101 #include <mach/sdt.h>
 102 #if INET6
 103 #include <netinet/ip6.h>
 104 #include <netinet/icmp6.h>
 105 #include <netinet6/nd6.h>
 106 #include <netinet6/ip6_var.h>
 107 #include <netinet6/in6_pcb.h>
 108 #endif
 109 #include <netinet/tcp.h>
 110 #include <netinet/tcp_cache.h>
 111 #include <netinet/tcp_fsm.h>
 112 #include <netinet/tcp_seq.h>
 113 #include <netinet/tcp_timer.h>
 114 #include <netinet/tcp_var.h>
 115 #include <netinet/tcp_cc.h>
 116 #include <dev/random/randomdev.h>
 117 #include <kern/zalloc.h>
 118 #if INET6
 119 #include <netinet6/tcp6_var.h>
 120 #endif
 121 #include <netinet/tcpip.h>
 122 #if TCPDEBUG
 123 #include <netinet/tcp_debug.h>
 124 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
 125 struct tcphdr tcp_savetcp;
 126 #endif /* TCPDEBUG */
 127
 128 #if IPSEC
 129 #include <netinet6/ipsec.h>
 130 #if INET6
 131 #include <netinet6/ipsec6.h>
 132 #endif
 133 #include <netkey/key.h>
 134 #endif /*IPSEC*/
 135
 136 #if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
 137 #include <security/mac_framework.h>
 138 #endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
 139
 140 #include <sys/kdebug.h>
 141 #include <netinet/lro_ext.h>
 142 #if MPTCP
 143 #include <netinet/mptcp_var.h>
 144 #include <netinet/mptcp.h>
 145 #include <netinet/mptcp_opt.h>
 146 #endif /* MPTCP */
 147
 148 #include <corecrypto/ccaes.h>
 149
/*
 * Trace code identifiers used by this file, each built with NETDBG_CODE()
 * on the DBG_NETTCP class.
 * NOTE(review): presumably consumed by kdebug tracing calls
 * (<sys/kdebug.h> is included by this file) -- confirm at the use sites.
 */
#define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 0)
#define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 2)
#define DBG_FNC_TCP_INPUT       NETDBG_CODE(DBG_NETTCP, (3 << 8))
#define DBG_FNC_TCP_NEWCONN     NETDBG_CODE(DBG_NETTCP, (7 << 8))
 154
/*
 * Global TCP input state and tunables.
 *
 * Each SYSCTL_INT() below registers the variable whose address it is given
 * under the _net_inet_tcp parent (or _net_inet_tcp_reass for the reassembly
 * node), using the quoted text as its description.
 */
tcp_cc  tcp_ccgen;

/* TCP statistics counters; bumped in this file (e.g. tcps_rcvmemdrop). */
struct  tcpstat tcpstat;

/*
 * Consulted by log_in_vain_log(); when IPFIREWALL is set, the values 3 and 4
 * select the ipfw logging and stealth-statistics paths respectively.
 */
static int log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0,
    "Log all incoming TCP connections");

static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole,
    CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0,
    "Do not send RST when dropping refused connections");

/* NOTE(review): the meaning of the individual values (default 3) is not visible here. */
int tcp_delack_enabled = 3;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_delack_enabled, 0,
    "Delay ACK to try and piggyback it onto a data packet");

int tcp_lq_overflow = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_lq_overflow, 0,
    "Listen Queue Overflow");

int tcp_recv_bg = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_recv_bg, 0, "Receive background");

#if TCP_DROP_SYNFIN
static int drop_synfin = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin,
    CTLFLAG_RW | CTLFLAG_LOCKED, &drop_synfin, 0,
    "Drop TCP packets with SYN+FIN set");
#endif

/* Parent node for the reassembly-queue sysctls registered below. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
    "TCP Segment Reassembly Queue");

/*
 * Incremented by tcp_reass() each time an out-of-order segment is dropped
 * because the per-connection reassembly queue limit was reached.
 * Exported with CTLFLAG_RD.
 */
static int tcp_reass_overflows = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0,
    "Global number of TCP Segment Reassembly Queue Overflows");


__private_extern__ int slowlink_wsize = 8192;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &slowlink_wsize, 0, "Maximum advertised window size for slowlink");

int maxseg_unacked = 8;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked,
    CTLFLAG_RW | CTLFLAG_LOCKED, &maxseg_unacked, 0,
    "Maximum number of outstanding segments left unacked");

/* Registered with an empty description string. */
int tcp_do_rfc3465 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_do_rfc3465, 0, "");

int tcp_do_rfc3465_lim2 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465_lim2, 0,
    "Appropriate bytes counting w/ L=2*SMSS");
 217
int rtt_samples_per_slot = 20;

/*
 * Inter-arrival jitter (IAJ) tunables.
 * tcp_allowed_iaj is the floor compute_iaj_meat() applies to the allowed
 * jitter; tcp_acc_iaj_high_thresh is one of the two terms of MAX_ACC_IAJ.
 */
int tcp_allowed_iaj = ALLOWED_IAJ;
int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
u_int32_t tcp_autorcvbuf_inc_shift = 3;
/*
 * NOTE(review): "jiter" in the description below is a typo for "jitter";
 * it is a runtime string and is deliberately left unchanged here.
 */
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_allowed_iaj, 0,
    "Allowed inter-packet arrival jiter");
/* The next two knobs are only registered on DEVELOPMENT or DEBUG builds. */
#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_acc_iaj_high_thresh, 0,
    "Used in calculating maximum accumulated IAJ");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_inc_shift, 0,
    "Shift for increment in receive socket buffer size");
#endif /* (DEVELOPMENT || DEBUG) */

u_int32_t tcp_do_autorcvbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_autorcvbuf, 0,
    "Enable automatic socket buffer tuning");

/*
 * Besides its sysctl meaning, tcp_reass() uses (tcp_autorcvbuf_max >> 10)
 * as the upper bound of its reassembly queue limit.
 */
u_int32_t tcp_autorcvbuf_max = 512 * 1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max, 0,
    "Maximum receive socket buffer size");

/* Software LRO (large receive offload) knobs. */
int sw_lro = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
        &sw_lro, 0, "Used to coalesce TCP packets");

int lrodebug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg,
    CTLFLAG_RW | CTLFLAG_LOCKED, &lrodebug, 0,
    "Used to debug SW LRO");

int lro_start = 4;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt,
    CTLFLAG_RW | CTLFLAG_LOCKED, &lro_start, 0,
    "Segments for starting LRO computed as power of 2");
 259
extern int tcp_do_autosendbuf;

/*
 * Feature switches, all enabled by default. They are only exposed through
 * sysctl on DEVELOPMENT or DEBUG builds (see the registrations below).
 */
int limited_txmt = 1;
int early_rexmt = 1;
int sack_ackadv = 1;
int tcp_dsack_enable = 1;

#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit,
    CTLFLAG_RW | CTLFLAG_LOCKED, &limited_txmt, 0,
    "Enable limited transmit");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt,
    CTLFLAG_RW | CTLFLAG_LOCKED, &early_rexmt, 0,
    "Enable Early Retransmit");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sack_ackadv, 0,
    "Use SACK with cumulative ack advancement as a dupack");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, dsack_enable,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_dsack_enable, 0,
    "use DSACK TCP option to report duplicate segments");
#endif /* (DEVELOPMENT || DEBUG) */

#if CONFIG_IFEF_NOWINDOWSCALE
/* Registered with an empty description string. */
int tcp_obey_ifef_nowindowscale = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_obey_ifef_nowindowscale, 0, "");
#endif

/* Defined elsewhere; tcp_acc_iaj_react_limit is the second term of MAX_ACC_IAJ. */
extern int tcp_TCPTV_MIN;
extern int tcp_acc_iaj_high;
extern int tcp_acc_iaj_react_limit;

int tcprexmtthresh = 3;

/* TCP clock; the IAJ and bandwidth-measurement code takes time deltas from it. */
u_int32_t tcp_now;
struct timeval tcp_uptime;      /* uptime when tcp_now was last updated */
lck_spin_t *tcp_uptime_lock;    /* Used to synchronize updates to tcp_now */

struct inpcbhead tcb;
#define tcb6    tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
 305
/*
 * Forward declarations for functions defined later in this file.
 */
static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
    struct tcpopt *);
static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int);
static void tcp_pulloutofband(struct socket *,
    struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
    struct ifnet *);
static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
static inline unsigned int tcp_maxmtu(struct rtentry *);
static inline int tcp_stretch_ack_enable(struct tcpcb *tp);
static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);

#if TRAFFIC_MGT
/*
 * Inter-arrival jitter helpers. The parameter names in the update_iaj_state
 * prototype (tlen, reset_size) differ from those in its definition
 * (size, rst_size); this is harmless in C.
 */
static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
    int reset_size);
void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
#endif /* TRAFFIC_MGT */

#if INET6
static inline unsigned int tcp_maxmtu6(struct rtentry *);
#endif

static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
    struct tcpopt *to, u_int32_t tlen);

void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
    u_int32_t newsize, u_int32_t idealsize);
static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to);
/*
 * Constants used for resizing the receive socket buffer
 * when timestamps are not supported.
 * NOTE(review): units are not stated here -- presumably a time quantum in
 * TCP clock ticks and a byte count; confirm at the use sites.
 */
#define TCPTV_RCVNOTS_QUANTUM 100
#define TCP_RCVNOTS_BYTELEVEL 204800

/*
 * Constants used for limiting early retransmits
 * to 10 per minute: a window of 60 * TCP_RETRANSHZ ticks and a cap of
 * 10 early retransmits within it.
 */
#define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
#define TCP_EARLY_REXMT_LIMIT 10
 356
extern void ipfwsyslog( int level, const char *format,...);
extern int fw_verbose;

/*
 * log_in_vain_log(a): emit an "in vain" log record.
 *
 * `a` must be a complete, parenthesized argument list, because it is pasted
 * directly after the callee name (`log a` / `ipfwsyslog a`).
 *
 * With IPFIREWALL:
 *   - log_in_vain == 3 and fw_verbose == 2: the record goes to ipfwsyslog();
 *   - log_in_vain == 4 and fw_verbose == 2: nothing is logged, only
 *     ipfw_stealth_stats_incr_tcp() is called;
 *   - otherwise: the record goes to log().
 * Without IPFIREWALL the record always goes to log().
 *
 * NOTE(review): the macro expands to a bare brace block rather than
 * do { } while (0), so `if (x) log_in_vain_log((...)); else ...` would not
 * compile; keep that in mind at call sites.
 */
#if IPFIREWALL
extern void ipfw_stealth_stats_incr_tcp(void);

#define log_in_vain_log( a ) {            \
        if ( (log_in_vain == 3 ) && (fw_verbose == 2)) {        /* Apple logging, log to ipfw.log */ \
                ipfwsyslog a ;  \
        } else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) {   \
                ipfw_stealth_stats_incr_tcp();                    \
        }                       \
        else log a ;            \
}
#else
#define log_in_vain_log( a ) { log a; }
#endif
 374
/* Receive-side tunables, initialised from their compile-time defaults. */
int tcp_rcvunackwin = TCPTV_UNACKWIN;
int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
        &tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");

/*
 * DELAY_ACK(tp, th) is true only when the CC_ALGO(tp) structure provides a
 * delay_ack hook and that hook returns non-zero for (tp, th).
 * Note that `tp` is evaluated twice.
 */
#define DELAY_ACK(tp, th) \
        (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))

/* Further forward declarations for functions defined later in this file. */
static int tcp_dropdropablreq(struct socket *head);
static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
void tcp_set_background_cc(struct socket *so);
void tcp_set_foreground_cc(struct socket *so);
static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
static void tcp_bwmeas_check(struct tcpcb *tp);
 391
 392 #if TRAFFIC_MGT
/*
 * Reset the inter-arrival jitter (IAJ) accounting of a connection:
 * zero the accumulated jitter and iaj_rwintop, then apply CLEAR_IAJ_STATE().
 * tcp_reass() calls this when a segment reaches it while acc_iaj is non-zero.
 */
void
reset_acc_iaj(struct tcpcb *tp)
{
        tp->acc_iaj = 0;
        tp->iaj_rwintop = 0;
        /* NOTE(review): CLEAR_IAJ_STATE is defined elsewhere; its exact effect is not visible here. */
        CLEAR_IAJ_STATE(tp);
}
 400
 401 static inline void
 402 update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
 403 {
 404         if (rst_size > 0)
 405                 tp->iaj_size = 0;
 406         if (tp->iaj_size == 0 || size >= tp->iaj_size) {
 407                 tp->iaj_size = size;
 408                 tp->iaj_rcv_ts = tcp_now;
 409                 tp->iaj_small_pkt = 0;
 410         }
 411 }
 412
 413 /* For every 32 bit unsigned integer(v), this function will find the
 414  * largest integer n such that (n*n <= v). This takes at most 16 iterations
 415  * irrespective of the value of v and does not involve multiplications.
 416  */
 417 static inline int
 418 isqrt(unsigned int val) {
 419         unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
 420         unsigned int temp, g=0, b=0x8000, bshft=15;
 421         if ( val <= 100) {
 422                 for (g = 0; g <= 10; ++g) {
 423                         if (sqrt_cache[g] > val) {
 424                                 g--;
 425                                 break;
 426                         } else if (sqrt_cache[g] == val) {
 427                                 break;
 428                         }
 429                 }
 430         } else {
 431                 do {
 432                         temp = (((g << 1) + b) << (bshft--));
 433                         if (val >= temp) {
 434                                 g += b;
 435                                 val -= temp;
 436                         }
 437                         b >>= 1;
 438                 } while ( b > 0 && val > 0);
 439         }
 440         return(g);
 441 }
 442
 443 /*
 444 * With LRO, roughly estimate the inter arrival time between
 445 * each sub coalesced packet as an average. Count the delay
 446 * cur_iaj to be the delay between the last packet received
 447 * and the first packet of the LRO stream. Due to round off errors
 448 * cur_iaj may be the same as lro_delay_factor. Averaging has
 449 * round off errors too. lro_delay_factor may be close to 0
 450 * in steady state leading to lower values fed to compute_iaj_meat.
 451 */
 452 void
 453 compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor)
 454 {
 455         uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
 456         uint32_t timediff = 0;
 457
 458         if (cur_iaj >= lro_delay_factor) {
 459                 cur_iaj = cur_iaj - lro_delay_factor;
 460         }
 461
 462         compute_iaj_meat(tp, cur_iaj);
 463
 464         if (nlropkts <= 1)
 465                 return;
 466
 467         nlropkts--;
 468
 469         timediff = lro_delay_factor/nlropkts;
 470
 471         while (nlropkts > 0)
 472         {
 473                 compute_iaj_meat(tp, timediff);
 474                 nlropkts--;
 475         }
 476 }
 477
 478 static
 479 void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
 480 {
 481         /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
 482          * throttle the receive window to a minimum of MIN_IAJ_WIN packets
 483          */
 484 #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
 485 #define IAJ_DIV_SHIFT 4
 486 #define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
 487
 488         uint32_t allowed_iaj, acc_iaj = 0;
 489
 490         uint32_t mean, temp;
 491         int32_t cur_iaj_dev;
 492
 493         cur_iaj_dev = (cur_iaj - tp->avg_iaj);
 494
 495         /* Allow a jitter of "allowed_iaj" milliseconds. Some connections
 496          * may have a constant jitter more than that. We detect this by
 497          * using standard deviation.
 498          */
 499         allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
 500         if (allowed_iaj < tcp_allowed_iaj)
 501                 allowed_iaj = tcp_allowed_iaj;
 502
 503         /* Initially when the connection starts, the senders congestion
 504          * window is small. During this period we avoid throttling a
 505          * connection because we do not have a good starting point for
 506          * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
 507          * the first few packets.
 508          */
 509         if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
 510                 if ( cur_iaj <= allowed_iaj ) {
 511                         if (tp->acc_iaj >= 2)
 512                                 acc_iaj = tp->acc_iaj - 2;
 513                         else
 514                                 acc_iaj = 0;
 515
 516                 } else {
 517                         acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
 518                 }
 519
 520                 if (acc_iaj > MAX_ACC_IAJ)
 521                         acc_iaj = MAX_ACC_IAJ;
 522                 tp->acc_iaj = acc_iaj;
 523         }
 524
 525         /* Compute weighted average where the history has a weight of
 526          * 15 out of 16 and the current value has a weight of 1 out of 16.
 527          * This will make the short-term measurements have more weight.
 528          *
 529          * The addition of 8 will help to round-up the value
 530          * instead of round-down
 531          */
 532         tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
 533                 + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
 534
 535         /* Compute Root-mean-square of deviation where mean is a weighted
 536          * average as described above.
 537          */
 538         temp = tp->std_dev_iaj * tp->std_dev_iaj;
 539         mean = (((temp << IAJ_DIV_SHIFT) - temp)
 540                 + (cur_iaj_dev * cur_iaj_dev)
 541                 + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
 542
 543         tp->std_dev_iaj = isqrt(mean);
 544
 545         DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
 546                 uint32_t, allowed_iaj);
 547
 548         return;
 549 }
 550 #endif /* TRAFFIC_MGT */
 551
 552 /* Check if enough amount of data has been acknowledged since
 553  * bw measurement was started
 554  */
 555 static void
 556 tcp_bwmeas_check(struct tcpcb *tp)
 557 {
 558         int32_t bw_meas_bytes;
 559         uint32_t bw, bytes, elapsed_time;
 560         bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
 561         if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 &&
 562             bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
 563                 bytes = bw_meas_bytes;
 564                 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
 565                 if (elapsed_time > 0) {
 566                         bw = bytes / elapsed_time;
 567                         if ( bw > 0) {
 568                                 if (tp->t_bwmeas->bw_sndbw > 0) {
 569                                         tp->t_bwmeas->bw_sndbw =
 570                                             (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3;
 571                                 } else {
 572                                         tp->t_bwmeas->bw_sndbw = bw;
 573                                 }
 574                         }
 575                 }
 576                 tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
 577         }
 578 }
 579
 580 static int
 581 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
 582     struct ifnet *ifp)
 583 {
 584         struct tseg_qent *q;
 585         struct tseg_qent *p = NULL;
 586         struct tseg_qent *nq;
 587         struct tseg_qent *te = NULL;
 588         struct inpcb *inp = tp->t_inpcb;
 589         struct socket *so = inp->inp_socket;
 590         int flags = 0;
 591         int dowakeup = 0;
 592         struct mbuf *oodata = NULL;
 593         int copy_oodata = 0;
 594         u_int16_t qlimit;
 595         boolean_t cell = IFNET_IS_CELLULAR(ifp);
 596         boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
 597         boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
 598         boolean_t dsack_set = FALSE;
 599
 600         /*
 601          * Call with th==0 after become established to
 602          * force pre-ESTABLISHED data up to user socket.
 603          */
 604         if (th == NULL)
 605                 goto present;
 606
 607         /*
 608          * If the reassembly queue already has entries or if we are going
 609          * to add a new one, then the connection has reached a loss state.
 610          * Reset the stretch-ack algorithm at this point.
 611          */
 612         tcp_reset_stretch_ack(tp);
 613
 614 #if TRAFFIC_MGT
 615         if (tp->acc_iaj > 0)
 616                 reset_acc_iaj(tp);
 617 #endif /* TRAFFIC_MGT */
 618
 619         /*
 620          * Limit the number of segments in the reassembly queue to prevent
 621          * holding on to too many segments (and thus running out of mbufs).
 622          * Make sure to let the missing segment through which caused this
 623          * queue.  Always keep one global queue entry spare to be able to
 624          * process the missing segment.
 625          */
 626         qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
 627             tcp_autorcvbuf_max >> 10);
 628         if (th->th_seq != tp->rcv_nxt &&
 629             (tp->t_reassqlen + 1) >= qlimit) {
 630                 tcp_reass_overflows++;
 631                 tcpstat.tcps_rcvmemdrop++;
 632                 m_freem(m);
 633                 *tlenp = 0;
 634                 return (0);
 635         }
 636
 637         /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
 638         te = (struct tseg_qent *) zalloc(tcp_reass_zone);
 639         if (te == NULL) {
 640                 tcpstat.tcps_rcvmemdrop++;
 641                 m_freem(m);
 642                 return (0);
 643         }
 644         tp->t_reassqlen++;
 645
 646         /*
 647          * Find a segment which begins after this one does.
 648          */
 649         LIST_FOREACH(q, &tp->t_segq, tqe_q) {
 650                 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
 651                         break;
 652                 p = q;
 653         }
 654
 655         /*
 656          * If there is a preceding segment, it may provide some of
 657          * our data already.  If so, drop the data from the incoming
 658          * segment.  If it provides all of our data, drop us.
 659          */
 660         if (p != NULL) {
 661                 int i;
 662                 /* conversion to int (in i) handles seq wraparound */
 663                 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
 664                 if (i > 0) {
 665                         if (TCP_DSACK_ENABLED(tp) && i > 1) {
 666                                 /*
 667                                  * Note duplicate data sequnce numbers
 668                                  * to report in DSACK option
 669                                  */
 670                                 tp->t_dsack_lseq = th->th_seq;
 671                                 tp->t_dsack_rseq = th->th_seq +
 672                                     min(i, *tlenp);
 673
 674                                 /*
 675                                  * Report only the first part of partial/
 676                                  * non-contiguous duplicate sequence space
 677                                  */
 678                                 dsack_set = TRUE;
 679                         }
 680                         if (i >= *tlenp) {
 681                                 tcpstat.tcps_rcvduppack++;
 682                                 tcpstat.tcps_rcvdupbyte += *tlenp;
 683                                 if (nstat_collect) {
 684                                         nstat_route_rx(inp->inp_route.ro_rt,
 685                                             1, *tlenp,
 686                                             NSTAT_RX_FLAG_DUPLICATE);
 687                                         INP_ADD_STAT(inp, cell, wifi, wired,
 688                                             rxpackets, 1);
 689                                         INP_ADD_STAT(inp, cell, wifi, wired,
 690                                             rxbytes, *tlenp);
 691                                         tp->t_stat.rxduplicatebytes += *tlenp;
 692                                 }
 693                                 m_freem(m);
 694                                 zfree(tcp_reass_zone, te);
 695                                 te = NULL;
 696                                 tp->t_reassqlen--;
 697                                 /*
 698                                  * Try to present any queued data
 699                                  * at the left window edge to the user.
 700                                  * This is needed after the 3-WHS
 701                                  * completes.
 702                                  */
 703                                 goto present;
 704                         }
 705                         m_adj(m, i);
 706                         *tlenp -= i;
 707                         th->th_seq += i;
 708                 }
 709         }
 710         tp->t_rcvoopack++;
 711         tcpstat.tcps_rcvoopack++;
 712         tcpstat.tcps_rcvoobyte += *tlenp;
 713         if (nstat_collect) {
 714                 nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
 715                     NSTAT_RX_FLAG_OUT_OF_ORDER);
 716                 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
 717                 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
 718                 tp->t_stat.rxoutoforderbytes += *tlenp;
 719         }
 720
 721         /*
 722          * While we overlap succeeding segments trim them or,
 723          * if they are completely covered, dequeue them.
 724          */
 725         while (q) {
 726                 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
 727                 if (i <= 0)
 728                         break;
 729
 730                 /*
 731                  * Report only the first part of partial/non-contiguous
 732                  * duplicate segment in dsack option. The variable
 733                  * dsack_set will be true if a previous entry has some of
 734                  * the duplicate sequence space.
 735                  */
 736                 if (TCP_DSACK_ENABLED(tp) && i > 1 && !dsack_set) {
 737                         if (tp->t_dsack_lseq == 0) {
 738                                 tp->t_dsack_lseq = q->tqe_th->th_seq;
 739                                 tp->t_dsack_rseq =
 740                                     tp->t_dsack_lseq + min(i, q->tqe_len);
 741                         } else {
 742                                 /*
 743                                  * this segment overlaps data in multple
 744                                  * entries in the reassembly queue, move
 745                                  * the right sequence number further.
 746                                  */
 747                                 tp->t_dsack_rseq =
 748                                     tp->t_dsack_rseq + min(i, q->tqe_len);
 749                         }
 750                 }
 751                 if (i < q->tqe_len) {
 752                         q->tqe_th->th_seq += i;
 753                         q->tqe_len -= i;
 754                         m_adj(q->tqe_m, i);
 755                         break;
 756                 }
 757
 758                 nq = LIST_NEXT(q, tqe_q);
 759                 LIST_REMOVE(q, tqe_q);
 760                 m_freem(q->tqe_m);
 761                 zfree(tcp_reass_zone, q);
 762                 tp->t_reassqlen--;
 763                 q = nq;
 764         }
 765
 766         /* Insert the new segment queue entry into place. */
 767         te->tqe_m = m;
 768         te->tqe_th = th;
 769         te->tqe_len = *tlenp;
 770
 771         if (p == NULL) {
 772                 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
 773         } else {
 774                 LIST_INSERT_AFTER(p, te, tqe_q);
 775         }
 776
 777         /*
 778          * New out-of-order data exists, and is pointed to by
 779          * queue entry te. Set copy_oodata to 1 so out-of-order data
 780          * can be copied off to sockbuf after in-order data
 781          * is copied off.
 782          */
 783         if (!(so->so_state & SS_CANTRCVMORE))
 784                 copy_oodata = 1;
 785
 786 present:
 787         /*
 788          * Present data to user, advancing rcv_nxt through
 789          * completed sequence space.
 790          */
 791         if (!TCPS_HAVEESTABLISHED(tp->t_state))
 792                 return (0);
 793         q = LIST_FIRST(&tp->t_segq);
 794         if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
 795                 /* Stop using LRO once out of order packets arrive */
 796                 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
 797                         tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
 798                                 th->th_dport, th->th_sport);
 799                         tp->t_flagsext &= ~TF_LRO_OFFLOADED;
 800                 }
 801
 802                 /*
 803                  * continue processing if out-of-order data
 804                  * can be delivered
 805                  */
 806                 if (q && (so->so_flags & SOF_ENABLE_MSGS))
 807                         goto msg_unordered_delivery;
 808
 809                 return (0);
 810         }
 811
 812         /* lost packet was recovered, so ooo data can be returned */
 813         tcpstat.tcps_recovered_pkts++;
 814
 815         do {
 816                 tp->rcv_nxt += q->tqe_len;
 817                 flags = q->tqe_th->th_flags & TH_FIN;
 818                 nq = LIST_NEXT(q, tqe_q);
 819                 LIST_REMOVE(q, tqe_q);
 820                 if (so->so_state & SS_CANTRCVMORE) {
 821                         m_freem(q->tqe_m);
 822                 } else {
 823                         so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
 824                         if (so->so_flags & SOF_ENABLE_MSGS) {
 825                                 /*
 826                                  * Append the inorder data as a message to the
 827                                  * receive socket buffer. Also check to see if
 828                                  * the data we are about to deliver is the same
 829                                  * data that we wanted to pass up to the user
 830                                  * out of order. If so, reset copy_oodata --
 831                                  * the received data filled a gap, and
 832                                  * is now in order!
 833                                  */
 834                                 if (q == te)
 835                                         copy_oodata = 0;
 836                         }
 837                         if (sbappendstream_rcvdemux(so, q->tqe_m,
 838                             q->tqe_th->th_seq - (tp->irs + 1), 0))
 839                                 dowakeup = 1;
 840                         if (tp->t_flagsext & TF_LRO_OFFLOADED) {
 841                                 tcp_update_lro_seq(tp->rcv_nxt,
 842                                  inp->inp_laddr, inp->inp_faddr,
 843                                  th->th_dport, th->th_sport);
 844                         }
 845                 }
 846                 zfree(tcp_reass_zone, q);
 847                 tp->t_reassqlen--;
 848                 q = nq;
 849         } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
 850
 851 #if INET6
 852         if ((inp->inp_vflag & INP_IPV6) != 0) {
 853
 854                 KERNEL_DEBUG(DBG_LAYER_BEG,
 855                      ((inp->inp_fport << 16) | inp->inp_lport),
 856                      (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
 857                       (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
 858                      0,0,0);
 859         }
 860         else
 861 #endif
 862         {
 863                 KERNEL_DEBUG(DBG_LAYER_BEG,
 864                      ((inp->inp_fport << 16) | inp->inp_lport),
 865                      (((inp->inp_laddr.s_addr & 0xffff) << 16) |
 866                       (inp->inp_faddr.s_addr & 0xffff)),
 867                      0,0,0);
 868         }
 869
 870 msg_unordered_delivery:
 871         /* Deliver out-of-order data as a message */
 872         if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
 873                 /*
 874                  * make a copy of the mbuf to be delivered up to
 875                  * the user, and add it to the sockbuf
 876                  */
 877                 oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
 878                 if (oodata != NULL) {
 879                         if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
 880                                 te->tqe_th->th_seq - (tp->irs + 1), 1)) {
 881                                 dowakeup = 1;
 882                                 tcpstat.tcps_msg_unopkts++;
 883                         } else {
 884                                 tcpstat.tcps_msg_unoappendfail++;
 885                         }
 886                 }
 887         }
 888
 889         if (dowakeup)
 890                 sorwakeup(so); /* done with socket lock held */
 891         return (flags);
 892 }
 893
 894 /*
 895  * Reduce congestion window -- used when ECN is seen or when a tail loss
 896  * probe recovers the last packet.
 897  */
 898 static void
 899 tcp_reduce_congestion_window(
 900         struct tcpcb    *tp)
 901 {
 902         /*
 903          * If the current tcp cc module has
 904          * defined a hook for tasks to run
 905          * before entering FR, call it
 906          */
 907         if (CC_ALGO(tp)->pre_fr != NULL)
 908                 CC_ALGO(tp)->pre_fr(tp);
 909         ENTER_FASTRECOVERY(tp);
 910         if (tp->t_flags & TF_SENTFIN)
 911                 tp->snd_recover = tp->snd_max - 1;
 912         else
 913                 tp->snd_recover = tp->snd_max;
 914         tp->t_timer[TCPT_REXMT] = 0;
 915         tp->t_timer[TCPT_PTO] = 0;
 916         tp->t_rtttime = 0;
 917         if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
 918                 tcp_cc_adjust_nonvalidated_cwnd(tp);
 919         } else {
 920                 tp->snd_cwnd = tp->snd_ssthresh +
 921                     tp->t_maxseg * tcprexmtthresh;
 922         }
 923 }
 924
 925 /*
 926  * This function is called upon reception of data on a socket. It's purpose is
 927  * to handle the adaptive keepalive timers that monitor whether the connection
 928  * is making progress. First the adaptive read-timer, second the TFO probe-timer.
 929  *
 930  * The application wants to get an event if there is a stall during read.
 931  * Set the initial keepalive timeout to be equal to twice RTO.
 932  *
 933  * If the outgoing interface is in marginal conditions, we need to
 934  * enable read probes for that too.
 935  */
 936 static inline void
 937 tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
 938 {
 939         struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
 940
 941         if ((tp->t_adaptive_rtimo > 0 ||
 942             (outifp != NULL &&
 943             (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)))
 944             && tlen > 0 &&
 945             tp->t_state == TCPS_ESTABLISHED) {
 946                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
 947                         (TCP_REXMTVAL(tp) << 1));
 948                 tp->t_flagsext |= TF_DETECT_READSTALL;
 949                 tp->t_rtimo_probes = 0;
 950         }
 951 }
 952
 953 inline void
 954 tcp_keepalive_reset(struct tcpcb *tp)
 955 {
 956         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
 957                 TCP_CONN_KEEPIDLE(tp));
 958         tp->t_flagsext &= ~(TF_DETECT_READSTALL);
 959         tp->t_rtimo_probes = 0;
 960 }
 961
 962 /*
 963  * TCP input routine, follows pages 65-76 of the
 964  * protocol specification dated September, 1981 very closely.
 965  */
 966 #if INET6
 967 int
 968 tcp6_input(struct mbuf **mp, int *offp, int proto)
 969 {
 970 #pragma unused(proto)
 971         register struct mbuf *m = *mp;
 972         uint32_t ia6_flags;
 973         struct ifnet *ifp = m->m_pkthdr.rcvif;
 974
 975         IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
 976
 977         /* Expect 32-bit aligned data pointer on strict-align platforms */
 978         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
 979
 980         /*
 981          * draft-itojun-ipv6-tcp-to-anycast
 982          * better place to put this in?
 983          */
 984         if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
 985                 if (ia6_flags & IN6_IFF_ANYCAST) {
 986                         struct ip6_hdr *ip6;
 987
 988                         ip6 = mtod(m, struct ip6_hdr *);
 989                         icmp6_error(m, ICMP6_DST_UNREACH,
 990                             ICMP6_DST_UNREACH_ADDR,
 991                             (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 992
 993                         IF_TCP_STATINC(ifp, icmp6unreach);
 994
 995                         return (IPPROTO_DONE);
 996                 }
 997         }
 998
 999         tcp_input(m, *offp);
1000         return (IPPROTO_DONE);
1001 }
1002 #endif
1003
1004 /* Depending on the usage of mbuf space in the system, this function
1005  * will return true or false. This is used to determine if a socket
1006  * buffer can take more memory from the system for auto-tuning or not.
1007  */
1008 u_int8_t
1009 tcp_cansbgrow(struct sockbuf *sb)
1010 {
1011         /* Calculate the host level space limit in terms of MSIZE buffers.
1012          * We can use a maximum of half of the available mbuf space for
1013          * socket buffers.
1014          */
1015         u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
1016
1017         /* Calculate per sb limit in terms of bytes. We optimize this limit
1018          * for upto 16 socket buffers.
1019          */
1020
1021         u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
1022
1023         if ((total_sbmb_cnt < mblim) &&
1024                 (sb->sb_hiwat < sbspacelim)) {
1025                 return(1);
1026         } else {
1027                 OSIncrementAtomic64(&sbmb_limreached);
1028         }
1029         return(0);
1030 }
1031
1032 static void
1033 tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
1034         u_int32_t newsize, u_int32_t idealsize)
1035 {
1036
1037         /* newsize should not exceed max */
1038         newsize = min(newsize, tcp_autorcvbuf_max);
1039
1040         /* The receive window scale negotiated at the
1041          * beginning of the connection will also set a
1042          * limit on the socket buffer size
1043          */
1044         newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
1045
1046         /* Set new socket buffer size */
1047         if (newsize > sbrcv->sb_hiwat &&
1048                 (sbreserve(sbrcv, newsize) == 1)) {
1049                 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
1050                         (idealsize != 0) ? idealsize : newsize),
1051                         tcp_autorcvbuf_max);
1052
1053                 /* Again check the limit set by the advertised
1054                  * window scale
1055                  */
1056                 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
1057                         TCP_MAXWIN << tp->rcv_scale);
1058         }
1059 }
1060
1061 /*
1062  * This function is used to grow  a receive socket buffer. It
1063  * will take into account system-level memory usage and the
1064  * bandwidth available on the link to make a decision.
1065  */
1066 static void
1067 tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
1068         struct tcpopt *to, u_int32_t pktlen)
1069 {
1070         struct socket *so = sbrcv->sb_so;
1071
1072         /*
1073          * Do not grow the receive socket buffer if
1074          * - auto resizing is disabled, globally or on this socket
1075          * - the high water mark already reached the maximum
1076          * - the stream is in background and receive side is being
1077          * throttled
1078          * - if there are segments in reassembly queue indicating loss,
1079          * do not need to increase recv window during recovery as more
1080          * data is not going to be sent. A duplicate ack sent during
1081          * recovery should not change the receive window
1082          */
1083         if (tcp_do_autorcvbuf == 0 ||
1084                 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1085                 tcp_cansbgrow(sbrcv) == 0 ||
1086                 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
1087                 (tp->t_flagsext & TF_RECV_THROTTLE) ||
1088                 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
1089                 !LIST_EMPTY(&tp->t_segq)) {
1090                 /* Can not resize the socket buffer, just return */
1091                 goto out;
1092         }
1093
1094         if (TSTMP_GT(tcp_now,
1095                 tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
1096                 /* If there has been an idle period in the
1097                  * connection, just restart the measurement
1098                  */
1099                 goto out;
1100         }
1101
1102         if (!TSTMP_SUPPORTED(tp)) {
1103                 /*
1104                  * Timestamp option is not supported on this connection.
1105                  * If the connection reached a state to indicate that
1106                  * the receive socket buffer needs to grow, increase
1107                  * the high water mark.
1108                  */
1109                 if (TSTMP_GEQ(tcp_now,
1110                         tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
1111                         if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
1112                                 tcp_sbrcv_reserve(tp, sbrcv,
1113                                         tcp_autorcvbuf_max, 0);
1114                         }
1115                         goto out;
1116                 } else {
1117                         tp->rfbuf_cnt += pktlen;
1118                         return;
1119                 }
1120         } else if (to->to_tsecr != 0) {
1121                 /*
1122                  * If the timestamp shows that one RTT has
1123                  * completed, we can stop counting the
1124                  * bytes. Here we consider increasing
1125                  * the socket buffer if the bandwidth measured in
1126                  * last rtt, is more than half of sb_hiwat, this will
1127                  * help to scale the buffer according to the bandwidth
1128                  * on the link.
1129                  */
1130                 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
1131                         if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
1132                                 (sbrcv->sb_hiwat >> 1))) {
1133                                 int32_t rcvbuf_inc, min_incr;
1134                                 /*
1135                                  * Increment the receive window by a
1136                                  * multiple of maximum sized segments.
1137                                  * This will prevent a connection from
1138                                  * sending smaller segments on wire if it
1139                                  * is limited by the receive window.
1140                                  *
1141                                  * Set the ideal size based on current
1142                                  * bandwidth measurements. We set the
1143                                  * ideal size on receive socket buffer to
1144                                  * be twice the bandwidth delay product.
1145                                  */
1146                                 rcvbuf_inc = (tp->rfbuf_cnt << 1)
1147                                     - sbrcv->sb_hiwat;
1148
1149                                 /*
1150                                  * Make the increment equal to 8 segments
1151                                  * at least
1152                                  */
1153                                 min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1154                                 if (rcvbuf_inc < min_incr)
1155                                     rcvbuf_inc = min_incr;
1156
1157                                 rcvbuf_inc =
1158                                     (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
1159                                 tcp_sbrcv_reserve(tp, sbrcv,
1160                                         sbrcv->sb_hiwat + rcvbuf_inc,
1161                                         (tp->rfbuf_cnt * 2));
1162                         }
1163                         goto out;
1164                 } else {
1165                         tp->rfbuf_cnt += pktlen;
1166                         return;
1167                 }
1168         }
1169 out:
1170         /* Restart the measurement */
1171         tp->rfbuf_ts = 0;
1172         tp->rfbuf_cnt = 0;
1173         return;
1174 }
1175
1176 /* This function will trim the excess space added to the socket buffer
1177  * to help a slow-reading app. The ideal-size of a socket buffer depends
1178  * on the link bandwidth or it is set by an application and we aim to
1179  * reach that size.
1180  */
1181 void
1182 tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
1183         if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1184                 sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1185                 int32_t trim;
1186                 /* compute the difference between ideal and current sizes */
1187                 u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1188
1189                 /* Compute the maximum advertised window for
1190                  * this connection.
1191                  */
1192                 u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1193
1194                 /* How much can we trim the receive socket buffer?
1195                  * 1. it can not be trimmed beyond the max rcv win advertised
1196                  * 2. if possible, leave 1/16 of bandwidth*delay to
1197                  * avoid closing the win completely
1198                  */
1199                 u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1200
1201                 /* Sometimes leave can be zero, in that case leave at least
1202                  * a few segments worth of space.
1203                  */
1204                 if (leave == 0)
1205                         leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1206
1207                 trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
1208                 trim = imin(trim, (int32_t)diff);
1209
1210                 if (trim > 0)
1211                         sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1212         }
1213 }
1214
1215 /* We may need to trim the send socket buffer size for two reasons:
1216  * 1. if the rtt seen on the connection is climbing up, we do not
1217  * want to fill the buffers any more.
1218  * 2. if the congestion win on the socket backed off, there is no need
1219  * to hold more mbufs for that connection than what the cwnd will allow.
1220  */
1221 void
1222 tcp_sbsnd_trim(struct sockbuf *sbsnd) {
1223         if (tcp_do_autosendbuf == 1 &&
1224                 ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1225                         (SB_AUTOSIZE | SB_TRIM)) &&
1226                 (sbsnd->sb_idealsize > 0) &&
1227                 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1228                 u_int32_t trim = 0;
1229                 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1230                         trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1231                 } else {
1232                         trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1233                 }
1234                 sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1235         }
1236         if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
1237                 sbsnd->sb_flags &= ~(SB_TRIM);
1238 }
1239
1240 /*
1241  * If timestamp option was not negotiated on this connection
1242  * and this connection is on the receiving side of a stream
1243  * then we can not measure the delay on the link accurately.
1244  * Instead of enabling automatic receive socket buffer
1245  * resizing, just give more space to the receive socket buffer.
1246  */
1247 static inline void
1248 tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
1249         struct socket *so = tp->t_inpcb->inp_socket;
1250         u_int32_t newsize = 2 * tcp_recvspace;
1251         struct sockbuf *sbrcv = &so->so_rcv;
1252
1253         if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1254                 (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1255                 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
1256                 tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
1257         }
1258 }
1259
1260 /* A receiver will evaluate the flow of packets on a connection
1261  * to see if it can reduce ack traffic. The receiver will start
1262  * stretching acks if all of the following conditions are met:
1263  * 1. tcp_delack_enabled is set to 3
1264  * 2. If the bytes received in the last 100ms is greater than a threshold
1265  *      defined by maxseg_unacked
1266  * 3. If the connection has not been idle for tcp_maxrcvidle period.
1267  * 4. If the connection has seen enough packets to let the slow-start
1268  *      finish after connection establishment or after some packet loss.
1269  *
1270  * The receiver will stop stretching acks if there is congestion/reordering
1271  * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1272  * timer fires while stretching acks, it means that the packet flow has gone
1273  * below the threshold defined by maxseg_unacked and the receiver will stop
1274  * stretching acks. The receiver gets no indication when slow-start is completed
1275  * or when the connection reaches an idle state. That is why we use
1276  * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1277  * state.
1278  */
1279 static inline int
1280 tcp_stretch_ack_enable(struct tcpcb *tp)
1281 {
1282         if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) &&
1283                 tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
1284                 TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
1285                 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
1286                 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1287                 return(1);
1288         }
1289
1290         return(0);
1291 }
1292
1293 /*
1294  * Reset the state related to stretch-ack algorithm. This will make
1295  * the receiver generate an ack every other packet. The receiver
1296  * will start re-evaluating the rate at which packets come to decide
1297  * if it can benefit by lowering the ack traffic.
1298  */
1299 void
1300 tcp_reset_stretch_ack(struct tcpcb *tp)
1301 {
1302         tp->t_flags &= ~(TF_STRETCHACK);
1303         tp->rcv_by_unackwin = 0;
1304         tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1305
1306         /*
1307          * When there is packet loss or packet re-ordering or CWR due to
1308          * ECN, the sender's congestion window is reduced. In these states,
1309          * generate an ack for every other packet for some time to allow
1310          * the sender's congestion window to grow.
1311          */
1312         tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1313         tp->rcv_waitforss = 0;
1314 }
1315
1316 /*
1317  * The last packet was a retransmission, check if this ack
1318  * indicates that the retransmission was spurious.
1319  *
1320  * If the connection supports timestamps, we could use it to
1321  * detect if the last retransmit was not needed. Otherwise,
1322  * we check if the ACK arrived within RTT/2 window, then it
1323  * was a mistake to do the retransmit in the first place.
1324  *
1325  * This function will return 1 if it is a spurious retransmit,
1326  * 0 otherwise.
1327  */
1328 int
1329 tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
1330         struct tcpopt *to, u_int32_t rxtime)
1331 {
1332         int32_t tdiff, bad_rexmt_win;
1333         bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
1334
1335         /* If the ack has ECN CE bit, then cwnd has to be adjusted */
1336         if (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE))
1337                 return (0);
1338         if (TSTMP_SUPPORTED(tp)) {
1339                 if (rxtime > 0 && (to->to_flags & TOF_TS)
1340                     && to->to_tsecr != 0
1341                     && TSTMP_LT(to->to_tsecr, rxtime))
1342                     return (1);
1343         } else {
1344                 if ((tp->t_rxtshift == 1
1345                     || (tp->t_flagsext & TF_SENT_TLPROBE))
1346                     && rxtime > 0) {
1347                         tdiff = (int32_t)(tcp_now - rxtime);
1348                         if (tdiff < bad_rexmt_win)
1349                                 return(1);
1350                 }
1351         }
1352         return(0);
1353 }
1354
1355
1356 /*
1357  * Restore congestion window state if a spurious timeout
1358  * was detected.
1359  */
1360 static void
1361 tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1362 {
1363         if (TSTMP_SUPPORTED(tp)) {
1364                 u_int32_t fsize, acked;
1365                 fsize = tp->snd_max - th->th_ack;
1366                 acked = BYTES_ACKED(th, tp);
1367
1368                 /*
1369                  * Implement bad retransmit recovery as
1370                  * described in RFC 4015.
1371                  */
1372                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1373
1374                 /* Initialize cwnd to the initial window */
1375                 if (CC_ALGO(tp)->cwnd_init != NULL)
1376                         CC_ALGO(tp)->cwnd_init(tp);
1377
1378                 tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1379
1380         } else {
1381                 tp->snd_cwnd = tp->snd_cwnd_prev;
1382                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1383                 if (tp->t_flags & TF_WASFRECOVERY)
1384                         ENTER_FASTRECOVERY(tp);
1385
1386                 /* Do not use the loss flight size in this case */
1387                 tp->t_lossflightsize = 0;
1388         }
1389         tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
1390         tp->snd_recover = tp->snd_recover_prev;
1391         tp->snd_nxt = tp->snd_max;
1392         tp->t_rxtshift = 0;
1393         tp->t_rxtstart = 0;
1394
1395         /* Fix send socket buffer to reflect the change in cwnd */
1396         tcp_bad_rexmt_fix_sndbuf(tp);
1397
1398         /*
1399          * This RTT might reflect the extra delay induced
1400          * by the network. Skip using this sample for RTO
1401          * calculation and mark the connection so we can
1402          * recompute RTT when the next eligible sample is
1403          * found.
1404          */
1405         tp->t_flagsext |= TF_RECOMPUTE_RTT;
1406         tp->t_badrexmt_time = tcp_now;
1407         tp->t_rtttime = 0;
1408 }
1409
1410 /*
1411  * If the previous packet was sent in retransmission timer, and it was
1412  * not needed, then restore the congestion window to the state before that
1413  * transmission.
1414  *
1415  * If the last packet was sent in tail loss probe timeout, check if that
1416  * recovered the last packet. If so, that will indicate a real loss and
1417  * the congestion window needs to be lowered.
1418  */
1419 static void
1420 tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
1421 {
1422         if (tp->t_rxtshift > 0 &&
1423             tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1424                 ++tcpstat.tcps_sndrexmitbad;
1425                 tcp_bad_rexmt_restore_state(tp, th);
1426                 tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1427         } else if ((tp->t_flagsext & TF_SENT_TLPROBE)
1428             && tp->t_tlphighrxt > 0
1429             && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
1430             && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1431                 /*
1432                  * check DSACK information also to make sure that
1433                  * the TLP was indeed needed
1434                  */
1435                 if (tcp_rxtseg_dsack_for_tlp(tp)) {
1436                         /*
1437                          * received a DSACK to indicate that TLP was
1438                          * not needed
1439                          */
1440                         tcp_rxtseg_clean(tp);
1441                         goto out;
1442                 }
1443
1444                 /*
1445                  * The tail loss probe recovered the last packet and
1446                  * we need to adjust the congestion window to take
1447                  * this loss into account.
1448                  */
1449                 ++tcpstat.tcps_tlp_recoverlastpkt;
1450                 if (!IN_FASTRECOVERY(tp)) {
1451                         tcp_reduce_congestion_window(tp);
1452                         EXIT_FASTRECOVERY(tp);
1453                 }
1454                 tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1455         } else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
1456                 /*
1457                  * All of the retransmitted segments were duplicated, this
1458                  * can be an indication of bad fast retransmit.
1459                  */
1460                 tcpstat.tcps_dsack_badrexmt++;
1461                 tcp_bad_rexmt_restore_state(tp, th);
1462                 tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
1463                 tcp_rxtseg_clean(tp);
1464         }
1465 out:
1466         tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1467         tp->t_tlphighrxt = 0;
1468         tp->t_tlpstart = 0;
1469
1470         /*
1471          * check if the latest ack was for a segment sent during PMTU
1472          * blackhole detection. If the timestamp on the ack is before
1473          * PMTU blackhole detection, then revert the size of the max
1474          * segment to previous size.
1475          */
1476         if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
1477             tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
1478                 if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
1479                     && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
1480                         tcp_pmtud_revert_segment_size(tp);
1481                 }
1482         }
1483         if (tp->t_pmtud_start_ts > 0)
1484                 tp->t_pmtud_start_ts = 0;
1485 }
1486
1487 /*
1488  * Check if early retransmit can be attempted according to RFC 5827.
1489  *
1490  * If packet reordering is detected on a connection, fast recovery will
1491  * be delayed until it is clear that the packet was lost and not reordered.
1492  * But reordering detection is done only when SACK is enabled.
1493  *
1494  * On connections that do not support SACK, there is a limit on the number
1495  * of early retransmits that can be done per minute. This limit is needed
1496  * to make sure that too many packets are not retransmitted when there is
1497  * packet reordering.
1498  */
1499 static void
1500 tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
1501 {
1502         u_int32_t obytes, snd_off;
1503         int32_t snd_len;
1504         struct socket *so = tp->t_inpcb->inp_socket;
1505
1506         if (early_rexmt && (SACK_ENABLED(tp) ||
1507             tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
1508             SEQ_GT(tp->snd_max, tp->snd_una) &&
1509             (tp->t_dupacks == 1 ||
1510             (SACK_ENABLED(tp) &&
1511             !TAILQ_EMPTY(&tp->snd_holes)))) {
1512                 /*
1513                  * If there are only a few outstanding
1514                  * segments on the connection, we might need
1515                  * to lower the retransmit threshold. This
1516                  * will allow us to do Early Retransmit as
1517                  * described in RFC 5827.
1518                  */
1519                 if (SACK_ENABLED(tp) &&
1520                     !TAILQ_EMPTY(&tp->snd_holes)) {
1521                         obytes = (tp->snd_max - tp->snd_fack) +
1522                                 tp->sackhint.sack_bytes_rexmit;
1523                 } else {
1524                         obytes = (tp->snd_max - tp->snd_una);
1525                 }
1526
1527                 /*
1528                  * In order to lower retransmit threshold the
1529                  * following two conditions must be met.
1530                  * 1. the amount of outstanding data is less
1531                  * than 4*SMSS bytes
1532                  * 2. there is no unsent data ready for
1533                  * transmission or the advertised window
1534                  * will limit sending new segments.
1535                  */
1536                 snd_off = tp->snd_max - tp->snd_una;
1537                 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1538                 if (obytes < (tp->t_maxseg << 2) &&
1539                     snd_len <= 0) {
1540                         u_int32_t osegs;
1541
1542                         osegs = obytes / tp->t_maxseg;
1543                         if ((osegs * tp->t_maxseg) < obytes)
1544                                 osegs++;
1545
1546                         /*
1547                          * Since the connection might have already
1548                          * received some dupacks, we add them to
1549                          * to the outstanding segments count to get
1550                          * the correct retransmit threshold.
1551                          *
1552                          * By checking for early retransmit after
1553                          * receiving some duplicate acks when SACK
1554                          * is supported, the connection will
1555                          * enter fast recovery even if multiple
1556                          * segments are lost in the same window.
1557                          */
1558                         osegs += tp->t_dupacks;
1559                         if (osegs < 4) {
1560                                 tp->t_rexmtthresh =
1561                                     ((osegs - 1) > 1) ? (osegs - 1) : 1;
1562                                 tp->t_rexmtthresh =
1563                                     min(tp->t_rexmtthresh, tcprexmtthresh);
1564                                 tp->t_rexmtthresh =
1565                                     max(tp->t_rexmtthresh, tp->t_dupacks);
1566
1567                                 if (tp->t_early_rexmt_count == 0)
1568                                         tp->t_early_rexmt_win = tcp_now;
1569
1570                                 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1571                                         tcpstat.tcps_tlp_recovery++;
1572                                         tcp_ccdbg_trace(tp, th,
1573                                             TCP_CC_TLP_RECOVERY);
1574                                 } else {
1575                                         tcpstat.tcps_early_rexmt++;
1576                                         tp->t_early_rexmt_count++;
1577                                         tcp_ccdbg_trace(tp, th,
1578                                             TCP_CC_EARLY_RETRANSMIT);
1579                                 }
1580                         }
1581                 }
1582         }
1583
1584         /*
1585          * If we ever sent a TLP probe, the acknowledgement will trigger
1586          * early retransmit because the value of snd_fack will be close
1587          * to snd_max. This will take care of adjustments to the
1588          * congestion window. So we can reset TF_SENT_PROBE flag.
1589          */
1590         tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1591         tp->t_tlphighrxt = 0;
1592         tp->t_tlpstart = 0;
1593 }
1594
1595 static boolean_t
1596 tcp_tfo_syn(tp, to)
1597         struct tcpcb *tp;
1598         struct tcpopt *to;
1599 {
1600         u_char out[CCAES_BLOCK_SIZE];
1601         unsigned char len;
1602
1603         if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
1604             !(tcp_fastopen & TCP_FASTOPEN_SERVER))
1605                 return (FALSE);
1606
1607         if ((to->to_flags & TOF_TFOREQ)) {
1608                 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1609
1610                 tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
1611                 tcpstat.tcps_tfo_cookie_req_rcv++;
1612                 return (FALSE);
1613         }
1614
1615         /* Ok, then it must be an offered cookie. We need to check that ... */
1616         tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
1617
1618         len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1619         to->to_tfo++;
1620         if (memcmp(out, to->to_tfo, len)) {
1621                 /* Cookies are different! Let's return and offer a new cookie */
1622                 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1623
1624                 tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
1625                 tcpstat.tcps_tfo_cookie_invalid++;
1626                 return (FALSE);
1627         }
1628
1629         if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
1630                 /* Need to decrement again as we just increased it... */
1631                 OSDecrementAtomic(&tcp_tfo_halfcnt);
1632                 return (FALSE);
1633         }
1634
1635         tp->t_tfo_flags |= TFO_F_COOKIE_VALID;
1636
1637         tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
1638         tcpstat.tcps_tfo_syn_data_rcv++;
1639
1640         return (TRUE);
1641 }
1642
1643 static void
1644 tcp_tfo_synack(tp, to)
1645         struct tcpcb *tp;
1646         struct tcpopt *to;
1647 {
1648         if (to->to_flags & TOF_TFO) {
1649                 unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1650
1651                 /*
1652                  * If this happens, things have gone terribly wrong. len should
1653                  * have been checked in tcp_dooptions.
1654                  */
1655                 VERIFY(len <= TFO_COOKIE_LEN_MAX);
1656
1657                 to->to_tfo++;
1658
1659                 tcp_cache_set_cookie(tp, to->to_tfo, len);
1660                 tcp_heuristic_tfo_success(tp);
1661
1662                 tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
1663                 tcpstat.tcps_tfo_cookie_rcv++;
1664         } else {
1665                 /*
1666                  * Thus, no cookie in the response, but we either asked for one
1667                  * or sent SYN+DATA. Now, we need to check whether we had to
1668                  * rexmit the SYN. If that's the case, it's better to start
1669                  * backing of TFO-cookie requests.
1670                  */
1671                 if (tp->t_tfo_flags & TFO_F_SYN_LOSS)
1672                         tcp_heuristic_inc_loss(tp, 1, 0);
1673                 else
1674                         tcp_heuristic_reset_loss(tp, 1, 0);
1675         }
1676 }
1677
1678 static void
1679 tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
1680 {
1681         if (tlen == 0) {
1682                 tp->t_tfo_probe_state = TFO_PROBE_PROBING;
1683
1684                 /*
1685                  * We send the probe out rather quickly (after one RTO). It does not
1686                  * really hurt that much, it's only one additional segment on the wire.
1687                  */
1688                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));
1689         } else {
1690                 /* If SYN/ACK+data, don't probe. We got the data! */
1691                 tcp_heuristic_tfo_rcv_good(tp);
1692         }
1693 }
1694
1695 static void
1696 tcp_tfo_rcv_data(struct tcpcb *tp)
1697 {
1698         /* Transition from PROBING to NONE as data has been received */
1699         if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
1700                 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1701
1702                 /* Data has been received - we are good to go! */
1703                 tcp_heuristic_tfo_rcv_good(tp);
1704         }
1705 }
1706
1707 static void
1708 tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)
1709 {
1710         if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
1711             tp->t_tfo_probes > 0) {
1712                 if (th->th_seq == tp->rcv_nxt) {
1713                         /* No hole, so stop probing */
1714                         tp->t_tfo_probe_state = TFO_PROBE_NONE;
1715                 } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1716                         /* There is a hole! Wait a bit for data... */
1717                         tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
1718                         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1719                             TCP_REXMTVAL(tp));
1720                 }
1721         }
1722 }
1723
1724 void
1725 tcp_input(m, off0)
1726         struct mbuf *m;
1727         int off0;
1728 {
1729         register struct tcphdr *th;
1730         register struct ip *ip = NULL;
1731         register struct inpcb *inp;
1732         u_char *optp = NULL;
1733         int optlen = 0;
1734         int tlen, off;
1735         int drop_hdrlen;
1736         register struct tcpcb *tp = 0;
1737         register int thflags;
1738         struct socket *so = 0;
1739         int todrop, acked, ourfinisacked, needoutput = 0;
1740         struct in_addr laddr;
1741 #if INET6
1742         struct in6_addr laddr6;
1743 #endif
1744         int dropsocket = 0;
1745         int iss = 0, nosock = 0;
1746         u_int32_t tiwin, sack_bytes_acked = 0;
1747         struct tcpopt to;               /* options in this segment */
1748 #if TCPDEBUG
1749         short ostate = 0;
1750 #endif
1751 #if IPFIREWALL
1752         struct sockaddr_in *next_hop = NULL;
1753         struct m_tag *fwd_tag;
1754 #endif /* IPFIREWALL */
1755         u_char ip_ecn = IPTOS_ECN_NOTECT;
1756         unsigned int ifscope;
1757         uint8_t isconnected, isdisconnected;
1758         struct ifnet *ifp = m->m_pkthdr.rcvif;
1759         int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
1760         int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
1761         int turnoff_lro = 0, win;
1762 #if MPTCP
1763         struct mptcb *mp_tp = NULL;
1764 #endif /* MPTCP */
1765         boolean_t cell = IFNET_IS_CELLULAR(ifp);
1766         boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
1767         boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1768         boolean_t recvd_dsack = FALSE;
1769         struct tcp_respond_args tra;
1770
1771 #define TCP_INC_VAR(stat, npkts) do {                   \
1772                 stat += npkts;                          \
1773 } while (0)
1774
1775         TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
1776 #if IPFIREWALL
1777         /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
1778         if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1779                 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1780                     KERNEL_TAG_TYPE_IPFORWARD, NULL);
1781         } else {
1782                 fwd_tag = NULL;
1783         }
1784         if (fwd_tag != NULL) {
1785                 struct ip_fwd_tag *ipfwd_tag =
1786                         (struct ip_fwd_tag *)(fwd_tag+1);
1787
1788                 next_hop = ipfwd_tag->next_hop;
1789                 m_tag_delete(m, fwd_tag);
1790         }
1791 #endif /* IPFIREWALL */
1792
1793 #if INET6
1794         struct ip6_hdr *ip6 = NULL;
1795         int isipv6;
1796 #endif /* INET6 */
1797         int rstreason; /* For badport_bandlim accounting purposes */
1798         struct proc *proc0=current_proc();
1799
1800         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
1801
1802 #if INET6
1803         isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1804 #endif
1805         bzero((char *)&to, sizeof(to));
1806
1807 #if INET6
1808         if (isipv6) {
1809                 /*
1810                  * Expect 32-bit aligned data pointer on
1811                  * strict-align platforms
1812                  */
1813                 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1814
1815                 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1816                 ip6 = mtod(m, struct ip6_hdr *);
1817                 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
1818                 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1819
1820                 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
1821                         goto dropnosock;
1822
1823                 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1824                      (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1825                      th->th_seq, th->th_ack, th->th_win);
1826                 /*
1827                  * Be proactive about unspecified IPv6 address in source.
1828                  * As we use all-zero to indicate unbounded/unconnected pcb,
1829                  * unspecified IPv6 address can be used to confuse us.
1830                  *
1831                  * Note that packets with unspecified IPv6 destination is
1832                  * already dropped in ip6_input.
1833                  */
1834                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1835                         /* XXX stat */
1836                         IF_TCP_STATINC(ifp, unspecv6);
1837                         goto dropnosock;
1838                 }
1839                 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1840                         struct ip6_hdr *, ip6, struct tcpcb *, NULL,
1841                         struct tcphdr *, th);
1842
1843                 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
1844         } else
1845 #endif /* INET6 */
1846         {
1847         /*
1848          * Get IP and TCP header together in first mbuf.
1849          * Note: IP leaves IP header in first mbuf.
1850          */
1851         if (off0 > sizeof (struct ip)) {
1852                 ip_stripoptions(m, (struct mbuf *)0);
1853                 off0 = sizeof(struct ip);
1854         }
1855         if (m->m_len < sizeof (struct tcpiphdr)) {
1856                 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
1857                         tcpstat.tcps_rcvshort++;
1858                         return;
1859                 }
1860         }
1861
1862         /* Expect 32-bit aligned data pointer on strict-align platforms */
1863         MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1864
1865         ip = mtod(m, struct ip *);
1866         th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1867         tlen = ip->ip_len;
1868
1869         if (tcp_input_checksum(AF_INET, m, th, off0, tlen))
1870                 goto dropnosock;
1871
1872 #if INET6
1873         /* Re-initialization for later version check */
1874         ip->ip_v = IPVERSION;
1875 #endif
1876         ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
1877
1878         DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1879                 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
1880
1881         KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1882                 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1883                   th->th_seq, th->th_ack, th->th_win);
1884
1885         }
1886
1887         /*
1888          * Check that TCP offset makes sense,
1889          * pull out TCP options and adjust length.              XXX
1890          */
1891         off = th->th_off << 2;
1892         if (off < sizeof (struct tcphdr) || off > tlen) {
1893                 tcpstat.tcps_rcvbadoff++;
1894                 IF_TCP_STATINC(ifp, badformat);
1895                 goto dropnosock;
1896         }
1897         tlen -= off;    /* tlen is used instead of ti->ti_len */
1898         if (off > sizeof (struct tcphdr)) {
1899 #if INET6
1900                 if (isipv6) {
1901                         IP6_EXTHDR_CHECK(m, off0, off, return);
1902                         ip6 = mtod(m, struct ip6_hdr *);
1903                         th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1904                 } else
1905 #endif /* INET6 */
1906                 {
1907                         if (m->m_len < sizeof(struct ip) + off) {
1908                                 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1909                                         tcpstat.tcps_rcvshort++;
1910                                         return;
1911                                 }
1912                                 ip = mtod(m, struct ip *);
1913                                 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1914                         }
1915                 }
1916                 optlen = off - sizeof (struct tcphdr);
1917                 optp = (u_char *)(th + 1);
1918                 /*
1919                  * Do quick retrieval of timestamp options ("options
1920                  * prediction?").  If timestamp is the only option and it's
1921                  * formatted as recommended in RFC 1323 appendix A, we
1922                  * quickly get the values now and not bother calling
1923                  * tcp_dooptions(), etc.
1924                  */
1925                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1926                         (optlen > TCPOLEN_TSTAMP_APPA &&
1927                         optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1928                         *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1929                         (th->th_flags & TH_SYN) == 0) {
1930                         to.to_flags |= TOF_TS;
1931                         to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
1932                         to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
1933                         optp = NULL;    /* we've parsed the options */
1934                 }
1935         }
1936         thflags = th->th_flags;
1937
1938 #if TCP_DROP_SYNFIN
1939         /*
1940          * If the drop_synfin option is enabled, drop all packets with
1941          * both the SYN and FIN bits set. This prevents e.g. nmap from
1942          * identifying the TCP/IP stack.
1943          *
1944          * This is a violation of the TCP specification.
1945          */
1946         if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) {
1947                 IF_TCP_STATINC(ifp, synfin);
1948                 goto dropnosock;
1949         }
1950 #endif
1951
1952         /*
1953          * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
1954          * until after ip6_savecontrol() is called and before other functions
1955          * which don't want those proto headers.
1956          * Because ip6_savecontrol() is going to parse the mbuf to
1957          * search for data to be passed up to user-land, it wants mbuf
1958          * parameters to be unchanged.
1959          */
1960         drop_hdrlen = off0 + off;
1961
1962         /* Since this is an entry point for input processing of tcp packets, we
1963          * can update the tcp clock here.
1964          */
1965         calculate_tcp_clock();
1966
1967         /*
1968          * Record the interface where this segment arrived on; this does not
1969          * affect normal data output (for non-detached TCP) as it provides a
1970          * hint about which route and interface to use for sending in the
1971          * absence of a PCB, when scoped routing (and thus source interface
1972          * selection) are enabled.
1973          */
1974         if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL)
1975                 ifscope = IFSCOPE_NONE;
1976         else
1977                 ifscope = m->m_pkthdr.rcvif->if_index;
1978
1979         /*
1980          * Convert TCP protocol specific fields to host format.
1981          */
1982
1983 #if BYTE_ORDER != BIG_ENDIAN
1984         NTOHL(th->th_seq);
1985         NTOHL(th->th_ack);
1986         NTOHS(th->th_win);
1987         NTOHS(th->th_urp);
1988 #endif
1989
1990         /*
1991          * Locate pcb for segment.
1992          */
1993 findpcb:
1994
1995         isconnected = FALSE;
1996         isdisconnected = FALSE;
1997
1998 #if IPFIREWALL_FORWARD
1999         if (next_hop != NULL
2000 #if INET6
2001             && isipv6 == 0 /* IPv6 support is not yet */
2002 #endif /* INET6 */
2003             ) {
2004                 /*
2005                  * Diverted. Pretend to be the destination.
2006                  * already got one like this?
2007                  */
2008                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2009                         ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
2010                 if (!inp) {
2011                         /*
2012                          * No, then it's new. Try find the ambushing socket
2013                          */
2014                         if (!next_hop->sin_port) {
2015                                 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
2016                                     th->th_sport, next_hop->sin_addr,
2017                                     th->th_dport, 1, m->m_pkthdr.rcvif);
2018                         } else {
2019                                 inp = in_pcblookup_hash(&tcbinfo,
2020                                     ip->ip_src, th->th_sport,
2021                                     next_hop->sin_addr,
2022                                     ntohs(next_hop->sin_port), 1,
2023                                     m->m_pkthdr.rcvif);
2024                         }
2025                 }
2026         } else
2027 #endif  /* IPFIREWALL_FORWARD */
2028       {
2029 #if INET6
2030         if (isipv6)
2031                 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
2032                                          &ip6->ip6_dst, th->th_dport, 1,
2033                                          m->m_pkthdr.rcvif);
2034         else
2035 #endif /* INET6 */
2036         inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2037             ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
2038       }
2039
2040         /*
2041          * Use the interface scope information from the PCB for outbound
2042          * segments.  If the PCB isn't present and if scoped routing is
2043          * enabled, tcp_respond will use the scope of the interface where
2044          * the segment arrived on.
2045          */
2046         if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
2047                 ifscope = inp->inp_boundifp->if_index;
2048
2049         /*
2050          * If the state is CLOSED (i.e., TCB does not exist) then
2051          * all data in the incoming segment is discarded.
2052          * If the TCB exists but is in CLOSED state, it is embryonic,
2053          * but should either do a listen or a connect soon.
2054          */
2055         if (inp == NULL) {
2056                 if (log_in_vain) {
2057 #if INET6
2058                         char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
2059 #else /* INET6 */
2060                         char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
2061 #endif /* INET6 */
2062
2063 #if INET6
2064                         if (isipv6) {
2065                                 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
2066                                 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
2067                         } else
2068 #endif
2069                         {
2070                                 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
2071                                 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
2072                         }
2073                         switch (log_in_vain) {
2074                         case 1:
2075                                 if(thflags & TH_SYN)
2076                                         log(LOG_INFO,
2077                                                 "Connection attempt to TCP %s:%d from %s:%d\n",
2078                                                 dbuf, ntohs(th->th_dport),
2079                                                 sbuf,
2080                                                 ntohs(th->th_sport));
2081                                 break;
2082                         case 2:
2083                                 log(LOG_INFO,
2084                                         "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
2085                                         dbuf, ntohs(th->th_dport), sbuf,
2086                                         ntohs(th->th_sport), thflags);
2087                                 break;
2088                         case 3:
2089                         case 4:
2090                                 if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
2091                                         !(m->m_flags & (M_BCAST | M_MCAST)) &&
2092 #if INET6
2093                                         ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
2094                                          (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
2095 #else
2096                                         ip->ip_dst.s_addr != ip->ip_src.s_addr
2097 #endif
2098                                          )
2099                                         log_in_vain_log((LOG_INFO,
2100                                                 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
2101                                                 dbuf, ntohs(th->th_dport),
2102                                                 sbuf,
2103                                                 ntohs(th->th_sport)));
2104                                 break;
2105                         default:
2106                                 break;
2107                         }
2108                 }
2109                 if (blackhole) {
2110                         if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
2111
2112                                 switch (blackhole) {
2113                                 case 1:
2114                                         if (thflags & TH_SYN)
2115                                                 goto dropnosock;
2116                                         break;
2117                                 case 2:
2118                                         goto dropnosock;
2119                                 default:
2120                                         goto dropnosock;
2121                                 }
2122                 }
2123                 rstreason = BANDLIM_RST_CLOSEDPORT;
2124                 IF_TCP_STATINC(ifp, noconnnolist);
2125                 goto dropwithresetnosock;
2126         }
2127         so = inp->inp_socket;
2128         if (so == NULL) {
2129                 /* This case shouldn't happen  as the socket shouldn't be null
2130                  * if inp_state isn't set to INPCB_STATE_DEAD
2131                  * But just in case, we pretend we didn't find the socket if we hit this case
2132                  * as this isn't cause for a panic (the socket might be leaked however)...
2133                  */
2134                 inp = NULL;
2135 #if TEMPDEBUG
2136                 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
2137 #endif
2138                 goto dropnosock;
2139         }
2140
2141         tcp_lock(so, 1, 0);
2142         if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2143                 tcp_unlock(so, 1, (void *)2);
2144                 inp = NULL;     // pretend we didn't find it
2145                 goto dropnosock;
2146         }
2147
2148 #if NECP
2149 #if INET6
2150         if (isipv6) {
2151                 if (!necp_socket_is_allowed_to_send_recv_v6(inp, th->th_dport,
2152                                                             th->th_sport,
2153                                                             &ip6->ip6_dst,
2154                                                             &ip6->ip6_src,
2155                                                             ifp, NULL, NULL)) {
2156                         IF_TCP_STATINC(ifp, badformatipsec);
2157                         goto drop;
2158                 }
2159         } else
2160 #endif
2161         {
2162                 if (!necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport,
2163                                                             th->th_sport,
2164                                                             &ip->ip_dst,
2165                                                             &ip->ip_src,
2166                                                             ifp, NULL, NULL)) {
2167                         IF_TCP_STATINC(ifp, badformatipsec);
2168                         goto drop;
2169                 }
2170         }
2171 #endif /* NECP */
2172
2173         tp = intotcpcb(inp);
2174         if (tp == 0) {
2175                 rstreason = BANDLIM_RST_CLOSEDPORT;
2176                 IF_TCP_STATINC(ifp, noconnlist);
2177                 goto dropwithreset;
2178         }
2179         if (tp->t_state == TCPS_CLOSED)
2180                 goto drop;
2181
2182         /* Unscale the window into a 32-bit value. */
2183         if ((thflags & TH_SYN) == 0)
2184                 tiwin = th->th_win << tp->snd_scale;
2185         else
2186                 tiwin = th->th_win;
2187
2188 #if CONFIG_MACF_NET
2189         if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
2190                 goto drop;
2191 #endif
2192
2193         /* Avoid processing packets while closing a listen socket */
2194         if (tp->t_state == TCPS_LISTEN &&
2195                 (so->so_options & SO_ACCEPTCONN) == 0)
2196                 goto drop;
2197
2198         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
2199 #if TCPDEBUG
2200                 if (so->so_options & SO_DEBUG) {
2201                         ostate = tp->t_state;
2202 #if INET6
2203                         if (isipv6)
2204                                 bcopy((char *)ip6, (char *)tcp_saveipgen,
2205                                       sizeof(*ip6));
2206                         else
2207 #endif /* INET6 */
2208                         bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
2209                         tcp_savetcp = *th;
2210                 }
2211 #endif
2212                 if (so->so_options & SO_ACCEPTCONN) {
2213                     register struct tcpcb *tp0 = tp;
2214                         struct socket *so2;
2215                         struct socket *oso;
2216                         struct sockaddr_storage from;
2217 #if INET6
2218                         struct inpcb *oinp = sotoinpcb(so);
2219 #endif /* INET6 */
2220                         struct ifnet *head_ifscope;
2221                         unsigned int head_nocell, head_recvanyif,
2222                                      head_noexpensive, head_awdl_unrestricted;
2223
2224                         /* Get listener's bound-to-interface, if any */
2225                         head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2226                             inp->inp_boundifp : NULL;
2227                         /* Get listener's no-cellular information, if any */
2228                         head_nocell = INP_NO_CELLULAR(inp);
2229                         /* Get listener's recv-any-interface, if any */
2230                         head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
2231                         /* Get listener's no-expensive information, if any */
2232                         head_noexpensive = INP_NO_EXPENSIVE(inp);
2233                         head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
2234
2235                         /*
2236                          * If the state is LISTEN then ignore segment if it contains an RST.
2237                          * If the segment contains an ACK then it is bad and send a RST.
2238                          * If it does not contain a SYN then it is not interesting; drop it.
2239                          * If it is from this socket, drop it, it must be forged.
2240                          */
2241                         if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
2242                                 IF_TCP_STATINC(ifp, listbadsyn);
2243
2244                                 if (thflags & TH_RST) {
2245                                         goto drop;
2246                                 }
2247                                 if (thflags & TH_ACK) {
2248                                         tp = NULL;
2249                                         tcpstat.tcps_badsyn++;
2250                                         rstreason = BANDLIM_RST_OPENPORT;
2251                                         goto dropwithreset;
2252                                 }
2253
2254                                 /* We come here if there is no SYN set */
2255                                 tcpstat.tcps_badsyn++;
2256                                 goto drop;
2257                         }
2258                         KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
2259                         if (th->th_dport == th->th_sport) {
2260 #if INET6
2261                                 if (isipv6) {
2262                                         if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
2263                                                        &ip6->ip6_src))
2264                                                 goto drop;
2265                                 } else
2266 #endif /* INET6 */
2267                                         if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
2268                                                 goto drop;
2269                         }
2270                         /*
2271                          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
2272                          * in_broadcast() should never return true on a received
2273                          * packet with M_BCAST not set.
2274                          *
2275                          * Packets with a multicast source address should also
2276                          * be discarded.
2277                          */
2278                         if (m->m_flags & (M_BCAST|M_MCAST))
2279                                 goto drop;
2280 #if INET6
2281                         if (isipv6) {
2282                                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2283                                         IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2284                                         goto drop;
2285                         } else
2286 #endif
2287                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2288                                 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2289                                 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2290                                 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2291                                 goto drop;
2292
2293
2294 #if INET6
2295                         /*
2296                          * If deprecated address is forbidden,
2297                          * we do not accept SYN to deprecated interface
2298                          * address to prevent any new inbound connection from
2299                          * getting established.
2300                          * When we do not accept SYN, we send a TCP RST,
2301                          * with deprecated source address (instead of dropping
2302                          * it).  We compromise it as it is much better for peer
2303                          * to send a RST, and RST will be the final packet
2304                          * for the exchange.
2305                          *
2306                          * If we do not forbid deprecated addresses, we accept
2307                          * the SYN packet.  RFC 4862 forbids dropping SYN in
2308                          * this case.
2309                          */
2310                         if (isipv6 && !ip6_use_deprecated) {
2311                                 uint32_t ia6_flags;
2312
2313                                 if (ip6_getdstifaddr_info(m, NULL,
2314                                     &ia6_flags) == 0) {
2315                                         if (ia6_flags & IN6_IFF_DEPRECATED) {
2316                                                 tp = NULL;
2317                                                 rstreason = BANDLIM_RST_OPENPORT;
2318                                                 IF_TCP_STATINC(ifp, deprecate6);
2319                                                 goto dropwithreset;
2320                                         }
2321                                 }
2322                         }
2323 #endif
2324                         if (so->so_filt) {
2325 #if INET6
2326                                 if (isipv6) {
2327                                         struct sockaddr_in6     *sin6 = (struct sockaddr_in6*)&from;
2328
2329                                         sin6->sin6_len = sizeof(*sin6);
2330                                         sin6->sin6_family = AF_INET6;
2331                                         sin6->sin6_port = th->th_sport;
2332                                         sin6->sin6_flowinfo = 0;
2333                                         sin6->sin6_addr = ip6->ip6_src;
2334                                         sin6->sin6_scope_id = 0;
2335                                 }
2336                                 else
2337 #endif
2338                                 {
2339                                         struct sockaddr_in *sin = (struct sockaddr_in*)&from;
2340
2341                                         sin->sin_len = sizeof(*sin);
2342                                         sin->sin_family = AF_INET;
2343                                         sin->sin_port = th->th_sport;
2344                                         sin->sin_addr = ip->ip_src;
2345                                 }
2346                                 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2347                         } else {
2348                                 so2 = sonewconn(so, 0, NULL);
2349                         }
2350                         if (so2 == 0) {
2351                                 tcpstat.tcps_listendrop++;
2352                                 if (tcp_dropdropablreq(so)) {
2353                                         if (so->so_filt)
2354                                                 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2355                                         else
2356                                                 so2 = sonewconn(so, 0, NULL);
2357                                 }
2358                                 if (!so2)
2359                                         goto drop;
2360                         }
2361
2362                         /* Point "inp" and "tp" in tandem to new socket */
2363                         inp = (struct inpcb *)so2->so_pcb;
2364                         tp = intotcpcb(inp);
2365
2366                         oso = so;
2367                         tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
2368
2369                         so = so2;
2370                         tcp_lock(so, 1, 0);
2371                         /*
2372                          * Mark socket as temporary until we're
2373                          * committed to keeping it.  The code at
2374                          * ``drop'' and ``dropwithreset'' check the
2375                          * flag dropsocket to see if the temporary
2376                          * socket created here should be discarded.
2377                          * We mark the socket as discardable until
2378                          * we're committed to it below in TCPS_LISTEN.
2379                          * There are some error conditions in which we
2380                          * have to drop the temporary socket.
2381                          */
2382                         dropsocket++;
2383                         /*
2384                          * Inherit INP_BOUND_IF from listener; testing if
2385                          * head_ifscope is non-NULL is sufficient, since it
2386                          * can only be set to a non-zero value earlier if
2387                          * the listener has such a flag set.
2388                          */
2389                         if (head_ifscope != NULL) {
2390                                 inp->inp_flags |= INP_BOUND_IF;
2391                                 inp->inp_boundifp = head_ifscope;
2392                         } else {
2393                                 inp->inp_flags &= ~INP_BOUND_IF;
2394                         }
2395                         /*
2396                          * Inherit restrictions from listener.
2397                          */
2398                         if (head_nocell)
2399                                 inp_set_nocellular(inp);
2400                         if (head_noexpensive)
2401                                 inp_set_noexpensive(inp);
2402                         if (head_awdl_unrestricted)
2403                                 inp_set_awdl_unrestricted(inp);
2404                         /*
2405                          * Inherit {IN,IN6}_RECV_ANYIF from listener.
2406                          */
2407                         if (head_recvanyif)
2408                                 inp->inp_flags |= INP_RECV_ANYIF;
2409                         else
2410                                 inp->inp_flags &= ~INP_RECV_ANYIF;
2411 #if INET6
2412                         if (isipv6)
2413                                 inp->in6p_laddr = ip6->ip6_dst;
2414                         else {
2415                                 inp->inp_vflag &= ~INP_IPV6;
2416                                 inp->inp_vflag |= INP_IPV4;
2417 #endif /* INET6 */
2418                                 inp->inp_laddr = ip->ip_dst;
2419 #if INET6
2420                         }
2421 #endif /* INET6 */
2422                         inp->inp_lport = th->th_dport;
2423                         if (in_pcbinshash(inp, 0) != 0) {
2424                                 /*
2425                                  * Undo the assignments above if we failed to
2426                                  * put the PCB on the hash lists.
2427                                  */
2428 #if INET6
2429                                 if (isipv6)
2430                                         inp->in6p_laddr = in6addr_any;
2431                                 else
2432 #endif /* INET6 */
2433                                         inp->inp_laddr.s_addr = INADDR_ANY;
2434                                 inp->inp_lport = 0;
2435                                 tcp_lock(oso, 0, 0);    /* release ref on parent */
2436                                 tcp_unlock(oso, 1, 0);
2437                                 goto drop;
2438                         }
2439 #if INET6
2440                         if (isipv6) {
2441                                 /*
2442                                  * Inherit socket options from the listening
2443                                  * socket.
2444                                  * Note that in6p_inputopts are not (even
2445                                  * should not be) copied, since it stores
2446                                  * previously received options and is used to
2447                                  * detect if each new option is different than
2448                                  * the previous one and hence should be passed
2449                                  * to a user.
2450                                  * If we copied in6p_inputopts, a user would
2451                                  * not be able to receive options just after
2452                                  * calling the accept system call.
2453                                  */
2454                                 inp->inp_flags |=
2455                                         oinp->inp_flags & INP_CONTROLOPTS;
2456                                 if (oinp->in6p_outputopts)
2457                                         inp->in6p_outputopts =
2458                                                 ip6_copypktopts(oinp->in6p_outputopts,
2459                                                                 M_NOWAIT);
2460                         } else
2461 #endif /* INET6 */
2462                         {
2463                                 inp->inp_options = ip_srcroute();
2464                                 inp->inp_ip_tos = oinp->inp_ip_tos;
2465                         }
2466                         tcp_lock(oso, 0, 0);
2467 #if IPSEC
2468                         /* copy old policy into new socket's */
2469                         if (sotoinpcb(oso)->inp_sp)
2470                         {
2471                                 int error = 0;
2472                                 /* Is it a security hole here to silently fail to copy the policy? */
2473                                 if (inp->inp_sp != NULL)
2474                                         error = ipsec_init_policy(so, &inp->inp_sp);
2475                                 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
2476                                         printf("tcp_input: could not copy policy\n");
2477                         }
2478 #endif
2479                         /* inherit states from the listener */
2480                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2481                                 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
2482                         tp->t_state = TCPS_LISTEN;
2483                         tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
2484                         tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP|TF_NOTIMEWAIT|TF_FASTOPEN));
2485                         tp->t_keepinit = tp0->t_keepinit;
2486                         tp->t_keepcnt = tp0->t_keepcnt;
2487                         tp->t_keepintvl = tp0->t_keepintvl;
2488                         tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2489                         tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
2490                         tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
2491                         if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0)
2492                                 tp->t_notsent_lowat = tp0->t_notsent_lowat;
2493
2494                         /* now drop the reference on the listener */
2495                         tcp_unlock(oso, 1, 0);
2496
2497                         tcp_set_max_rwinscale(tp, so);
2498
2499                         KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
2500                 }
2501         }
2502         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2503                 LCK_MTX_ASSERT_OWNED);
2504
2505         if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
2506                 /*
2507                  * Evaluate the rate of arrival of packets to see if the
2508                  * receiver can reduce the ack traffic. The algorithm to
2509                  * stretch acks will be enabled if the connection meets
2510                  * certain criteria defined in tcp_stretch_ack_enable function.
2511                  */
2512                 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
2513                         TCP_INC_VAR(tp->rcv_waitforss, nlropkts);
2514                 }
2515                 if (tcp_stretch_ack_enable(tp)) {
2516                         tp->t_flags |= TF_STRETCHACK;
2517                         tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2518                         tp->rcv_waitforss = 0;
2519                 } else {
2520                         tp->t_flags &= ~(TF_STRETCHACK);
2521                 }
2522                 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
2523                         tp->rcv_by_unackwin += (tlen + off);
2524                 } else {
2525                         tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2526                         tp->rcv_by_unackwin = tlen + off;
2527                 }
2528         }
2529
2530         /*
2531          * Keep track of how many bytes were received in the LRO packet
2532          */
2533         if ((pktf_sw_lro_pkt) && (nlropkts > 2))  {
2534                 tp->t_lropktlen += tlen;
2535         }
2536         /*
2537          * Explicit Congestion Notification - Flag that we need to send ECE if
2538          *      + The IP Congestion experienced flag was set.
2539          *      + Socket is in established state
2540          *      + We negotiated ECN in the TCP setup
2541          *      + This isn't a pure ack (tlen > 0)
2542          *      + The data is in the valid window
2543          *
2544          *      TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2545          */
2546         if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2547             TCP_ECN_ENABLED(tp) && tlen > 0 &&
2548             SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2549             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2550                 tp->t_ecn_recv_ce++;
2551                 tcpstat.tcps_ecn_recv_ce++;
2552                 INP_INC_IFNET_STAT(inp, ecn_recv_ce);
2553                 /* Mark this connection as it received CE from network */
2554                 tp->ecn_flags |= TE_RECV_ECN_CE;
2555                 tp->ecn_flags |= TE_SENDECE;
2556         }
2557
2558         /*
2559          * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2560          * bother doing extensive checks for state and whatnot.
2561          */
2562         if (thflags & TH_CWR) {
2563                 tp->ecn_flags &= ~TE_SENDECE;
2564                 tp->t_ecn_recv_cwr++;
2565         }
2566
2567         /*
2568          * If we received an explicit notification of congestion in
2569          * ip tos ecn bits or by the CWR bit in TCP header flags, reset
2570          * the ack-stretching state. We need to handle ECN notification if
2571          * an ECN setup SYN was sent even once.
2572          */
2573         if (tp->t_state == TCPS_ESTABLISHED
2574             && (tp->ecn_flags & TE_SETUPSENT)
2575             && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) {
2576                 tcp_reset_stretch_ack(tp);
2577                 CLEAR_IAJ_STATE(tp);
2578         }
2579
2580         if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2581             !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2582                 tcpstat.tcps_ecn_fallback_ce++;
2583                 tcp_heuristic_ecn_aggressive(tp);
2584                 tp->ecn_flags |= TE_CEHEURI_SET;
2585         }
2586
2587         if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) &&
2588             ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2589                 if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) {
2590                         tp->t_ecn_recv_ce_pkt++;
2591                 } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) {
2592                         tcpstat.tcps_ecn_fallback_ce++;
2593                         tcp_heuristic_ecn_aggressive(tp);
2594                         tp->ecn_flags |= TE_CEHEURI_SET;
2595                         INP_INC_IFNET_STAT(inp,ecn_fallback_ce);
2596                 } else {
2597                         /* We tracked the first ECN_MIN_CE_PROBES segments, we
2598                          * now know that the path is good.
2599                          */
2600                         tp->ecn_flags |= TE_CEHEURI_SET;
2601                 }
2602         }
2603
2604         /*
2605          * Try to determine if we are receiving a packet after a long time.
2606          * Use our own approximation of idletime to roughly measure remote
2607          * end's idle time. Since slowstart is used after an idle period
2608          * we want to avoid doing LRO if the remote end is not up to date
2609          * on initial window support and starts with 1 or 2 packets as its IW.
2610          */
2611          if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) &&
2612                 ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) {
2613                 turnoff_lro = 1;
2614          }
2615
2616         /* Update rcvtime as a new segment was received on the connection */
2617         tp->t_rcvtime = tcp_now;
2618
2619         /*
2620          * Segment received on connection.
2621          * Reset idle time and keep-alive timer.
2622          */
2623         if (TCPS_HAVEESTABLISHED(tp->t_state))
2624                 tcp_keepalive_reset(tp);
2625
2626         /*
2627          * Process options if not in LISTEN state,
2628          * else do it below (after getting remote address).
2629          */
2630         if (tp->t_state != TCPS_LISTEN && optp) {
2631                 tcp_dooptions(tp, optp, optlen, th, &to);
2632 #if MPTCP
2633                 if (mptcp_input_preproc(tp, m, drop_hdrlen) != 0) {
2634                         tp->t_flags |= TF_ACKNOW;
2635                         (void) tcp_output(tp);
2636                         tcp_check_timer_state(tp);
2637                         tcp_unlock(so, 1, 0);
2638                         KERNEL_DEBUG(DBG_FNC_TCP_INPUT |
2639                             DBG_FUNC_END,0,0,0,0,0);
2640                         return;
2641                 }
2642 #endif /* MPTCP */
2643         }
2644         if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2645                 if (!(thflags & TH_ACK) ||
2646                     (SEQ_GT(th->th_ack, tp->iss) &&
2647                     SEQ_LEQ(th->th_ack, tp->snd_max)))
2648                         tcp_finalize_options(tp, &to, ifscope);
2649         }
2650
2651 #if TRAFFIC_MGT
2652         /*
2653          * Compute inter-packet arrival jitter. According to RFC 3550,
2654          * inter-packet arrival jitter is defined as the difference in
2655          * packet spacing at the receiver compared to the sender for a
2656          * pair of packets. When two packets of maximum segment size come
2657          * one after the other with consecutive sequence numbers, we
2658          * consider them as packets sent together at the sender and use
2659          * them as a pair to compute inter-packet arrival jitter. This
2660          * metric indicates the delay induced by the network components due
2661          * to queuing in edge/access routers.
2662          */
2663         if (tp->t_state == TCPS_ESTABLISHED &&
2664             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
2665             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2666             ((to.to_flags & TOF_TS) == 0 ||
2667             TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2668             th->th_seq == tp->rcv_nxt &&
2669             LIST_EMPTY(&tp->t_segq)) {
2670                 int seg_size = tlen;
2671                 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
2672                         TCP_INC_VAR(tp->iaj_pktcnt, nlropkts);
2673                 }
2674
2675                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2676                         seg_size = m->m_pkthdr.lro_pktlen;
2677                 }
2678                 if ( tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2679                         (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
2680                         /*
2681                          * State related to inter-arrival jitter is
2682                          * uninitialized or we are trying to find a good
2683                          * first packet to start computing the metric
2684                          */
2685                         update_iaj_state(tp, seg_size, 0);
2686                 } else {
2687                         if (seg_size == tp->iaj_size) {
2688                                 /*
2689                                  * Compute inter-arrival jitter taking
2690                                  * this packet as the second packet
2691                                  */
2692                                 if (pktf_sw_lro_pkt)
2693                                         compute_iaj(tp, nlropkts,
2694                                             m->m_pkthdr.lro_elapsed);
2695                                 else
2696                                         compute_iaj(tp, 1, 0);
2697                         }
2698                         if (seg_size  < tp->iaj_size) {
2699                                 /*
2700                                  * There is a smaller packet in the stream.
2701                                  * Sometimes the maximum size supported
2702                                  * on a path can change if there is a new
2703                                  * link with smaller MTU. The receiver will
2704                                  * not know about this change. If there
2705                                  * are too many packets smaller than
2706                                  * iaj_size, we try to learn the iaj_size
2707                                  * again.
2708                                  */
2709                                 TCP_INC_VAR(tp->iaj_small_pkt, nlropkts);
2710                                 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
2711                                         update_iaj_state(tp, seg_size, 1);
2712                                 } else {
2713                                         CLEAR_IAJ_STATE(tp);
2714                                 }
2715                         } else {
2716                                 update_iaj_state(tp, seg_size, 0);
2717                         }
2718                 }
2719         } else {
2720                 CLEAR_IAJ_STATE(tp);
2721         }
2722 #endif /* TRAFFIC_MGT */
2723
2724         /*
2725          * Header prediction: check for the two common cases
2726          * of a uni-directional data xfer.  If the packet has
2727          * no control flags, is in-sequence, the window didn't
2728          * change and we're not retransmitting, it's a
2729          * candidate.  If the length is zero and the ack moved
2730          * forward, we're the sender side of the xfer.  Just
2731          * free the data acked & wake any higher level process
2732          * that was blocked waiting for space.  If the length
2733          * is non-zero and the ack didn't move, we're the
2734          * receiver side.  If we're getting packets in-order
2735          * (the reassembly queue is empty), add the data to
2736          * the socket buffer and note that we need a delayed ack.
2737          * Make sure that the hidden state-flags are also off.
2738          * Since we check for TCPS_ESTABLISHED above, it can only
2739          * be TF_NEEDSYN.
2740          */
2741         if (tp->t_state == TCPS_ESTABLISHED &&
2742             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_CWR)) == TH_ACK &&
2743             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2744             ((to.to_flags & TOF_TS) == 0 ||
2745              TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2746             th->th_seq == tp->rcv_nxt &&
2747             tiwin && tiwin == tp->snd_wnd &&
2748             tp->snd_nxt == tp->snd_max) {
2749
2750                 /*
2751                  * If last ACK falls within this segment's sequence numbers,
2752                  * record the timestamp.
2753                  * NOTE that the test is modified according to the latest
2754                  * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2755                  */
2756                 if ((to.to_flags & TOF_TS) != 0 &&
2757                    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2758                         tp->ts_recent_age = tcp_now;
2759                         tp->ts_recent = to.to_tsval;
2760                 }
2761
2762                 if (tlen == 0) {
2763                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
2764                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
2765                             tp->snd_cwnd >= tp->snd_ssthresh &&
2766                             (!IN_FASTRECOVERY(tp) &&
2767                             ((!(SACK_ENABLED(tp)) &&
2768                             tp->t_dupacks < tp->t_rexmtthresh) ||
2769                             (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
2770                             TAILQ_EMPTY(&tp->snd_holes))))) {
2771                                 /*
2772                                  * this is a pure ack for outstanding data.
2773                                  */
2774                                 ++tcpstat.tcps_predack;
2775
2776                                 tcp_bad_rexmt_check(tp, th, &to),
2777
2778                                 /* Recalculate the RTT */
2779                                 tcp_compute_rtt(tp, &to, th);
2780
2781                                 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
2782                                 acked = BYTES_ACKED(th, tp);
2783                                 tcpstat.tcps_rcvackpack++;
2784                                 tcpstat.tcps_rcvackbyte += acked;
2785
2786                                 /*
2787                                  * Handle an ack that is in sequence during
2788                                  * congestion avoidance phase. The
2789                                  * calculations in this function
2790                                  * assume that snd_una is not updated yet.
2791                                  */
2792                                 if (CC_ALGO(tp)->congestion_avd != NULL)
2793                                         CC_ALGO(tp)->congestion_avd(tp, th);
2794                                 tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
2795                                 sbdrop(&so->so_snd, acked);
2796                                 if (so->so_flags & SOF_ENABLE_MSGS) {
2797                                         VERIFY(acked <= so->so_msg_state->msg_serial_bytes);
2798                                         so->so_msg_state->msg_serial_bytes -= acked;
2799                                 }
2800                                 tcp_sbsnd_trim(&so->so_snd);
2801
2802                                 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2803                                     SEQ_LEQ(th->th_ack, tp->snd_recover))
2804                                         tp->snd_recover = th->th_ack - 1;
2805                                 tp->snd_una = th->th_ack;
2806
2807                                 /*
2808                                  * pull snd_wl2 up to prevent seq wrap relative
2809                                  * to th_ack.
2810                                  */
2811                                 tp->snd_wl2 = th->th_ack;
2812
2813                                 if (tp->t_dupacks > 0) {
2814                                         tp->t_dupacks = 0;
2815                                         tp->t_rexmtthresh = tcprexmtthresh;
2816                                 }
2817
2818                                 m_freem(m);
2819
2820                                 /*
2821                                  * If all outstanding data are acked, stop
2822                                  * retransmit timer, otherwise restart timer
2823                                  * using current (possibly backed-off) value.
2824                                  * If process is waiting for space,
2825                                  * wakeup/selwakeup/signal.  If data
2826                                  * are ready to send, let tcp_output
2827                                  * decide between more output or persist.
2828                                  */
2829                                 if (tp->snd_una == tp->snd_max) {
2830                                         tp->t_timer[TCPT_REXMT] = 0;
2831                                         tp->t_timer[TCPT_PTO] = 0;
2832                                 } else if (tp->t_timer[TCPT_PERSIST] == 0) {
2833                                         tp->t_timer[TCPT_REXMT] =
2834                                             OFFSET_FROM_START(tp,
2835                                             tp->t_rxtcur);
2836                                 }
2837                                 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
2838                                     !TCP_DSACK_SEQ_IN_WINDOW(tp,
2839                                     tp->t_dsack_lastuna, tp->snd_una))
2840                                         tcp_rxtseg_clean(tp);
2841
2842                                 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2843                                         tp->t_bwmeas != NULL)
2844                                         tcp_bwmeas_check(tp);
2845                                 sowwakeup(so); /* has to be done with socket lock held */
2846                                 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
2847                                         (void) tcp_output(tp);
2848                                 }
2849
2850                                 tcp_tfo_rcv_ack(tp, th);
2851
2852                                 tcp_check_timer_state(tp);
2853                                 tcp_unlock(so, 1, 0);
2854                                 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2855                                 return;
2856                         }
2857                 } else if (th->th_ack == tp->snd_una &&
2858                     LIST_EMPTY(&tp->t_segq) &&
2859                     tlen <= tcp_sbspace(tp)) {
2860                         /*
2861                          * this is a pure, in-sequence data packet
2862                          * with nothing on the reassembly queue and
2863                          * we have enough buffer space to take it.
2864                          */
2865
2866                         /*
2867                          * If this is a connection in steady state, start
2868                          * coalescing packets belonging to this flow.
2869                          */
2870                         if (turnoff_lro) {
2871                                 tcp_lro_remove_state(tp->t_inpcb->inp_laddr,
2872                                         tp->t_inpcb->inp_faddr,
2873                                         tp->t_inpcb->inp_lport,
2874                                         tp->t_inpcb->inp_fport);
2875                                 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
2876                                 tp->t_idleat = tp->rcv_nxt;
2877                         } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 &&
2878                             (so->so_flags & SOF_USELRO) &&
2879                             !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) &&
2880                             (m->m_pkthdr.rcvif->if_type != IFT_LOOP) &&
2881                             ((th->th_seq - tp->irs) >
2882                             (tp->t_maxseg << lro_start)) &&
2883                             ((tp->t_idleat == 0) || ((th->th_seq -
2884                              tp->t_idleat) > (tp->t_maxseg << lro_start)))) {
2885                                 tp->t_flagsext |= TF_LRO_OFFLOADED;
2886                                 tcp_start_coalescing(ip, th, tlen);
2887                                 tp->t_idleat = 0;
2888                         }
2889
2890                         /* Clean receiver SACK report if present */
2891                         if (SACK_ENABLED(tp) && tp->rcv_numsacks)
2892                                 tcp_clean_sackreport(tp);
2893                         ++tcpstat.tcps_preddat;
2894                         tp->rcv_nxt += tlen;
2895                         /*
2896                          * Pull snd_wl1 up to prevent seq wrap relative to
2897                          * th_seq.
2898                          */
2899                         tp->snd_wl1 = th->th_seq;
2900                         /*
2901                          * Pull rcv_up up to prevent seq wrap relative to
2902                          * rcv_nxt.
2903                          */
2904                         tp->rcv_up = tp->rcv_nxt;
2905                         TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
2906                         tcpstat.tcps_rcvbyte += tlen;
2907                         if (nstat_collect) {
2908                                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2909                                         INP_ADD_STAT(inp, cell, wifi, wired,
2910                                             rxpackets, m->m_pkthdr.lro_npkts);
2911                                 } else {
2912                                         INP_ADD_STAT(inp, cell, wifi, wired,
2913                                             rxpackets, 1);
2914                                 }
2915                                 INP_ADD_STAT(inp, cell, wifi, wired,rxbytes,
2916                                     tlen);
2917                         }
2918
2919                         /*
2920                          * Calculate the RTT on the receiver only if the
2921                          * connection is in streaming mode and the last
2922                          * packet was not an end-of-write
2923                          */
2924                         if ((tp->t_flags & TF_STRETCHACK) &&
2925                                 !(tp->t_flagsext & TF_STREAMEOW))
2926                                 tcp_compute_rtt(tp, &to, th);
2927
2928                         tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
2929
2930                         /*
2931                          * Add data to socket buffer.
2932                          */
2933                         so_recv_data_stat(so, m, 0);
2934                         m_adj(m, drop_hdrlen);  /* delayed header drop */
2935
2936                         /*
2937                          * If message delivery (SOF_ENABLE_MSGS) is enabled on
2938                          * this socket, deliver the packet received as an
2939                          * in-order message with sequence number attached to it.
2940                          */
2941                         if (sbappendstream_rcvdemux(so, m,
2942                             th->th_seq - (tp->irs + 1), 0)) {
2943                                 sorwakeup(so);
2944                         }
2945 #if INET6
2946                         if (isipv6) {
2947                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2948                                         (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2949                                         th->th_seq, th->th_ack, th->th_win);
2950                         }
2951                         else
2952 #endif
2953                         {
2954                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2955                                         (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2956                                         th->th_seq, th->th_ack, th->th_win);
2957                         }
2958                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2959                         if (DELAY_ACK(tp, th))  {
2960                                 if ((tp->t_flags & TF_DELACK) == 0) {
2961                                         tp->t_flags |= TF_DELACK;
2962                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2963                                 }
2964                         } else {
2965                                 tp->t_flags |= TF_ACKNOW;
2966                                 tcp_output(tp);
2967                         }
2968
2969                         tcp_adaptive_rwtimo_check(tp, tlen);
2970
2971                         if (tlen > 0)
2972                                 tcp_tfo_rcv_data(tp);
2973
2974                         tcp_check_timer_state(tp);
2975                         tcp_unlock(so, 1, 0);
2976                         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2977                         return;
2978                 }
2979         }
2980
2981         /*
2982          * Calculate amount of space in receive window,
2983          * and then do TCP input processing.
2984          * Receive window is amount of space in rcv queue,
2985          * but not less than advertised window.
2986          */
2987         lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2988             LCK_MTX_ASSERT_OWNED);
2989         win = tcp_sbspace(tp);
2990         if (win < 0)
2991                 win = 0;
2992         else {  /* clip rcv window to 4K for modems */
2993                 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2994                         win = min(win, slowlink_wsize);
2995         }
2996         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2997 #if MPTCP
2998         /*
2999          * Ensure that the subflow receive window isn't greater
3000          * than the connection level receive window.
3001          */
3002         if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
3003             (mp_tp = tptomptp(tp))) {
3004                 MPT_LOCK(mp_tp);
3005                 if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
3006                         tp->rcv_wnd = mp_tp->mpt_rcvwnd;
3007                         tcpstat.tcps_mp_reducedwin++;
3008                 }
3009                 MPT_UNLOCK(mp_tp);
3010         }
3011 #endif /* MPTCP */
3012
3013         switch (tp->t_state) {
3014
3015         /*
3016          * Initialize tp->rcv_nxt, and tp->irs, select an initial
3017          * tp->iss, and send a segment:
3018          *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3019          * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
3020          * Fill in remote peer address fields if not previously specified.
3021          * Enter SYN_RECEIVED state, and process any other fields of this
3022          * segment in this state.
3023          */
3024         case TCPS_LISTEN: {
3025                 register struct sockaddr_in *sin;
3026 #if INET6
3027                 register struct sockaddr_in6 *sin6;
3028 #endif
3029
3030                 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3031                     LCK_MTX_ASSERT_OWNED);
3032 #if INET6
3033                 if (isipv6) {
3034                         MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
3035                                M_SONAME, M_NOWAIT);
3036                         if (sin6 == NULL)
3037                                 goto drop;
3038                         bzero(sin6, sizeof(*sin6));
3039                         sin6->sin6_family = AF_INET6;
3040                         sin6->sin6_len = sizeof(*sin6);
3041                         sin6->sin6_addr = ip6->ip6_src;
3042                         sin6->sin6_port = th->th_sport;
3043                         laddr6 = inp->in6p_laddr;
3044                         if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
3045                                 inp->in6p_laddr = ip6->ip6_dst;
3046                         if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
3047                                            proc0)) {
3048                                 inp->in6p_laddr = laddr6;
3049                                 FREE(sin6, M_SONAME);
3050                                 goto drop;
3051                         }
3052                         FREE(sin6, M_SONAME);
3053                 } else
3054 #endif
3055             {
3056                         lck_mtx_assert(
3057                             &((struct inpcb *)so->so_pcb)->inpcb_mtx,
3058                             LCK_MTX_ASSERT_OWNED);
3059                         MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
3060                        M_NOWAIT);
3061                         if (sin == NULL)
3062                                 goto drop;
3063                         sin->sin_family = AF_INET;
3064                         sin->sin_len = sizeof(*sin);
3065                         sin->sin_addr = ip->ip_src;
3066                         sin->sin_port = th->th_sport;
3067                         bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
3068                         laddr = inp->inp_laddr;
3069                         if (inp->inp_laddr.s_addr == INADDR_ANY)
3070                                 inp->inp_laddr = ip->ip_dst;
3071                         if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0,
3072                             IFSCOPE_NONE, NULL)) {
3073                                 inp->inp_laddr = laddr;
3074                                 FREE(sin, M_SONAME);
3075                                 goto drop;
3076                         }
3077                         FREE(sin, M_SONAME);
3078                 }
3079
3080                 tcp_dooptions(tp, optp, optlen, th, &to);
3081                 tcp_finalize_options(tp, &to, ifscope);
3082
3083                 if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to))
3084                         isconnected = TRUE;
3085
3086                 if (iss)
3087                         tp->iss = iss;
3088                 else {
3089                         tp->iss = tcp_new_isn(tp);
3090                 }
3091                 tp->irs = th->th_seq;
3092                 tcp_sendseqinit(tp);
3093                 tcp_rcvseqinit(tp);
3094                 tp->snd_recover = tp->snd_una;
3095                 /*
3096                  * Initialization of the tcpcb for transaction;
3097                  *   set SND.WND = SEG.WND,
3098                  *   initialize CCsend and CCrecv.
3099                  */
3100                 tp->snd_wnd = tiwin;    /* initial send-window */
3101                 tp->t_flags |= TF_ACKNOW;
3102                 tp->t_unacksegs = 0;
3103                 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3104                         struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3105                 tp->t_state = TCPS_SYN_RECEIVED;
3106                 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3107                         TCP_CONN_KEEPINIT(tp));
3108                 dropsocket = 0;         /* committed to socket */
3109
3110                 if (inp->inp_flowhash == 0)
3111                         inp->inp_flowhash = inp_calc_flowhash(inp);
3112 #if INET6
3113                 /* update flowinfo - RFC 6437 */
3114                 if (inp->inp_flow == 0 &&
3115                     inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
3116                         inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
3117                         inp->inp_flow |=
3118                             (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
3119                 }
3120 #endif /* INET6 */
3121
3122                 /* reset the incomp processing flag */
3123                 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
3124                 tcpstat.tcps_accepts++;
3125                 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
3126                         /* ECN-setup SYN */
3127                         tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
3128                 }
3129
3130 #if CONFIG_IFEF_NOWINDOWSCALE
3131                 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
3132                     (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
3133                         /* Window scaling is not enabled on this interface */
3134                         tp->t_flags &= ~TF_REQ_SCALE;
3135                 }
3136 #endif
3137                 goto trimthenstep6;
3138                 }
3139
3140         /*
3141          * If the state is SYN_RECEIVED and the seg contains an ACK,
3142          * but not for our SYN/ACK, send a RST.
3143          */
3144         case TCPS_SYN_RECEIVED:
3145                 if ((thflags & TH_ACK) &&
3146                     (SEQ_LEQ(th->th_ack, tp->snd_una) ||
3147                      SEQ_GT(th->th_ack, tp->snd_max))) {
3148                                 rstreason = BANDLIM_RST_OPENPORT;
3149                                 IF_TCP_STATINC(ifp, ooopacket);
3150                                 goto dropwithreset;
3151                 }
3152
3153                 /*
3154                  * In SYN_RECEIVED state, if we recv some SYNS with
3155                  * window scale and others without, window scaling should
3156                  * be disabled. Otherwise the window advertised will be
3157                  * lower if we assume scaling and the other end does not.
3158                  */
3159                 if ((thflags & TH_SYN) &&
3160                     (tp->irs == th->th_seq) &&
3161                     !(to.to_flags & TOF_SCALE))
3162                         tp->t_flags &= ~TF_RCVD_SCALE;
3163                 break;
3164
3165         /*
3166          * If the state is SYN_SENT:
3167          *      if seg contains an ACK, but not for our SYN, drop the input.
3168          *      if seg contains a RST, then drop the connection.
3169          *      if seg does not contain SYN, then drop it.
3170          * Otherwise this is an acceptable SYN segment
3171          *      initialize tp->rcv_nxt and tp->irs
3172          *      if seg contains ack then advance tp->snd_una
3173          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
3174          *      arrange for segment to be acked (eventually)
3175          *      continue processing rest of data/controls, beginning with URG
3176          */
3177         case TCPS_SYN_SENT:
3178                 if ((thflags & TH_ACK) &&
3179                     (SEQ_LEQ(th->th_ack, tp->iss) ||
3180                      SEQ_GT(th->th_ack, tp->snd_max))) {
3181                         rstreason = BANDLIM_UNLIMITED;
3182                         IF_TCP_STATINC(ifp, ooopacket);
3183                         goto dropwithreset;
3184                 }
3185                 if (thflags & TH_RST) {
3186                         if ((thflags & TH_ACK) != 0) {
3187 #if MPTCP
3188                                 if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
3189                                         SEQ_GT(th->th_ack, tp->iss+1)) {
3190                                         so->so_flags &= ~SOF_MPTCP_FASTJOIN;
3191                                         /* ignore the RST and retransmit SYN */
3192                                         goto drop;
3193                                 }
3194 #endif /* MPTCP */
3195                                 soevent(so,
3196                                     (SO_FILT_HINT_LOCKED |
3197                                     SO_FILT_HINT_CONNRESET));
3198                                 tp = tcp_drop(tp, ECONNREFUSED);
3199                                 postevent(so, 0, EV_RESET);
3200                         }
3201                         goto drop;
3202                 }
3203                 if ((thflags & TH_SYN) == 0)
3204                         goto drop;
3205                 tp->snd_wnd = th->th_win;       /* initial send window */
3206
3207                 tp->irs = th->th_seq;
3208                 tcp_rcvseqinit(tp);
3209                 if (thflags & TH_ACK) {
3210                         tcpstat.tcps_connects++;
3211
3212                         if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
3213                                 /* ECN-setup SYN-ACK */
3214                                 tp->ecn_flags |= TE_SETUPRECEIVED;
3215                                 if (TCP_ECN_ENABLED(tp)) {
3216                                         tcp_heuristic_reset_loss(tp, 0, 1);
3217                                         tcpstat.tcps_ecn_client_success++;
3218                                 }
3219                         } else {
3220                                 if (tp->ecn_flags & TE_SETUPSENT &&
3221                                     tp->t_rxtshift == 0) {
3222                                         tcp_heuristic_reset_loss(tp, 0, 1);
3223                                         tcpstat.tcps_ecn_not_supported++;
3224                                 }
3225                                 if (tp->ecn_flags & TE_SETUPSENT &&
3226                                     tp->t_rxtshift > 0)
3227                                         tcp_heuristic_inc_loss(tp, 0, 1);
3228
3229                                 /* non-ECN-setup SYN-ACK */
3230                                 tp->ecn_flags &= ~TE_SENDIPECT;
3231                         }
3232
3233 #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
3234                         /* XXXMAC: recursive lock: SOCK_LOCK(so); */
3235                         mac_socketpeer_label_associate_mbuf(m, so);
3236                         /* XXXMAC: SOCK_UNLOCK(so); */
3237 #endif
3238                         /* Do window scaling on this connection? */
3239                         if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3240                                 tp->snd_scale = tp->requested_s_scale;
3241                                 tp->rcv_scale = tp->request_r_scale;
3242                         }
3243
3244                         tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
3245                         tp->snd_una++;          /* SYN is acked */
3246                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3247                                 tp->snd_nxt = tp->snd_una;
3248
3249                         /*
3250                          * We have sent more in the SYN than what is being
3251                          * acked. (e.g., TFO)
3252                          * We should restart the sending from what the receiver
3253                          * has acknowledged immediately.
3254                          */
3255                         if (SEQ_GT(tp->snd_nxt, th->th_ack))
3256                                 tp->snd_max = tp->snd_nxt = th->th_ack;
3257
3258                         /*
3259                          * If there's data, delay ACK; if there's also a FIN
3260                          * ACKNOW will be turned on later.
3261                          */
3262                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
3263                         if (DELAY_ACK(tp, th) && tlen != 0 ) {
3264                                 if ((tp->t_flags & TF_DELACK) == 0) {
3265                                         tp->t_flags |= TF_DELACK;
3266                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3267                                 }
3268                         }
3269                         else {
3270                                 tp->t_flags |= TF_ACKNOW;
3271                         }
3272                         /*
3273                          * Received <SYN,ACK> in SYN_SENT[*] state.
3274                          * Transitions:
3275                          *      SYN_SENT  --> ESTABLISHED
3276                          *      SYN_SENT* --> FIN_WAIT_1
3277                          */
3278                         tp->t_starttime = tcp_now;
3279                         tcp_sbrcv_tstmp_check(tp);
3280                         if (tp->t_flags & TF_NEEDFIN) {
3281                                 DTRACE_TCP4(state__change, void, NULL,
3282                                     struct inpcb *, inp,
3283                                     struct tcpcb *, tp, int32_t,
3284                                     TCPS_FIN_WAIT_1);
3285                                 tp->t_state = TCPS_FIN_WAIT_1;
3286                                 tp->t_flags &= ~TF_NEEDFIN;
3287                                 thflags &= ~TH_SYN;
3288                         } else {
3289                                 DTRACE_TCP4(state__change, void, NULL,
3290                                     struct inpcb *, inp, struct tcpcb *,
3291                                     tp, int32_t, TCPS_ESTABLISHED);
3292                                 tp->t_state = TCPS_ESTABLISHED;
3293                                 tp->t_timer[TCPT_KEEP] =
3294                                     OFFSET_FROM_START(tp,
3295                                     TCP_CONN_KEEPIDLE(tp));
3296                                 if (nstat_collect)
3297                                         nstat_route_connect_success(
3298                                             tp->t_inpcb->inp_route.ro_rt);
3299                         }
3300 #if MPTCP
3301                         /*
3302                          * Do not send the connect notification for additional
3303                          * subflows until ACK for 3-way handshake arrives.
3304                          */
3305                         if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3306                             (tp->t_mpflags & TMPF_SENT_JOIN)) {
3307                                 isconnected = FALSE;
3308                                 /* Start data xmit if fastjoin */
3309                                 if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) {
3310                                         soevent(so, (SO_FILT_HINT_LOCKED |
3311                                             SO_FILT_HINT_MPFASTJ));
3312                                 }
3313                         } else
3314 #endif /* MPTCP */
3315                                 isconnected = TRUE;
3316
3317                         if (tp->t_tfo_flags & (TFO_F_COOKIE_REQ | TFO_F_COOKIE_SENT)) {
3318                                 tcp_tfo_synack(tp, &to);
3319
3320                                 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
3321                                     SEQ_LT(tp->snd_una, th->th_ack)) {
3322                                         tp->t_tfo_stats |= TFO_S_SYN_DATA_ACKED;
3323                                         tcpstat.tcps_tfo_syn_data_acked++;
3324 #if MPTCP
3325                                         if (so->so_flags & SOF_MP_SUBFLOW)
3326                                                 so->so_flags1 |= SOF1_TFO_REWIND;
3327 #endif
3328                                         if (!(tp->t_tfo_flags & TFO_F_NO_RCVPROBING))
3329                                                 tcp_tfo_rcv_probe(tp, tlen);
3330                                 }
3331                         }
3332                 } else {
3333                         /*
3334                          *  Received initial SYN in SYN-SENT[*] state => simul-
3335                          *  taneous open.  If segment contains CC option and there is
3336                          *  a cached CC, apply TAO test; if it succeeds, connection is
3337                          *  half-synchronized.  Otherwise, do 3-way handshake:
3338                          *        SYN-SENT -> SYN-RECEIVED
3339                          *        SYN-SENT* -> SYN-RECEIVED*
3340                          */
3341                         tp->t_flags |= TF_ACKNOW;
3342                         tp->t_timer[TCPT_REXMT] = 0;
3343                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3344                                 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3345                         tp->t_state = TCPS_SYN_RECEIVED;
3346
3347                         /*
3348                          * During simultaneous open, TFO should not be used.
3349                          * So, we disable it here, to prevent that data gets
3350                          * sent on the SYN/ACK.
3351                          */
3352                         tcp_disable_tfo(tp);
3353                 }
3354
3355 trimthenstep6:
3356                 /*
3357                  * Advance th->th_seq to correspond to first data byte.
3358                  * If data, trim to stay within window,
3359                  * dropping FIN if necessary.
3360                  */
3361                 th->th_seq++;
3362                 if (tlen > tp->rcv_wnd) {
3363                         todrop = tlen - tp->rcv_wnd;
3364                         m_adj(m, -todrop);
3365                         tlen = tp->rcv_wnd;
3366                         thflags &= ~TH_FIN;
3367                         tcpstat.tcps_rcvpackafterwin++;
3368                         tcpstat.tcps_rcvbyteafterwin += todrop;
3369                 }
3370                 tp->snd_wl1 = th->th_seq - 1;
3371                 tp->rcv_up = th->th_seq;
3372                 /*
3373                  *  Client side of transaction: already sent SYN and data.
3374                  *  If the remote host used T/TCP to validate the SYN,
3375                  *  our data will be ACK'd; if so, enter normal data segment
3376                  *  processing in the middle of step 5, ack processing.
3377                  *  Otherwise, goto step 6.
3378                  */
3379                 if (thflags & TH_ACK)
3380                         goto process_ACK;
3381                 goto step6;
3382         /*
3383          * If the state is LAST_ACK or CLOSING or TIME_WAIT:
3384          *      do normal processing.
3385          *
3386          * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
3387          */
3388         case TCPS_LAST_ACK:
3389         case TCPS_CLOSING:
3390         case TCPS_TIME_WAIT:
3391                 break;  /* continue normal processing */
3392
3393         /* Received a SYN while connection is already established.
3394          * This is a "half open connection and other anomalies" described
3395          * in RFC793 page 34, send an ACK so the remote resets the connection
3396          * or recovers by adjusting its sequence numbering
3397          */
3398         case TCPS_ESTABLISHED:
3399                 if (thflags & TH_SYN)
3400                         goto dropafterack;
3401                 break;
3402         }
3403
3404         /*
3405          * States other than LISTEN or SYN_SENT.
3406          * First check the RST flag and sequence number since reset segments
3407          * are exempt from the timestamp and connection count tests.  This
3408          * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
3409          * below which allowed reset segments in half the sequence space
3410          * to fall through and be processed (which gives forged reset
3411          * segments with a random sequence number a 50 percent chance of
3412          * killing a connection).
3413          * Then check timestamp, if present.
3414          * Then check the connection count, if present.
3415          * Then check that at least some bytes of segment are within
3416          * receive window.  If segment begins before rcv_nxt,
3417          * drop leading data (and SYN); if nothing left, just ack.
3418          *
3419          *
3420          * If the RST bit is set, check the sequence number to see
3421          * if this is a valid reset segment.
3422          * RFC 793 page 37:
3423          *   In all states except SYN-SENT, all reset (RST) segments
3424          *   are validated by checking their SEQ-fields.  A reset is
3425          *   valid if its sequence number is in the window.
3426          * Note: this does not take into account delayed ACKs, so
3427          *   we should test against last_ack_sent instead of rcv_nxt.
3428          *   The sequence number in the reset segment is normally an
3429          *   echo of our outgoing acknowledgement numbers, but some hosts
3430          *   send a reset with the sequence number at the rightmost edge
3431          *   of our receive window, and we have to handle this case.
3432          * Note 2: Paul Watson's paper "Slipping in the Window" has shown
3433          *   that brute force RST attacks are possible.  To combat this,
3434          *   we use a much stricter check while in the ESTABLISHED state,
3435          *   only accepting RSTs where the sequence number is equal to
3436          *   last_ack_sent.  In all other states (the states in which a
3437          *   RST is more likely), the more permissive check is used.
3438          * If we have multiple segments in flight, the initial reset
3439          * segment sequence numbers will be to the left of last_ack_sent,
3440          * but they will eventually catch up.
3441          * In any case, it never made sense to trim reset segments to
3442          * fit the receive window since RFC 1122 says:
3443          *   4.2.2.12  RST Segment: RFC-793 Section 3.4
3444          *
3445          *    A TCP SHOULD allow a received RST segment to include data.
3446          *
3447          *    DISCUSSION
3448          *         It has been suggested that a RST segment could contain
3449          *         ASCII text that encoded and explained the cause of the
3450          *         RST.  No standard has yet been established for such
3451          *         data.
3452          *
3453          * If the reset segment passes the sequence number test examine
3454          * the state:
3455          *    SYN_RECEIVED STATE:
3456          *      If passive open, return to LISTEN state.
3457          *      If active open, inform user that connection was refused.
3458          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
3459          *      Inform user that connection was reset, and close tcb.
3460          *    CLOSING, LAST_ACK STATES:
3461          *      Close the tcb.
3462          *    TIME_WAIT STATE:
3463          *      Drop the segment - see Stevens, vol. 2, p. 964 and
3464          *      RFC 1337.
3465          *
3466          *      Radar 4803931: Allows for the case where we ACKed the FIN but
3467          *                     there is already a RST in flight from the peer.
3468          *                     In that case, accept the RST for non-established
3469          *                     state if it's one off from last_ack_sent.
3470
3471          */
3472         if (thflags & TH_RST) {
3473                 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
3474                     SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
3475                     (tp->rcv_wnd == 0 &&
3476                     ((tp->last_ack_sent == th->th_seq) ||
3477                     ((tp->last_ack_sent -1) == th->th_seq)))) {
3478                         switch (tp->t_state) {
3479
3480                         case TCPS_SYN_RECEIVED:
3481                                 IF_TCP_STATINC(ifp, rstinsynrcv);
3482                                 so->so_error = ECONNREFUSED;
3483                                 goto close;
3484
3485                         case TCPS_ESTABLISHED:
3486                                 if (tp->last_ack_sent != th->th_seq) {
3487                                         tcpstat.tcps_badrst++;
3488                                         goto drop;
3489                                 }
3490                         case TCPS_FIN_WAIT_1:
3491                         case TCPS_CLOSE_WAIT:
3492                                 /*
3493                                   Drop through ...
3494                                 */
3495                         case TCPS_FIN_WAIT_2:
3496                                 so->so_error = ECONNRESET;
3497                         close:
3498                                 postevent(so, 0, EV_RESET);
3499                                 soevent(so,
3500                                     (SO_FILT_HINT_LOCKED |
3501                                     SO_FILT_HINT_CONNRESET));
3502
3503                                 tcpstat.tcps_drops++;
3504                                 tp = tcp_close(tp);
3505                                 break;
3506
3507                         case TCPS_CLOSING:
3508                         case TCPS_LAST_ACK:
3509                                 tp = tcp_close(tp);
3510                                 break;
3511
3512                         case TCPS_TIME_WAIT:
3513                                 break;
3514                         }
3515                 }
3516                 goto drop;
3517         }
3518
3519         /*
3520          * RFC 1323 PAWS: If we have a timestamp reply on this segment
3521          * and it's less than ts_recent, drop it.
3522          */
3523         if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
3524             TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3525
3526                 /* Check to see if ts_recent is over 24 days old.  */
3527                 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
3528                         /*
3529                          * Invalidate ts_recent.  If this segment updates
3530                          * ts_recent, the age will be reset later and ts_recent
3531                          * will get a valid value.  If it does not, setting
3532                          * ts_recent to zero will at least satisfy the
3533                          * requirement that zero be placed in the timestamp
3534                          * echo reply when ts_recent isn't valid.  The
3535                          * age isn't reset until we get a valid ts_recent
3536                          * because we don't want out-of-order segments to be
3537                          * dropped when ts_recent is old.
3538                          */
3539                         tp->ts_recent = 0;
3540                 } else {
3541                         tcpstat.tcps_rcvduppack++;
3542                         tcpstat.tcps_rcvdupbyte += tlen;
3543                         tp->t_pawsdrop++;
3544                         tcpstat.tcps_pawsdrop++;
3545
3546                         /*
3547                          * PAWS-drop when ECN is being used? That indicates
3548                          * that ECT-marked packets take a different path, with
3549                          * different congestion-characteristics.
3550                          *
3551                          * Only fall back when we have received less than 2GB, as
3552                          * PAWS really has no reason to kick in earlier.
3553                          */
3554                         if (TCP_ECN_ENABLED(tp) &&
3555                             inp->inp_stat->rxbytes < 2147483648) {
3556                                 INP_INC_IFNET_STAT(inp, ecn_fallback_reorder);
3557                                 tcpstat.tcps_ecn_fallback_reorder++;
3558                                 tcp_heuristic_ecn_aggressive(tp);
3559                         }
3560
3561                         if (nstat_collect) {
3562                                 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
3563                                         1, tlen, NSTAT_RX_FLAG_DUPLICATE);
3564                                 INP_ADD_STAT(inp, cell, wifi, wired,
3565                                     rxpackets, 1);
3566                                 INP_ADD_STAT(inp, cell, wifi, wired,
3567                                     rxbytes, tlen);
3568                                 tp->t_stat.rxduplicatebytes += tlen;
3569                         }
3570                         if (tlen > 0)
3571                                 goto dropafterack;
3572                         goto drop;
3573                 }
3574         }
3575
3576         /*
3577          * In the SYN-RECEIVED state, validate that the packet belongs to
3578          * this connection before trimming the data to fit the receive
3579          * window.  Check the sequence number versus IRS since we know
3580          * the sequence numbers haven't wrapped.  This is a partial fix
3581          * for the "LAND" DoS attack.
3582          */
3583         if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
3584                 rstreason = BANDLIM_RST_OPENPORT;
3585                 IF_TCP_STATINC(ifp, dospacket);
3586                 goto dropwithreset;
3587         }
3588
3589         todrop = tp->rcv_nxt - th->th_seq;
3590         if (todrop > 0) {
3591                 if (thflags & TH_SYN) {
3592                         thflags &= ~TH_SYN;
3593                         th->th_seq++;
3594                         if (th->th_urp > 1)
3595                                 th->th_urp--;
3596                         else
3597                                 thflags &= ~TH_URG;
3598                         todrop--;
3599                 }
3600                 /*
3601                  * Following if statement from Stevens, vol. 2, p. 960.
3602                  */
3603                 if (todrop > tlen
3604                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
3605                         /*
3606                          * Any valid FIN must be to the left of the window.
3607                          * At this point the FIN must be a duplicate or out
3608                          * of sequence; drop it.
3609                          */
3610                         thflags &= ~TH_FIN;
3611
3612                         /*
3613                          * Send an ACK to resynchronize and drop any data.
3614                          * But keep on processing for RST or ACK.
3615                          */
3616                         tp->t_flags |= TF_ACKNOW;
3617                         if (todrop == 1) {
3618                                 /* This could be a keepalive */
3619                                 soevent(so, SO_FILT_HINT_LOCKED |
3620                                         SO_FILT_HINT_KEEPALIVE);
3621                         }
3622                         todrop = tlen;
3623                         tcpstat.tcps_rcvduppack++;
3624                         tcpstat.tcps_rcvdupbyte += todrop;
3625                 } else {
3626                         tcpstat.tcps_rcvpartduppack++;
3627                         tcpstat.tcps_rcvpartdupbyte += todrop;
3628                 }
3629
3630                 if (TCP_DSACK_ENABLED(tp) && todrop > 1) {
3631                         /*
3632                          * Note the duplicate data sequence space so that
3633                          * it can be reported in DSACK option.
3634                          */
3635                         tp->t_dsack_lseq = th->th_seq;
3636                         tp->t_dsack_rseq = th->th_seq + todrop;
3637                         tp->t_flags |= TF_ACKNOW;
3638                 }
3639                 if (nstat_collect) {
3640                         nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
3641                                 todrop, NSTAT_RX_FLAG_DUPLICATE);
3642                         INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
3643                         INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
3644                         tp->t_stat.rxduplicatebytes += todrop;
3645                 }
3646                 drop_hdrlen += todrop;  /* drop from the top afterwards */
3647                 th->th_seq += todrop;
3648                 tlen -= todrop;
3649                 if (th->th_urp > todrop)
3650                         th->th_urp -= todrop;
3651                 else {
3652                         thflags &= ~TH_URG;
3653                         th->th_urp = 0;
3654                 }
3655         }
3656
3657         /*
3658          * If new data are received on a connection after the user
3659          * processes are gone, then RST the other end.
3660          * Send also a RST when we received a data segment after we've
3661          * sent our FIN when the socket is defunct.
3662          * Note that an MPTCP subflow socket would have SS_NOFDREF set
3663          * by default so check to make sure that we test for SOF_MP_SUBFLOW
3664          * socket flag (which would be cleared when the socket is closed.)
3665          */
3666         if (!(so->so_flags & SOF_MP_SUBFLOW) && tlen &&
3667             (((so->so_state & SS_NOFDREF) &&
3668             tp->t_state > TCPS_CLOSE_WAIT) ||
3669             ((so->so_flags & SOF_DEFUNCT) &&
3670             tp->t_state > TCPS_FIN_WAIT_1))) {
3671                 tp = tcp_close(tp);
3672                 tcpstat.tcps_rcvafterclose++;
3673                 rstreason = BANDLIM_UNLIMITED;
3674                 IF_TCP_STATINC(ifp, cleanup);
3675                 goto dropwithreset;
3676         }
3677
3678         /*
3679          * If segment ends after window, drop trailing data
3680          * (and PUSH and FIN); if nothing left, just ACK.
3681          */
3682         todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
3683         if (todrop > 0) {
3684                 tcpstat.tcps_rcvpackafterwin++;
3685                 if (todrop >= tlen) {
3686                         tcpstat.tcps_rcvbyteafterwin += tlen;
3687                         /*
3688                          * If a new connection request is received
3689                          * while in TIME_WAIT, drop the old connection
3690                          * and start over if the sequence numbers
3691                          * are above the previous ones.
3692                          */
3693                         if (thflags & TH_SYN &&
3694                             tp->t_state == TCPS_TIME_WAIT &&
3695                             SEQ_GT(th->th_seq, tp->rcv_nxt)) {
3696                                 iss = tcp_new_isn(tp);
3697                                 tp = tcp_close(tp);
3698                                 tcp_unlock(so, 1, 0);
3699                                 goto findpcb;
3700                         }
3701                         /*
3702                          * If window is closed can only take segments at
3703                          * window edge, and have to drop data and PUSH from
3704                          * incoming segments.  Continue processing, but
3705                          * remember to ack.  Otherwise, drop segment
3706                          * and ack.
3707                          */
3708                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
3709                                 tp->t_flags |= TF_ACKNOW;
3710                                 tcpstat.tcps_rcvwinprobe++;
3711                         } else
3712                                 goto dropafterack;
3713                 } else
3714                         tcpstat.tcps_rcvbyteafterwin += todrop;
3715                 m_adj(m, -todrop);
3716                 tlen -= todrop;
3717                 thflags &= ~(TH_PUSH|TH_FIN);
3718         }
3719
3720         /*
3721          * If last ACK falls within this segment's sequence numbers,
3722          * record its timestamp.
3723          * NOTE:
3724          * 1) That the test incorporates suggestions from the latest
3725          *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
3726          * 2) That updating only on newer timestamps interferes with
3727          *    our earlier PAWS tests, so this check should be solely
3728          *    predicated on the sequence space of this segment.
3729          * 3) That we modify the segment boundary check to be
3730          *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
3731          *    instead of RFC1323's
3732          *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
3733          *    This modified check allows us to overcome RFC1323's
3734          *    limitations as described in Stevens TCP/IP Illustrated
3735          *    Vol. 2 p.869. In such cases, we can still calculate the
3736          *    RTT correctly when RCV.NXT == Last.ACK.Sent.
3737          */
3738         if ((to.to_flags & TOF_TS) != 0 &&
3739             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
3740             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
3741                 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
3742                 tp->ts_recent_age = tcp_now;
3743                 tp->ts_recent = to.to_tsval;
3744         }
3745
3746         /*
3747          * If a SYN is in the window, then this is an
3748          * error and we send an RST and drop the connection.
3749          */
3750         if (thflags & TH_SYN) {
3751                 tp = tcp_drop(tp, ECONNRESET);
3752                 rstreason = BANDLIM_UNLIMITED;
3753                 postevent(so, 0, EV_RESET);
3754                 IF_TCP_STATINC(ifp, synwindow);
3755                 goto dropwithreset;
3756         }
3757
3758         /*
3759          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
3760          * flag is on (half-synchronized state), then queue data for
3761          * later processing; else drop segment and return.
3762          */
3763         if ((thflags & TH_ACK) == 0) {
3764                 if (tp->t_state == TCPS_SYN_RECEIVED ||
3765                     (tp->t_flags & TF_NEEDSYN)) {
3766                         if ((tfo_enabled(tp))) {
3767                                 /*
3768                                  * So, we received a valid segment while in
3769                                  * SYN-RECEIVED (TF_NEEDSYN is actually never
3770                                  * set, so this is dead code).
3771                                  * As this cannot be an RST (see that if a bit
3772                                  * higher), and it does not have the ACK-flag
3773                                  * set, we want to retransmit the SYN/ACK.
3774                                  * Thus, we have to reset snd_nxt to snd_una to
3775                                  * trigger the going back to sending of the
3776                                  * SYN/ACK. This is more consistent with the
3777                                  * behavior of tcp_output(), which expects
3778                                  * to send the segment that is pointed to by
3779                                  * snd_nxt.
3780                                  */
3781                                 tp->snd_nxt = tp->snd_una;
3782
3783                                 /*
3784                                  * We need to make absolutely sure that we are
3785                                  * going to reply upon a duplicate SYN-segment.
3786                                  */
3787                                 if (th->th_flags & TH_SYN)
3788                                         needoutput = 1;
3789                         }
3790
3791                         goto step6;
3792                 } else if (tp->t_flags & TF_ACKNOW)
3793                         goto dropafterack;
3794                 else
3795                         goto drop;
3796         }
3797
3798         /*
3799          * Ack processing.
3800          */
3801
3802         switch (tp->t_state) {
3803
3804         /*
3805          * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
3806          * ESTABLISHED state and continue processing.
3807          * The ACK was checked above.
3808          */
3809         case TCPS_SYN_RECEIVED:
3810
3811                 tcpstat.tcps_connects++;
3812
3813                 /* Do window scaling? */
3814                 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3815                         tp->snd_scale = tp->requested_s_scale;
3816                         tp->rcv_scale = tp->request_r_scale;
3817                         tp->snd_wnd = th->th_win << tp->snd_scale;
3818                         tiwin = tp->snd_wnd;
3819                 }
3820                 /*
3821                  * Make transitions:
3822                  *      SYN-RECEIVED  -> ESTABLISHED
3823                  *      SYN-RECEIVED* -> FIN-WAIT-1
3824                  */
3825                 tp->t_starttime = tcp_now;
3826                 tcp_sbrcv_tstmp_check(tp);
3827                 if (tp->t_flags & TF_NEEDFIN) {
3828                         DTRACE_TCP4(state__change, void, NULL,
3829                             struct inpcb *, inp,
3830                             struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
3831                         tp->t_state = TCPS_FIN_WAIT_1;
3832                         tp->t_flags &= ~TF_NEEDFIN;
3833                 } else {
3834                         DTRACE_TCP4(state__change, void, NULL,
3835                             struct inpcb *, inp,
3836                             struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
3837                         tp->t_state = TCPS_ESTABLISHED;
3838                         tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3839                                 TCP_CONN_KEEPIDLE(tp));
3840                         if (nstat_collect)
3841                                 nstat_route_connect_success(
3842                                     tp->t_inpcb->inp_route.ro_rt);
3843                 }
3844                 /*
3845                  * If segment contains data or ACK, will call tcp_reass()
3846                  * later; if not, do so now to pass queued data to user.
3847                  */
3848                 if (tlen == 0 && (thflags & TH_FIN) == 0)
3849                         (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
3850                             NULL, ifp);
3851                 tp->snd_wl1 = th->th_seq - 1;
3852
3853 #if MPTCP
3854                 /*
3855                  * Do not send the connect notification for additional subflows
3856                  * until ACK for 3-way handshake arrives.
3857                  */
3858                 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3859                     (tp->t_mpflags & TMPF_SENT_JOIN)) {
3860                         isconnected = FALSE;
3861                 } else
3862 #endif /* MPTCP */
3863                         isconnected = TRUE;
3864                 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
3865                         /* This was already done when the SYN was received */
3866                         isconnected = FALSE;
3867
3868                         OSDecrementAtomic(&tcp_tfo_halfcnt);
3869
3870                         /* Panic if something has gone terribly wrong. */
3871                         VERIFY(tcp_tfo_halfcnt >= 0);
3872
3873                         tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
3874                 }
3875
3876                 /*
3877                  * In case there is data in the send-queue (e.g., TFO is being
3878                  * used, or connectx+data has been done), then if we would
3879                  * "FALLTHROUGH", we would handle this ACK as if data has been
3880                  * acknowledged. But, we have to prevent this. And this
3881                  * can be prevented by increasing snd_una by 1, so that the
3882                  * SYN is not considered as data (snd_una++ is actually also
3883                  * done in SYN_SENT-state as part of the regular TCP stack).
3884                  *
3885                  * In case there is data on this ack as well, the data will be
3886                  * handled by the label "dodata" right after step6.
3887                  */
3888                 if (so->so_snd.sb_cc) {
3889                         tp->snd_una++;  /* SYN is acked */
3890                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3891                                 tp->snd_nxt = tp->snd_una;
3892
3893                         /*
3894                          * No duplicate-ACK handling is needed. So, we
3895                          * directly advance to processing the ACK (aka,
3896                          * updating the RTT estimation,...)
3897                          *
3898                          * But, we first need to handle possible SACKs,
3899                          * because TFO will start sending data with the
3900                          * SYN/ACK, so it might be that the client
3901                          * includes a SACK with its ACK.
3902                          */
3903                         if (SACK_ENABLED(tp) &&
3904                             (to.to_nsacks > 0 ||
3905                              !TAILQ_EMPTY(&tp->snd_holes)))
3906                                 tcp_sack_doack(tp, &to, th,
3907                                     &sack_bytes_acked);
3908
3909                         goto process_ACK;
3910                 }
3911
3912                 /* FALLTHROUGH */
3913
3914         /*
3915          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
3916          * ACKs.  If the ack is in the range
3917          *      tp->snd_una < th->th_ack <= tp->snd_max
3918          * then advance tp->snd_una to th->th_ack and drop
3919          * data from the retransmission queue.  If this ACK reflects
3920          * more up to date window information we update our window information.
3921          */
3922         case TCPS_ESTABLISHED:
3923         case TCPS_FIN_WAIT_1:
3924         case TCPS_FIN_WAIT_2:
3925         case TCPS_CLOSE_WAIT:
3926         case TCPS_CLOSING:
3927         case TCPS_LAST_ACK:
3928         case TCPS_TIME_WAIT:
3929                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
3930                         tcpstat.tcps_rcvacktoomuch++;
3931                         goto dropafterack;
3932                 }
3933                 if (SACK_ENABLED(tp) && to.to_nsacks > 0) {
3934                         recvd_dsack = tcp_sack_process_dsack(tp, &to, th);
3935                         /*
3936                          * If DSACK is received and this packet has no
3937                          * other SACK information, it can be dropped.
3938                          * We do not want to treat it as a duplicate ack.
3939                          */
3940                         if (recvd_dsack &&
3941                             SEQ_LEQ(th->th_ack, tp->snd_una) &&
3942                             to.to_nsacks == 0) {
3943                                 tcp_bad_rexmt_check(tp, th, &to);
3944                                 goto drop;
3945                         }
3946                 }
3947
3948                 if (SACK_ENABLED(tp) &&
3949                     (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
3950                         tcp_sack_doack(tp, &to, th, &sack_bytes_acked);
3951
3952 #if MPTCP
3953                 if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) {
3954                         if (tp->t_mpflags & TMPF_PREESTABLISHED) {
3955                                 /* MP TCP establishment succeeded */
3956                                 tp->t_mpuna = 0;
3957                                 if (tp->t_mpflags & TMPF_JOINED_FLOW) {
3958                                         if (tp->t_mpflags & TMPF_SENT_JOIN) {
3959                                                 tp->t_mpflags &=
3960                                                     ~TMPF_PREESTABLISHED;
3961                                                 tp->t_mpflags |=
3962                                                     TMPF_MPTCP_TRUE;
3963                                                 so->so_flags |= SOF_MPTCP_TRUE;
3964                                                 mptcplog((LOG_DEBUG, "MPTCP "
3965                                                     "Sockets: %s \n",__func__),
3966                                                     MPTCP_SOCKET_DBG,
3967                                                     MPTCP_LOGLVL_LOG);
3968
3969                                                 tp->t_timer[TCPT_JACK_RXMT] = 0;
3970                                                 tp->t_mprxtshift = 0;
3971                                                 isconnected = TRUE;
3972                                         } else {
3973                                                 isconnected = FALSE;
3974                                         }
3975                                 } else {
3976                                         isconnected = TRUE;
3977                                         tp->t_mpflags &= ~TMPF_SENT_KEYS;
3978                                 }
3979                         }
3980                 }
3981 #endif /* MPTCP */
3982
3983                 tcp_tfo_rcv_ack(tp, th);
3984
3985                 /*
3986                  * If we have outstanding data (other than
3987                  * a window probe), this is a completely
3988                  * duplicate ack (ie, window info didn't
3989                  * change) and the ack is the biggest we've seen.
3990                  */
3991                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
3992                         if (tlen == 0 && tiwin == tp->snd_wnd) {
3993                                 /*
3994                                  * If both ends send FIN at the same time,
3995                                  * then the ack will be a duplicate ack
3996                                  * but we have to process the FIN. Check
3997                                  * for this condition and process the FIN
3998                                  * instead of the dupack
3999                                  */
4000                                 if ((thflags & TH_FIN) &&
4001                                     (tp->t_flags & TF_SENTFIN) &&
4002                                     !TCPS_HAVERCVDFIN(tp->t_state) &&
4003                                     (th->th_ack + 1) == tp->snd_max)
4004                                         break;
4005 process_dupack:
4006 #if MPTCP
4007                                 /*
4008                                  * MPTCP options that are ignored must
4009                                  * not be treated as duplicate ACKs.
4010                                  */
4011                                 if (to.to_flags & TOF_MPTCP) {
4012                                         goto drop;
4013                                 }
4014
4015                                 if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
4016                                         mptcplog((LOG_DEBUG, "MPTCP "
4017                                             "Sockets: bypass ack recovery\n"),
4018                                             MPTCP_SOCKET_DBG,
4019                                             MPTCP_LOGLVL_VERBOSE);
4020                                         break;
4021                                 }
4022 #endif /* MPTCP */
4023                                 /*
4024                                  * If a duplicate acknowledgement was seen
4025                                  * after ECN, it indicates packet loss in
4026                                  * addition to ECN. Reset INRECOVERY flag
4027                                  * so that we can process partial acks
4028                                  * correctly
4029                                  */
4030                                 if (tp->ecn_flags & TE_INRECOVERY)
4031                                         tp->ecn_flags &= ~TE_INRECOVERY;
4032
4033                                 tcpstat.tcps_rcvdupack++;
4034                                 ++tp->t_dupacks;
4035
4036                                 /*
4037                                  * Check if we need to reset the limit on
4038                                  * early retransmit
4039                                  */
4040                                 if (tp->t_early_rexmt_count > 0 &&
4041                                     TSTMP_GEQ(tcp_now,
4042                                     (tp->t_early_rexmt_win +
4043                                     TCP_EARLY_REXMT_WIN)))
4044                                         tp->t_early_rexmt_count = 0;
4045
4046                                 /*
4047                                  * Is early retransmit needed? We check for
4048                                  * this when the connection is waiting for
4049                                  * duplicate acks to enter fast recovery.
4050                                  */
4051                                 if (!IN_FASTRECOVERY(tp))
4052                                         tcp_early_rexmt_check(tp, th);
4053
4054                                 /*
4055                                  * If we've seen exactly rexmt threshold
4056                                  * of duplicate acks, assume a packet
4057                                  * has been dropped and retransmit it.
4058                                  * Kludge snd_nxt & the congestion
4059                                  * window so we send only this one
4060                                  * packet.
4061                                  *
4062                                  * We know we're losing at the current
4063                                  * window size so do congestion avoidance
4064                                  * (set ssthresh to half the current window
4065                                  * and pull our congestion window back to
4066                                  * the new ssthresh).
4067                                  *
4068                                  * Dup acks mean that packets have left the
4069                                  * network (they're now cached at the receiver)
4070                                  * so bump cwnd by the amount in the receiver
4071                                  * to keep a constant cwnd packets in the
4072                                  * network.
4073                                  */
4074                                 if (tp->t_timer[TCPT_REXMT] == 0 ||
4075                                     (th->th_ack != tp->snd_una
4076                                     && sack_bytes_acked == 0)) {
4077                                         tp->t_dupacks = 0;
4078                                         tp->t_rexmtthresh = tcprexmtthresh;
4079                                 } else if (tp->t_dupacks > tp->t_rexmtthresh ||
4080                                         IN_FASTRECOVERY(tp)) {
4081
4082                                         /*
4083                                          * If this connection was seeing packet
4084                                          * reordering, then recovery might be
4085                                          * delayed to disambiguate between
4086                                          * reordering and loss
4087                                          */
4088                                         if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
4089                                             (tp->t_flagsext &
4090                                             (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) ==
4091                                             (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4092                                                 /*
4093                                                  * Since the SACK information is already
4094                                                  * updated, this ACK will be dropped
4095                                                  */
4096                                                 break;
4097                                         }
4098
4099                                         if (SACK_ENABLED(tp)
4100                                             && IN_FASTRECOVERY(tp)) {
4101                                                 int awnd;
4102
4103                                                 /*
4104                                                  * Compute the amount of data in flight first.
4105                                                  * We can inject new data into the pipe iff
4106                                                  * we have less than 1/2 the original window's
4107                                                  * worth of data in flight.
4108                                                  */
4109                                                 awnd = (tp->snd_nxt - tp->snd_fack) +
4110                                                         tp->sackhint.sack_bytes_rexmit;
4111                                                 if (awnd < tp->snd_ssthresh) {
4112                                                         tp->snd_cwnd += tp->t_maxseg;
4113                                                         if (tp->snd_cwnd > tp->snd_ssthresh)
4114                                                                 tp->snd_cwnd = tp->snd_ssthresh;
4115                                                 }
4116                                         } else
4117                                                 tp->snd_cwnd += tp->t_maxseg;
4118
4119                                         tcp_ccdbg_trace(tp, th, TCP_CC_IN_FASTRECOVERY);
4120
4121                                         (void) tcp_output(tp);
4122                                         goto drop;
4123                                 } else if (tp->t_dupacks == tp->t_rexmtthresh) {
4124                                         tcp_seq onxt = tp->snd_nxt;
4125
4126                                         /*
4127                                          * If we're doing sack, check to
4128                                          * see if we're already in sack
4129                                          * recovery. If we're not doing sack,
4130                                          * check to see if we're in newreno
4131                                          * recovery.
4132                                          */
4133                                         if (SACK_ENABLED(tp)) {
4134                                                 if (IN_FASTRECOVERY(tp)) {
4135                                                         tp->t_dupacks = 0;
4136                                                         break;
4137                                                 } else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
4138                                                         break;
4139                                                 }
4140                                         } else {
4141                                                 if (SEQ_LEQ(th->th_ack,
4142                                                     tp->snd_recover)) {
4143                                                         tp->t_dupacks = 0;
4144                                                         break;
4145                                                 }
4146                                         }
4147                                         if (tp->t_flags & TF_SENTFIN)
4148                                                 tp->snd_recover = tp->snd_max - 1;
4149                                         else
4150                                                 tp->snd_recover = tp->snd_max;
4151                                         tp->t_timer[TCPT_PTO] = 0;
4152                                         tp->t_rtttime = 0;
4153
4154                                         /*
4155                                          * If the connection has seen pkt
4156                                          * reordering, delay recovery until
4157                                          * it is clear that the packet
4158                                          * was lost.
4159                                          */
4160                                         if (SACK_ENABLED(tp) &&
4161                                             (tp->t_flagsext &
4162                                             (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4163                                             == TF_PKTS_REORDERED &&
4164                                             !IN_FASTRECOVERY(tp) &&
4165                                             tp->t_reorderwin > 0 &&
4166                                             (tp->t_state == TCPS_ESTABLISHED ||
4167                                             tp->t_state == TCPS_FIN_WAIT_1)) {
4168                                                 tp->t_timer[TCPT_DELAYFR] =
4169                                                     OFFSET_FROM_START(tp,
4170                                                     tp->t_reorderwin);
4171                                                 tp->t_flagsext |= TF_DELAY_RECOVERY;
4172                                                 tcpstat.tcps_delay_recovery++;
4173                                                 tcp_ccdbg_trace(tp, th,
4174                                                     TCP_CC_DELAY_FASTRECOVERY);
4175                                                 break;
4176                                         }
4177
4178                                         tcp_rexmt_save_state(tp);
4179                                         /*
4180                                          * If the current tcp cc module has
4181                                          * defined a hook for tasks to run
4182                                          * before entering FR, call it
4183                                          */
4184                                         if (CC_ALGO(tp)->pre_fr != NULL)
4185                                                 CC_ALGO(tp)->pre_fr(tp);
4186                                         ENTER_FASTRECOVERY(tp);
4187                                         tp->t_timer[TCPT_REXMT] = 0;
4188                                         if (TCP_ECN_ENABLED(tp))
4189                                                 tp->ecn_flags |= TE_SENDCWR;
4190
4191                                         if (SACK_ENABLED(tp)) {
4192                                                 tcpstat.tcps_sack_recovery_episode++;
4193                                                 tp->t_sack_recovery_episode++;
4194                                                 tp->sack_newdata = tp->snd_nxt;
4195                                                 tp->snd_cwnd = tp->t_maxseg;
4196                                                 tp->t_flagsext &=
4197                                                     ~TF_CWND_NONVALIDATED;
4198                                                 tcp_ccdbg_trace(tp, th,
4199                                                     TCP_CC_ENTER_FASTRECOVERY);
4200                                                 (void) tcp_output(tp);
4201                                                 goto drop;
4202                                         }
4203                                         tp->snd_nxt = th->th_ack;
4204                                         tp->snd_cwnd = tp->t_maxseg;
4205                                         (void) tcp_output(tp);
4206                                         if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
4207                                                 tcp_cc_adjust_nonvalidated_cwnd(tp);
4208                                         } else {
4209                                                 tp->snd_cwnd = tp->snd_ssthresh +
4210                                                      tp->t_maxseg * tp->t_dupacks;
4211                                         }
4212                                         if (SEQ_GT(onxt, tp->snd_nxt))
4213                                                 tp->snd_nxt = onxt;
4214                                         tcp_ccdbg_trace(tp, th,
4215                                             TCP_CC_ENTER_FASTRECOVERY);
4216                                         goto drop;
4217                                 } else if (limited_txmt &&
4218                                         ALLOW_LIMITED_TRANSMIT(tp) &&
4219                                         (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
4220                                         (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
4221                                         u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
4222
4223                                         /* Use Limited Transmit algorithm on the first two
4224                                          * duplicate acks when there is new data to transmit
4225                                          */
4226                                         tp->snd_cwnd += incr;
4227                                         tcpstat.tcps_limited_txt++;
4228                                         (void) tcp_output(tp);
4229
4230                                         tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
4231
4232                                         /* Reset snd_cwnd back to normal */
4233                                         tp->snd_cwnd -= incr;
4234                                 }
4235                         } else {
4236                                 tp->t_dupacks = 0;
4237                                 tp->t_rexmtthresh = tcprexmtthresh;
4238                         }
4239                         break;
4240                 }
4241                 /*
4242                  * If the congestion window was inflated to account
4243                  * for the other side's cached packets, retract it.
4244                  */
4245                 if (IN_FASTRECOVERY(tp)) {
4246                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4247                                 /*
4248                                  * If we received an ECE and entered
4249                                  * recovery, the subsequent ACKs should
4250                                  * not be treated as partial acks.
4251                                  */
4252                                 if (tp->ecn_flags & TE_INRECOVERY)
4253                                         goto process_ACK;
4254
4255                                 if (SACK_ENABLED(tp))
4256                                         tcp_sack_partialack(tp, th);
4257                                 else
4258                                         tcp_newreno_partial_ack(tp, th);
4259                                 tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
4260                         } else {
4261                                 EXIT_FASTRECOVERY(tp);
4262                                 if (CC_ALGO(tp)->post_fr != NULL)
4263                                         CC_ALGO(tp)->post_fr(tp, th);
4264                                 tp->t_pipeack = 0;
4265                                 tcp_clear_pipeack_state(tp);
4266                                 tcp_ccdbg_trace(tp, th,
4267                                     TCP_CC_EXIT_FASTRECOVERY);
4268                         }
4269                 } else if ((tp->t_flagsext &
4270                         (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
4271                         == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
4272                         /*
4273                          * If the ack acknowledges up to snd_recover or if
4274                          * it acknowledges all the snd holes, exit
4275                          * recovery and cancel the timer. Otherwise,
4276                          * this is a partial ack. Wait for recovery timer
4277                          * to enter recovery. The snd_holes have already
4278                          * been updated.
4279                          */
4280                         if (SEQ_GEQ(th->th_ack, tp->snd_recover) ||
4281                             TAILQ_EMPTY(&tp->snd_holes)) {
4282                                 tp->t_timer[TCPT_DELAYFR] = 0;
4283                                 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
4284                                 EXIT_FASTRECOVERY(tp);
4285                                 tcp_ccdbg_trace(tp, th,
4286                                     TCP_CC_EXIT_FASTRECOVERY);
4287                         }
4288                 } else {
4289                         /*
4290                          * We were not in fast recovery. Reset the
4291                          * duplicate ack counter.
4292                          */
4293                         tp->t_dupacks = 0;
4294                         tp->t_rexmtthresh = tcprexmtthresh;
4295                 }
4296
4297
4298                 /*
4299                  * If we reach this point, ACK is not a duplicate,
4300                  *     i.e., it ACKs something we sent.
4301                  */
4302                 if (tp->t_flags & TF_NEEDSYN) {
4303                         /*
4304                          * T/TCP: Connection was half-synchronized, and our
4305                          * SYN has been ACK'd (so connection is now fully
4306                          * synchronized).  Go to non-starred state,
4307                          * increment snd_una for ACK of SYN, and check if
4308                          * we can do window scaling.
4309                          */
4310                         tp->t_flags &= ~TF_NEEDSYN;
4311                         tp->snd_una++;
4312                         /* Do window scaling? */
4313                         if (TCP_WINDOW_SCALE_ENABLED(tp)) {
4314                                 tp->snd_scale = tp->requested_s_scale;
4315                                 tp->rcv_scale = tp->request_r_scale;
4316                         }
4317                 }
4318
4319 process_ACK:
4320                 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
4321                 acked = BYTES_ACKED(th, tp);
4322                 tcpstat.tcps_rcvackpack++;
4323                 tcpstat.tcps_rcvackbyte += acked;
4324
4325                 /*
4326                  * If the last packet was a retransmit, make sure
4327                  * it was not spurious.
4328                  *
4329                  * This will also take care of congestion window
4330                  * adjustment if a last packet was recovered due to a
4331                  * tail loss probe.
4332                  */
4333                 tcp_bad_rexmt_check(tp, th, &to);
4334
4335                 /* Recalculate the RTT */
4336                 tcp_compute_rtt(tp, &to, th);
4337
4338                 /*
4339                  * If all outstanding data is acked, stop retransmit
4340                  * timer and remember to restart (more output or persist).
4341                  * If there is more data to be acked, restart retransmit
4342                  * timer, using current (possibly backed-off) value.
4343                  */
4344                 if (th->th_ack == tp->snd_max) {
4345                         tp->t_timer[TCPT_REXMT] = 0;
4346                         tp->t_timer[TCPT_PTO] = 0;
4347                         needoutput = 1;
4348                 } else if (tp->t_timer[TCPT_PERSIST] == 0)
4349                         tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
4350                             tp->t_rxtcur);
4351
4352                 /*
4353                  * If no data (only SYN) was ACK'd, skip rest of ACK
4354                  * processing.
4355                  */
4356                 if (acked == 0)
4357                         goto step6;
4358
4359                 /*
4360                  * When outgoing data has been acked (except the SYN+data), we
4361                  * mark this connection as "sending good" for TFO.
4362                  */
4363                 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
4364                     !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
4365                     !(th->th_flags & TH_SYN))
4366                         tcp_heuristic_tfo_snd_good(tp);
4367
4368                 /*
4369                  * If TH_ECE is received, make sure that ECN is enabled
4370                  * on that connection and we have sent ECT on data packets.
4371                  */
4372                 if ((thflags & TH_ECE) != 0 && TCP_ECN_ENABLED(tp) &&
4373                     (tp->ecn_flags & TE_SENDIPECT)) {
4374                         /*
4375                          * Reduce the congestion window if we haven't
4376                          * done so.
4377                          */
4378                         if (!IN_FASTRECOVERY(tp)) {
4379                                 tcp_reduce_congestion_window(tp);
4380                                 tp->ecn_flags |= (TE_INRECOVERY|TE_SENDCWR);
4381                                 /*
4382                                  * Also note that the connection received
4383                                  * ECE atleast once
4384                                  */
4385                                 tp->ecn_flags |= TE_RECV_ECN_ECE;
4386                                 INP_INC_IFNET_STAT(inp, ecn_recv_ece);
4387                                 tcpstat.tcps_ecn_recv_ece++;
4388                                 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
4389                         }
4390                 }
4391
4392                 /*
4393                  * When new data is acked, open the congestion window.
4394                  * The specifics of how this is achieved are up to the
4395                  * congestion control algorithm in use for this connection.
4396                  *
4397                  * The calculations in this function assume that snd_una is
4398                  * not updated yet.
4399                  */
4400                 if (!IN_FASTRECOVERY(tp)) {
4401                         if (CC_ALGO(tp)->ack_rcvd != NULL)
4402                                 CC_ALGO(tp)->ack_rcvd(tp, th);
4403                         tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
4404                 }
4405                 if (acked > so->so_snd.sb_cc) {
4406                         tp->snd_wnd -= so->so_snd.sb_cc;
4407                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
4408                         if (so->so_flags & SOF_ENABLE_MSGS) {
4409                                 so->so_msg_state->msg_serial_bytes -=
4410                                         (int)so->so_snd.sb_cc;
4411                         }
4412                         ourfinisacked = 1;
4413                 } else {
4414                         sbdrop(&so->so_snd, acked);
4415                         if (so->so_flags & SOF_ENABLE_MSGS) {
4416                                 so->so_msg_state->msg_serial_bytes -=
4417                                         acked;
4418                         }
4419                         tcp_sbsnd_trim(&so->so_snd);
4420                         tp->snd_wnd -= acked;
4421                         ourfinisacked = 0;
4422                 }
4423                 /* detect una wraparound */
4424                 if ( !IN_FASTRECOVERY(tp) &&
4425                     SEQ_GT(tp->snd_una, tp->snd_recover) &&
4426                     SEQ_LEQ(th->th_ack, tp->snd_recover))
4427                         tp->snd_recover = th->th_ack - 1;
4428
4429                 if (IN_FASTRECOVERY(tp) &&
4430                     SEQ_GEQ(th->th_ack, tp->snd_recover))
4431                         EXIT_FASTRECOVERY(tp);
4432
4433                 tp->snd_una = th->th_ack;
4434                 if (SACK_ENABLED(tp)) {
4435                         if (SEQ_GT(tp->snd_una, tp->snd_recover))
4436                                 tp->snd_recover = tp->snd_una;
4437                 }
4438                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
4439                         tp->snd_nxt = tp->snd_una;
4440                 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
4441                     !TCP_DSACK_SEQ_IN_WINDOW(tp, tp->t_dsack_lastuna,
4442                     tp->snd_una))
4443                         tcp_rxtseg_clean(tp);
4444                 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
4445                         tp->t_bwmeas != NULL)
4446                         tcp_bwmeas_check(tp);
4447
4448                 /*
4449                  * sowwakeup must happen after snd_una, et al. are updated so that
4450                  * the sequence numbers are in sync with so_snd
4451                  */
4452                 sowwakeup(so);
4453
4454                 switch (tp->t_state) {
4455
4456                 /*
4457                  * In FIN_WAIT_1 STATE in addition to the processing
4458                  * for the ESTABLISHED state if our FIN is now acknowledged
4459                  * then enter FIN_WAIT_2.
4460                  */
4461                 case TCPS_FIN_WAIT_1:
4462                         if (ourfinisacked) {
4463                                 /*
4464                                  * If we can't receive any more
4465                                  * data, then closing user can proceed.
4466                                  * Starting the TCPT_2MSL timer is contrary to the
4467                                  * specification, but if we don't get a FIN
4468                                  * we'll hang forever.
4469                                  */
4470                                 if (so->so_state & SS_CANTRCVMORE) {
4471                                         tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
4472                                                 TCP_CONN_MAXIDLE(tp));
4473                                         isconnected = FALSE;
4474                                         isdisconnected = TRUE;
4475                                 }
4476                                 DTRACE_TCP4(state__change, void, NULL,
4477                                         struct inpcb *, inp,
4478                                         struct tcpcb *, tp,
4479                                         int32_t, TCPS_FIN_WAIT_2);
4480                                 tp->t_state = TCPS_FIN_WAIT_2;
4481                                 /* fall through and make sure we also recognize
4482                                  * data ACKed with the FIN
4483                                  */
4484                         }
4485                         tp->t_flags |= TF_ACKNOW;
4486                         break;
4487
4488                 /*
4489                  * In CLOSING STATE in addition to the processing for
4490                  * the ESTABLISHED state if the ACK acknowledges our FIN
4491                  * then enter the TIME-WAIT state, otherwise ignore
4492                  * the segment.
4493                  */
4494                 case TCPS_CLOSING:
4495                         if (ourfinisacked) {
4496                                 DTRACE_TCP4(state__change, void, NULL,
4497                                         struct inpcb *, inp,
4498                                         struct tcpcb *, tp,
4499                                         int32_t, TCPS_TIME_WAIT);
4500                                 tp->t_state = TCPS_TIME_WAIT;
4501                                 tcp_canceltimers(tp);
4502                                 if (tp->t_flagsext & TF_NOTIMEWAIT) {
4503                                         tp->t_flags |= TF_CLOSING;
4504                                 } else {
4505                                         add_to_time_wait(tp, 2 * tcp_msl);
4506                                 }
4507                                 isconnected = FALSE;
4508                                 isdisconnected = TRUE;
4509                         }
4510                         tp->t_flags |= TF_ACKNOW;
4511                         break;
4512
4513                 /*
4514                  * In LAST_ACK, we may still be waiting for data to drain
4515                  * and/or to be acked, as well as for the ack of our FIN.
4516                  * If our FIN is now acknowledged, delete the TCB,
4517                  * enter the closed state and return.
4518                  */
4519                 case TCPS_LAST_ACK:
4520                         if (ourfinisacked) {
4521                                 tp = tcp_close(tp);
4522                                 goto drop;
4523                         }
4524                         break;
4525
4526                 /*
4527                  * In TIME_WAIT state the only thing that should arrive
4528                  * is a retransmission of the remote FIN.  Acknowledge
4529                  * it and restart the finack timer.
4530                  */
4531                 case TCPS_TIME_WAIT:
4532                         add_to_time_wait(tp, 2 * tcp_msl);
4533                         goto dropafterack;
4534                 }
4535
4536                 /*
4537                  * If there is a SACK option on the ACK and we
4538                  * haven't seen any duplicate acks before, count
4539                  * it as a duplicate ack even if the cumulative
4540                  * ack is advanced. If the receiver delayed an
4541                  * ack and detected loss afterwards, then the ack
4542                  * will advance cumulative ack and will also have
4543                  * a SACK option. So counting it as one duplicate
4544                  * ack is ok.
4545                  */
4546                 if (sack_ackadv == 1 &&
4547                     tp->t_state == TCPS_ESTABLISHED &&
4548                     SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
4549                     to.to_nsacks > 0 && tp->t_dupacks == 0 &&
4550                     SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
4551                     !(tp->t_flagsext & TF_PKTS_REORDERED)) {
4552                         tcpstat.tcps_sack_ackadv++;
4553                         goto process_dupack;
4554                 }
4555         }
4556
4557 step6:
4558         /*
4559          * Update window information.
4560          * Don't look at window if no ACK: TAC's send garbage on first SYN.
4561          */
4562         if ((thflags & TH_ACK) &&
4563             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4564             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4565              (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4566                 /* keep track of pure window updates */
4567                 if (tlen == 0 &&
4568                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4569                         tcpstat.tcps_rcvwinupd++;
4570                 tp->snd_wnd = tiwin;
4571                 tp->snd_wl1 = th->th_seq;
4572                 tp->snd_wl2 = th->th_ack;
4573                 if (tp->snd_wnd > tp->max_sndwnd)
4574                         tp->max_sndwnd = tp->snd_wnd;
4575                 needoutput = 1;
4576         }
4577
4578         /*
4579          * Process segments with URG.
4580          */
4581         if ((thflags & TH_URG) && th->th_urp &&
4582             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4583                 /*
4584                  * This is a kludge, but if we receive and accept
4585                  * random urgent pointers, we'll crash in
4586                  * soreceive.  It's hard to imagine someone
4587                  * actually wanting to send this much urgent data.
4588                  */
4589                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
4590                         th->th_urp = 0;                 /* XXX */
4591                         thflags &= ~TH_URG;             /* XXX */
4592                         goto dodata;                    /* XXX */
4593                 }
4594                 /*
4595                  * If this segment advances the known urgent pointer,
4596                  * then mark the data stream.  This should not happen
4597                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
4598                  * a FIN has been received from the remote side.
4599                  * In these states we ignore the URG.
4600                  *
4601                  * According to RFC961 (Assigned Protocols),
4602                  * the urgent pointer points to the last octet
4603                  * of urgent data.  We continue, however,
4604                  * to consider it to indicate the first octet
4605                  * of data past the urgent section as the original
4606                  * spec states (in one of two places).
4607                  */
4608                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
4609                         tp->rcv_up = th->th_seq + th->th_urp;
4610                         so->so_oobmark = so->so_rcv.sb_cc +
4611                             (tp->rcv_up - tp->rcv_nxt) - 1;
4612                         if (so->so_oobmark == 0) {
4613                                 so->so_state |= SS_RCVATMARK;
4614                                 postevent(so, 0, EV_OOB);
4615                         }
4616                         sohasoutofband(so);
4617                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4618                 }
4619                 /*
4620                  * Remove out of band data so doesn't get presented to user.
4621                  * This can happen independent of advancing the URG pointer,
4622                  * but if two URG's are pending at once, some out-of-band
4623                  * data may creep in... ick.
4624                  */
4625                 if (th->th_urp <= (u_int32_t)tlen
4626 #if SO_OOBINLINE
4627                      && (so->so_options & SO_OOBINLINE) == 0
4628 #endif
4629                      )
4630                         tcp_pulloutofband(so, th, m,
4631                                 drop_hdrlen);   /* hdr drop is delayed */
4632         } else {
4633                 /*
4634                  * If no out of band data is expected,
4635                  * pull receive urgent pointer along
4636                  * with the receive window.
4637                  */
4638                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4639                         tp->rcv_up = tp->rcv_nxt;
4640         }
4641 dodata:
4642
4643         /* Set socket's connect or disconnect state correctly before doing data.
4644          * The following might unlock the socket if there is an upcall or a socket
4645          * filter.
4646          */
4647         if (isconnected) {
4648                 soisconnected(so);
4649         } else if (isdisconnected) {
4650                 soisdisconnected(so);
4651         }
4652
4653         /* Let's check the state of pcb just to make sure that it did not get closed
4654          * when we unlocked above
4655          */
4656         if (inp->inp_state == INPCB_STATE_DEAD) {
4657                 /* Just drop the packet that we are processing and return */
4658                 goto drop;
4659         }
4660
4661         /*
4662          * Process the segment text, merging it into the TCP sequencing queue,
4663          * and arranging for acknowledgment of receipt if necessary.
4664          * This process logically involves adjusting tp->rcv_wnd as data
4665          * is presented to the user (this happens in tcp_usrreq.c,
4666          * case PRU_RCVD).  If a FIN has already been received on this
4667          * connection then we just ignore the text.
4668          *
4669          * If we are in SYN-received state and got a valid TFO cookie, we want
4670          * to process the data.
4671          */
4672         if ((tlen || (thflags & TH_FIN)) &&
4673             TCPS_HAVERCVDFIN(tp->t_state) == 0 &&
4674             (TCPS_HAVEESTABLISHED(tp->t_state) ||
4675              (tp->t_state == TCPS_SYN_RECEIVED &&
4676              (tp->t_tfo_flags & TFO_F_COOKIE_VALID)))) {
4677                 tcp_seq save_start = th->th_seq;
4678                 tcp_seq save_end = th->th_seq + tlen;
4679                 m_adj(m, drop_hdrlen);  /* delayed header drop */
4680                 /*
4681                  * Insert segment which includes th into TCP reassembly queue
4682                  * with control block tp.  Set thflags to whether reassembly now
4683                  * includes a segment with FIN.  This handles the common case
4684                  * inline (segment is the next to be received on an established
4685                  * connection, and the queue is empty), avoiding linkage into
4686                  * and removal from the queue and repetition of various
4687                  * conversions.
4688                  * Set DELACK for segments received in order, but ack
4689                  * immediately when segments are out of order (so
4690                  * fast retransmit can work).
4691                  */
4692                 if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
4693                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4694                         /*
4695                          * Calculate the RTT on the receiver only if the
4696                          * connection is in streaming mode and the last
4697                          * packet was not an end-of-write
4698                          */
4699                         if ((tp->t_flags & TF_STRETCHACK) &&
4700                                 !(tp->t_flagsext & TF_STREAMEOW))
4701                                 tcp_compute_rtt(tp, &to, th);
4702
4703                         if (DELAY_ACK(tp, th) &&
4704                                 ((tp->t_flags & TF_ACKNOW) == 0) ) {
4705                                 if ((tp->t_flags & TF_DELACK) == 0) {
4706                                         tp->t_flags |= TF_DELACK;
4707                                         tp->t_timer[TCPT_DELACK] =
4708                                                 OFFSET_FROM_START(tp, tcp_delack);
4709                                 }
4710                         }
4711                         else {
4712                                 tp->t_flags |= TF_ACKNOW;
4713                         }
4714                         tp->rcv_nxt += tlen;
4715                         thflags = th->th_flags & TH_FIN;
4716                         TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
4717                         tcpstat.tcps_rcvbyte += tlen;
4718                         if (nstat_collect) {
4719                                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
4720                                         INP_ADD_STAT(inp, cell, wifi, wired,
4721                                             rxpackets, m->m_pkthdr.lro_npkts);
4722                                 } else {
4723                                         INP_ADD_STAT(inp, cell, wifi, wired,
4724                                             rxpackets, 1);
4725                                 }
4726                                 INP_ADD_STAT(inp, cell, wifi, wired,
4727                                     rxbytes, tlen);
4728                         }
4729                         tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
4730                         so_recv_data_stat(so, m, drop_hdrlen);
4731
4732                         if (sbappendstream_rcvdemux(so, m,
4733                             th->th_seq - (tp->irs + 1), 0)) {
4734                                 sorwakeup(so);
4735                         }
4736                 } else {
4737                         thflags = tcp_reass(tp, th, &tlen, m, ifp);
4738                         tp->t_flags |= TF_ACKNOW;
4739                 }
4740
4741                 if (tlen > 0 && SACK_ENABLED(tp))
4742                         tcp_update_sack_list(tp, save_start, save_end);
4743
4744                 tcp_adaptive_rwtimo_check(tp, tlen);
4745
4746                 if (tlen > 0)
4747                         tcp_tfo_rcv_data(tp);
4748
4749                 if (tp->t_flags & TF_DELACK)
4750                 {
4751 #if INET6
4752                         if (isipv6) {
4753                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4754                                         (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
4755                                         th->th_seq, th->th_ack, th->th_win);
4756                         }
4757                         else
4758 #endif
4759                         {
4760                                 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4761                                         (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
4762                                         th->th_seq, th->th_ack, th->th_win);
4763                         }
4764
4765                 }
4766         } else {
4767                 m_freem(m);
4768                 thflags &= ~TH_FIN;
4769         }
4770
4771         /*
4772          * If FIN is received ACK the FIN and let the user know
4773          * that the connection is closing.
4774          */
4775         if (thflags & TH_FIN) {
4776                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4777                         socantrcvmore(so);
4778                         postevent(so, 0, EV_FIN);
4779                         /*
4780                          * If connection is half-synchronized
4781                          * (ie NEEDSYN flag on) then delay ACK,
4782                          * so it may be piggybacked when SYN is sent.
4783                          * Otherwise, since we received a FIN then no
4784                          * more input can be expected, send ACK now.
4785                          */
4786                         TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4787                         if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
4788                                 if ((tp->t_flags & TF_DELACK) == 0) {
4789                                         tp->t_flags |= TF_DELACK;
4790                                         tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
4791                                 }
4792                         } else {
4793                                 tp->t_flags |= TF_ACKNOW;
4794                         }
4795                         tp->rcv_nxt++;
4796                 }
4797                 switch (tp->t_state) {
4798
4799                 /*
4800                  * In SYN_RECEIVED and ESTABLISHED STATES
4801                  * enter the CLOSE_WAIT state.
4802                  */
4803                 case TCPS_SYN_RECEIVED:
4804                         tp->t_starttime = tcp_now;
4805                 case TCPS_ESTABLISHED:
4806                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4807                                 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
4808                         tp->t_state = TCPS_CLOSE_WAIT;
4809                         break;
4810
4811                 /*
4812                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
4813                  * enter the CLOSING state.
4814                  */
4815                 case TCPS_FIN_WAIT_1:
4816                         DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4817                                 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
4818                         tp->t_state = TCPS_CLOSING;
4819                         break;
4820
4821                 /*
4822                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
4823                  * starting the time-wait timer, turning off the other
4824                  * standard timers.
4825                  */
4826                 case TCPS_FIN_WAIT_2:
4827                         DTRACE_TCP4(state__change, void, NULL,
4828                                 struct inpcb *, inp,
4829                                 struct tcpcb *, tp,
4830                                 int32_t, TCPS_TIME_WAIT);
4831                         tp->t_state = TCPS_TIME_WAIT;
4832                         tcp_canceltimers(tp);
4833                         tp->t_flags |= TF_ACKNOW;
4834                         if (tp->t_flagsext & TF_NOTIMEWAIT) {
4835                                 tp->t_flags |= TF_CLOSING;
4836                         } else {
4837                                 add_to_time_wait(tp, 2 * tcp_msl);
4838                         }
4839                         soisdisconnected(so);
4840                         break;
4841
4842                 /*
4843                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
4844                  */
4845                 case TCPS_TIME_WAIT:
4846                         add_to_time_wait(tp, 2 * tcp_msl);
4847                         break;
4848                 }
4849         }
4850 #if TCPDEBUG
4851         if (so->so_options & SO_DEBUG)
4852                 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
4853                           &tcp_savetcp, 0);
4854 #endif
4855
4856         /*
4857          * Return any desired output.
4858          */
4859         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
4860                 (void) tcp_output(tp);
4861         }
4862
4863         tcp_check_timer_state(tp);
4864
4865
4866         tcp_unlock(so, 1, 0);
4867         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4868         return;
4869
4870 dropafterack:
4871         /*
4872          * Generate an ACK dropping incoming segment if it occupies
4873          * sequence space, where the ACK reflects our state.
4874          *
4875          * We can now skip the test for the RST flag since all
4876          * paths to this code happen after packets containing
4877          * RST have been dropped.
4878          *
4879          * In the SYN-RECEIVED state, don't send an ACK unless the
4880          * segment we received passes the SYN-RECEIVED ACK test.
4881          * If it fails send a RST.  This breaks the loop in the
4882          * "LAND" DoS attack, and also prevents an ACK storm
4883          * between two listening ports that have been sent forged
4884          * SYN segments, each with the source address of the other.
4885          */
4886         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
4887             (SEQ_GT(tp->snd_una, th->th_ack) ||
4888              SEQ_GT(th->th_ack, tp->snd_max)) ) {
4889                 rstreason = BANDLIM_RST_OPENPORT;
4890                 IF_TCP_STATINC(ifp, dospacket);
4891                 goto dropwithreset;
4892         }
4893 #if TCPDEBUG
4894         if (so->so_options & SO_DEBUG)
4895                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4896                           &tcp_savetcp, 0);
4897 #endif
4898         m_freem(m);
4899         tp->t_flags |= TF_ACKNOW;
4900         (void) tcp_output(tp);
4901
4902         /* Don't need to check timer state as we should have done it during tcp_output */
4903         tcp_unlock(so, 1, 0);
4904         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4905         return;
4906 dropwithresetnosock:
4907         nosock = 1;
4908 dropwithreset:
4909         /*
4910          * Generate a RST, dropping incoming segment.
4911          * Make ACK acceptable to originator of segment.
4912          * Don't bother to respond if destination was broadcast/multicast.
4913          */
4914         if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
4915                 goto drop;
4916 #if INET6
4917         if (isipv6) {
4918                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
4919                     IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
4920                         goto drop;
4921         } else
4922 #endif /* INET6 */
4923         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
4924             IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
4925             ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
4926             in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
4927                 goto drop;
4928         /* IPv6 anycast check is done at tcp6_input() */
4929
4930         /*
4931          * Perform bandwidth limiting.
4932          */
4933 #if ICMP_BANDLIM
4934         if (badport_bandlim(rstreason) < 0)
4935                 goto drop;
4936 #endif
4937
4938 #if TCPDEBUG
4939         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4940                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4941                           &tcp_savetcp, 0);
4942 #endif
4943         bzero(&tra, sizeof(tra));
4944         tra.ifscope = ifscope;
4945         tra.awdl_unrestricted = 1;
4946         if (thflags & TH_ACK)
4947                 /* mtod() below is safe as long as hdr dropping is delayed */
4948                 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
4949                     TH_RST, &tra);
4950         else {
4951                 if (thflags & TH_SYN)
4952                         tlen++;
4953                 /* mtod() below is safe as long as hdr dropping is delayed */
4954                 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
4955                     (tcp_seq)0, TH_RST|TH_ACK, &tra);
4956         }
4957         /* destroy temporarily created socket */
4958         if (dropsocket) {
4959                 (void) soabort(so);
4960                 tcp_unlock(so, 1, 0);
4961         } else if ((inp != NULL) && (nosock == 0)) {
4962                 tcp_unlock(so, 1, 0);
4963         }
4964         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4965         return;
4966 dropnosock:
4967         nosock = 1;
4968 drop:
4969         /*
4970          * Drop space held by incoming segment and return.
4971          */
4972 #if TCPDEBUG
4973         if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4974                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4975                           &tcp_savetcp, 0);
4976 #endif
4977         m_freem(m);
4978         /* destroy temporarily created socket */
4979         if (dropsocket) {
4980                 (void) soabort(so);
4981                 tcp_unlock(so, 1, 0);
4982         }
4983         else if (nosock == 0) {
4984                 tcp_unlock(so, 1, 0);
4985         }
4986         KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4987         return;
4988 }
4989
4990 /*
4991  * Parse TCP options and place in tcpopt.
4992  */
4993 static void
4994 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
4995     struct tcpopt *to)
4996 {
4997         u_short mss = 0;
4998         int opt, optlen;
4999
5000         for (; cnt > 0; cnt -= optlen, cp += optlen) {
5001                 opt = cp[0];
5002                 if (opt == TCPOPT_EOL)
5003                         break;
5004                 if (opt == TCPOPT_NOP)
5005                         optlen = 1;
5006                 else {
5007                         if (cnt < 2)
5008                                 break;
5009                         optlen = cp[1];
5010                         if (optlen < 2 || optlen > cnt)
5011                                 break;
5012                 }
5013                 switch (opt) {
5014
5015                 default:
5016                         continue;
5017
5018                 case TCPOPT_MAXSEG:
5019                         if (optlen != TCPOLEN_MAXSEG)
5020                                 continue;
5021                         if (!(th->th_flags & TH_SYN))
5022                                 continue;
5023                         bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
5024                         NTOHS(mss);
5025                         to->to_mss = mss;
5026                         to->to_flags |= TOF_MSS;
5027                         break;
5028
5029                 case TCPOPT_WINDOW:
5030                         if (optlen != TCPOLEN_WINDOW)
5031                                 continue;
5032                         if (!(th->th_flags & TH_SYN))
5033                                 continue;
5034                         to->to_flags |= TOF_SCALE;
5035                         to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
5036                         break;
5037
5038                 case TCPOPT_TIMESTAMP:
5039                         if (optlen != TCPOLEN_TIMESTAMP)
5040                                 continue;
5041                         to->to_flags |= TOF_TS;
5042                         bcopy((char *)cp + 2,
5043                             (char *)&to->to_tsval, sizeof(to->to_tsval));
5044                         NTOHL(to->to_tsval);
5045                         bcopy((char *)cp + 6,
5046                             (char *)&to->to_tsecr, sizeof(to->to_tsecr));
5047                         NTOHL(to->to_tsecr);
5048                         /* Re-enable sending Timestamps if we received them */
5049                         if (!(tp->t_flags & TF_REQ_TSTMP) &&
5050                             tcp_do_rfc1323 == 1)
5051                                 tp->t_flags |= TF_REQ_TSTMP;
5052                         break;
5053                 case TCPOPT_SACK_PERMITTED:
5054                         if (!tcp_do_sack ||
5055                             optlen != TCPOLEN_SACK_PERMITTED)
5056                                 continue;
5057                         if (th->th_flags & TH_SYN)
5058                                 to->to_flags |= TOF_SACK;
5059                         break;
5060                 case TCPOPT_SACK:
5061                         if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
5062                                 continue;
5063                         to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
5064                         to->to_sacks = cp + 2;
5065                         tcpstat.tcps_sack_rcv_blocks++;
5066
5067                         break;
5068                 case TCPOPT_FASTOPEN:
5069                         if (optlen == TCPOLEN_FASTOPEN_REQ) {
5070                                 if (tp->t_state != TCPS_LISTEN)
5071                                         continue;
5072
5073                                 to->to_flags |= TOF_TFOREQ;
5074                         } else {
5075                                 if (optlen < TCPOLEN_FASTOPEN_REQ ||
5076                                     (optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX ||
5077                                     (optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN)
5078                                         continue;
5079                                 if (tp->t_state != TCPS_LISTEN &&
5080                                     tp->t_state != TCPS_SYN_SENT)
5081                                         continue;
5082
5083                                 to->to_flags |= TOF_TFO;
5084                                 to->to_tfo = cp + 1;
5085                         }
5086
5087                         break;
5088 #if MPTCP
5089                 case TCPOPT_MULTIPATH:
5090                         tcp_do_mptcp_options(tp, cp, th, to, optlen);
5091                         break;
5092 #endif /* MPTCP */
5093                 }
5094         }
5095 }
5096
5097 static void
5098 tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope)
5099 {
5100         if (to->to_flags & TOF_TS) {
5101                 tp->t_flags |= TF_RCVD_TSTMP;
5102                 tp->ts_recent = to->to_tsval;
5103                 tp->ts_recent_age = tcp_now;
5104
5105         }
5106         if (to->to_flags & TOF_MSS)
5107                 tcp_mss(tp, to->to_mss, ifscope);
5108         if (SACK_ENABLED(tp)) {
5109                 if (!(to->to_flags & TOF_SACK))
5110                         tp->t_flagsext &= ~(TF_SACK_ENABLE);
5111                 else
5112                         tp->t_flags |= TF_SACK_PERMIT;
5113         }
5114         if (to->to_flags & TOF_SCALE) {
5115                 tp->t_flags |= TF_RCVD_SCALE;
5116                 tp->requested_s_scale = to->to_requested_s_scale;
5117
5118                 /* Re-enable window scaling, if the option is received */
5119                 if (tp->request_r_scale > 0)
5120                         tp->t_flags |= TF_REQ_SCALE;
5121         }
5122 }
5123
5124 /*
5125  * Pull out of band byte out of a segment so
5126  * it doesn't appear in the user's data queue.
5127  * It is still reflected in the segment length for
5128  * sequencing purposes.
5129  */
5130 static void
5131 tcp_pulloutofband(so, th, m, off)
5132         struct socket *so;
5133         struct tcphdr *th;
5134         register struct mbuf *m;
5135         int off;                /* delayed to be droped hdrlen */
5136 {
5137         int cnt = off + th->th_urp - 1;
5138
5139         while (cnt >= 0) {
5140                 if (m->m_len > cnt) {
5141                         char *cp = mtod(m, caddr_t) + cnt;
5142                         struct tcpcb *tp = sototcpcb(so);
5143
5144                         tp->t_iobc = *cp;
5145                         tp->t_oobflags |= TCPOOB_HAVEDATA;
5146                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
5147                         m->m_len--;
5148                         if (m->m_flags & M_PKTHDR)
5149                                 m->m_pkthdr.len--;
5150                         return;
5151                 }
5152                 cnt -= m->m_len;
5153                 m = m->m_next;
5154                 if (m == 0)
5155                         break;
5156         }
5157         panic("tcp_pulloutofband");
5158 }
5159
5160 uint32_t
5161 get_base_rtt(struct tcpcb *tp)
5162 {
5163         uint32_t base_rtt = 0, i;
5164         for (i = 0; i < N_RTT_BASE; ++i) {
5165                 if (tp->rtt_hist[i] != 0 &&
5166                         (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
5167                         base_rtt = tp->rtt_hist[i];
5168         }
5169         return base_rtt;
5170 }
5171
5172 /* Each value of RTT base represents the minimum RTT seen in a minute.
5173  * We keep upto N_RTT_BASE minutes worth of history.
5174  */
5175 void
5176 update_base_rtt(struct tcpcb *tp, uint32_t rtt)
5177 {
5178         int32_t i, qdelay;
5179         u_int32_t base_rtt;
5180
5181         if (++tp->rtt_count >= rtt_samples_per_slot) {
5182 #if TRAFFIC_MGT
5183                 /*
5184                  * If the recv side is being throttled, check if the
5185                  * current RTT is closer to the base RTT seen in
5186                  * first (recent) two slots. If so, unthrottle the stream.
5187                  */
5188                 if (tp->t_flagsext & TF_RECV_THROTTLE) {
5189                         base_rtt = min(tp->rtt_hist[0], tp->rtt_hist[1]);
5190                         qdelay = tp->t_rttcur - base_rtt;
5191                         if (qdelay < target_qdelay)
5192                                 tp->t_flagsext &= ~(TF_RECV_THROTTLE);
5193                 }
5194 #endif /* TRAFFIC_MGT */
5195
5196                 for (i = (N_RTT_BASE-1); i > 0; --i) {
5197                         tp->rtt_hist[i] = tp->rtt_hist[i-1];
5198                 }
5199                 tp->rtt_hist[0] = rtt;
5200                 tp->rtt_count = 0;
5201         } else {
5202                 tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
5203         }
5204 }
5205
5206 /*
5207  * If we have a timestamp reply, update smoothed RTT. If no timestamp is
5208  * present but transmit timer is running and timed sequence number was
5209  * acked, update smoothed RTT.
5210  *
5211  * If timestamps are supported, a receiver can update RTT even if
5212  * there is no outstanding data.
5213  *
5214  * Some boxes send broken timestamp replies during the SYN+ACK phase,
5215  * ignore timestamps of 0or we could calculate a huge RTT and blow up
5216  * the retransmit timer.
5217  */
5218 static void
5219 tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
5220 {
5221         int rtt = 0;
5222         VERIFY(to != NULL && th != NULL);
5223         if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
5224                 u_int32_t pipe_ack_val;
5225                 rtt = tcp_now - tp->t_rtttime;
5226                 /*
5227                  * Compute pipe ack -- the amount of data acknowledged
5228                  * in the last RTT
5229                  */
5230                 if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) {
5231                         pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna;
5232                         /* Update the sample */
5233                         tp->t_pipeack_sample[tp->t_pipeack_ind++] =
5234                             pipe_ack_val;
5235                         tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT;
5236
5237                         /* Compute the max of the pipeack samples */
5238                         pipe_ack_val = tcp_get_max_pipeack(tp);
5239                         tp->t_pipeack = (pipe_ack_val >
5240                                     TCP_CC_CWND_INIT_BYTES) ?
5241                                     pipe_ack_val : 0;
5242                 }
5243                 /* start another measurement */
5244                 tp->t_rtttime = 0;
5245         }
5246         if (((to->to_flags & TOF_TS) != 0) &&
5247                 (to->to_tsecr != 0) &&
5248                 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
5249                 tcp_xmit_timer(tp, (tcp_now - to->to_tsecr),
5250                         to->to_tsecr, th->th_ack);
5251         } else if (rtt > 0) {
5252                 tcp_xmit_timer(tp, rtt, 0, th->th_ack);
5253         }
5254 }
5255
5256 /*
5257  * Collect new round-trip time estimate
5258  * and update averages and current timeout.
5259  */
5260 static void
5261 tcp_xmit_timer(register struct tcpcb *tp, int rtt,
5262         u_int32_t tsecr, tcp_seq th_ack)
5263 {
5264         register int delta;
5265
5266         if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
5267                 if (SEQ_GT(th_ack, tp->snd_una) &&
5268                     SEQ_LEQ(th_ack, tp->snd_max) &&
5269                     (tsecr == 0 ||
5270                     TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
5271                         /*
5272                          * We received a new ACK after a
5273                          * spurious timeout. Adapt retransmission
5274                          * timer as described in rfc 4015.
5275                          */
5276                         tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
5277                         tp->t_badrexmt_time = 0;
5278                         tp->t_srtt = max(tp->t_srtt_prev, rtt);
5279                         tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
5280                         tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
5281                         tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
5282
5283                         if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5284                                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5285
5286                         goto compute_rto;
5287                 } else {
5288                         return;
5289                 }
5290         }
5291
5292         tcpstat.tcps_rttupdated++;
5293         tp->t_rttupdated++;
5294
5295         if (rtt > 0) {
5296                 tp->t_rttcur = rtt;
5297                 update_base_rtt(tp, rtt);
5298         }
5299
5300         if (tp->t_srtt != 0) {
5301                 /*
5302                  * srtt is stored as fixed point with 5 bits after the
5303                  * binary point (i.e., scaled by 32).  The following magic
5304                  * is equivalent to the smoothing algorithm in rfc793 with
5305                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
5306                  * point).
5307                  *
5308                  * Freebsd adjusts rtt to origin 0 by subtracting 1
5309                  * from the provided rtt value. This was required because
5310                  * of the way t_rtttime was initialised to 1 before.
5311                  * Since we changed t_rtttime to be based on
5312                  * tcp_now, this extra adjustment is not needed.
5313                  */
5314                 delta = (rtt << TCP_DELTA_SHIFT)
5315                         - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
5316
5317                 if ((tp->t_srtt += delta) <= 0)
5318                         tp->t_srtt = 1;
5319
5320                 /*
5321                  * We accumulate a smoothed rtt variance (actually, a
5322                  * smoothed mean difference), then set the retransmit
5323                  * timer to smoothed rtt + 4 times the smoothed variance.
5324                  * rttvar is stored as fixed point with 4 bits after the
5325                  * binary point (scaled by 16).  The following is
5326                  * equivalent to rfc793 smoothing with an alpha of .75
5327                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
5328                  * rfc793's wired-in beta.
5329                  */
5330                 if (delta < 0)
5331                         delta = -delta;
5332                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
5333                 if ((tp->t_rttvar += delta) <= 0)
5334                         tp->t_rttvar = 1;
5335                 if (tp->t_rttbest == 0  ||
5336                         tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
5337                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
5338         } else {
5339                 /*
5340                  * No rtt measurement yet - use the unsmoothed rtt.
5341                  * Set the variance to half the rtt (so our first
5342                  * retransmit happens at 3*rtt).
5343                  */
5344                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
5345                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
5346         }
5347
5348 compute_rto:
5349         nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
5350                 tp->t_rttvar);
5351         tp->t_rxtshift = 0;
5352         tp->t_rxtstart = 0;
5353
5354         /*
5355          * the retransmit should happen at rtt + 4 * rttvar.
5356          * Because of the way we do the smoothing, srtt and rttvar
5357          * will each average +1/2 tick of bias.  When we compute
5358          * the retransmit timer, we want 1/2 tick of rounding and
5359          * 1 extra tick because of +-1/2 tick uncertainty in the
5360          * firing of the timer.  The bias will give us exactly the
5361          * 1.5 tick we need.  But, because the bias is
5362          * statistical, we have to test that we don't drop below
5363          * the minimum feasible timer (which is 2 ticks).
5364          */
5365         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
5366                 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
5367                 TCP_ADD_REXMTSLOP(tp));
5368
5369         /*
5370          * We received an ack for a packet that wasn't retransmitted;
5371          * it is probably safe to discard any error indications we've
5372          * received recently.  This isn't quite right, but close enough
5373          * for now (a route might have failed after we sent a segment,
5374          * and the return path might not be symmetrical).
5375          */
5376         tp->t_softerror = 0;
5377 }
5378
5379 static inline unsigned int
5380 tcp_maxmtu(struct rtentry *rt)
5381 {
5382         unsigned int maxmtu;
5383
5384         RT_LOCK_ASSERT_HELD(rt);
5385         if (rt->rt_rmx.rmx_mtu == 0)
5386                 maxmtu = rt->rt_ifp->if_mtu;
5387         else
5388                 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
5389
5390         return (maxmtu);
5391 }
5392
5393 #if INET6
5394 static inline unsigned int
5395 tcp_maxmtu6(struct rtentry *rt)
5396 {
5397         unsigned int maxmtu;
5398         struct nd_ifinfo *ndi = NULL;
5399
5400         RT_LOCK_ASSERT_HELD(rt);
5401         if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized)
5402                 ndi = NULL;
5403         if (ndi != NULL)
5404                 lck_mtx_lock(&ndi->lock);
5405         if (rt->rt_rmx.rmx_mtu == 0)
5406                 maxmtu = IN6_LINKMTU(rt->rt_ifp);
5407         else
5408                 maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
5409         if (ndi != NULL)
5410                 lck_mtx_unlock(&ndi->lock);
5411
5412         return (maxmtu);
5413 }
5414 #endif
5415
/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window. While looking at the routing entry, we also initialize
 * other path-dependent parameters from pre-set or cached values
 * in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE that this routine is only called when we process an incoming
 * segment, for outgoing segments only tcp_mssopt is called.
 *
 * Parameters:
 *	tp		TCP control block whose t_maxseg, t_maxopd,
 *			t_rttmin, snd_ssthresh and flags are updated.
 *	offer		MSS offered by the peer.  -1 means no SYN has been
 *			received yet (the value cached in the route is used);
 *			0 means the SYN carried no MSS option (the default
 *			MSS is used).
 *	input_ifscope	interface scope handed to the route lookup.
 *
 * Side effects visible here: the socket send/receive buffers may be
 * resized with sbreserve(), the MSS cached in the route is updated, and
 * the congestion-control cwnd_init hook is invoked when present.
 */
void
tcp_mss(tp, offer, input_ifscope)
	struct tcpcb *tp;
	int offer;
	unsigned int input_ifscope;
{
	register struct rtentry *rt;
	struct ifnet *ifp;
	register int rtt, mss;
	u_int32_t bufsize;
	struct inpcb *inp;
	struct socket *so;
	struct rmxp_tao *taop;
	int origoffer = offer;	/* remember the caller's value; offer is rewritten below */
	u_int32_t sb_max_corrected;
	int isnetlocal = 0;
#if INET6
	int isipv6;
	int min_protoh;
#endif

	inp = tp->t_inpcb;
	/* min_protoh is the minimum IP + TCP header size for the address family */
#if INET6
	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
			    : sizeof (struct tcpiphdr);
#else
#define min_protoh  (sizeof (struct tcpiphdr))
#endif

#if INET6
	if (isipv6) {
		rt = tcp_rtlookup6(inp, input_ifscope);
	}
	else
#endif /* INET6 */
	{
		rt = tcp_rtlookup(inp, input_ifscope);
	}
	isnetlocal = (tp->t_flags & TF_LOCAL);

	/* No route: fall back to the protocol default for both limits */
	if (rt == NULL) {
		tp->t_maxopd = tp->t_maxseg =
#if INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;
		return;
	}
	ifp = rt->rt_ifp;
	/*
	 * Slower link window correction:
	 * If a value is specified for slowlink_wsize use it for
	 * PPP links believed to be on a serial modem (speed <128Kbps).
	 * Excludes 9600bps as it is the default value advertised
	 * by pseudo-devices over ppp.
	 */
	if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
	    ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
		tp->t_flags |= TF_SLOWLINK;
	}
	so = inp->inp_socket;

	taop = rmx_taop(rt->rt_rmx);
	/*
	 * Offer == -1 means that we didn't receive SYN yet,
	 * use cached value in that case;
	 */
	if (offer == -1)
		offer = taop->tao_mssopt;
	/*
	 * Offer == 0 means that there was no MSS on the SYN segment,
	 * in this case we use tcp_mssdflt.
	 */
	if (offer == 0)
		offer =
#if INET6
			isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
			tcp_mssdflt;
	else {
		/*
		 * Prevent DoS attack with too small MSS. Round up
		 * to at least minmss.
		 */
		offer = max(offer, tcp_minmss);
		/*
		 * Sanity check: make sure that maxopd will be large
		 * enough to allow some data on segments even if all
		 * the option space is used (40 bytes).  Otherwise
		 * funny things may happen in tcp_output.
		 */
		offer = max(offer, 64);
	}
	/* Cache the (possibly adjusted) offer in the route */
	taop->tao_mssopt = offer;

	/*
	 * While we're here, check if there's an initial rtt
	 * or rttvar.  Convert from the route-table units
	 * to scaled multiples of the slow timeout timer.
	 *
	 * rtt is only used here, to test whether the route has a
	 * non-zero RTT metric.
	 */
	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
		tcp_getrt_rtt(tp, rt);
	} else {
		tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
	}

	/* Start from the route/interface MTU */
#if INET6
	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
#else
	mss = tcp_maxmtu(rt);
#endif

#if NECP
	// At this point, the mss is just the MTU. Adjust if necessary.
	mss = necp_socket_get_effective_mtu(inp, mss);
#endif /* NECP */

	mss -= min_protoh;

	/*
	 * The route has no MTU metric: for destinations that are not
	 * flagged local, do not exceed the default MSS.
	 */
	if (rt->rt_rmx.rmx_mtu == 0) {
#if INET6
		if (isipv6) {
			if (!isnetlocal)
				mss = min(mss, tcp_v6mssdflt);
		} else
#endif /* INET6 */
		if (!isnetlocal)
			mss = min(mss, tcp_mssdflt);
	}

	mss = min(mss, offer);
	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/*
	 * origoffer==-1 indicates, that no segments were received yet.
	 * In this case we just guess.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (origoffer == -1 ||
	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
		mss -= TCPOLEN_TSTAMP_APPA;

#if MPTCP
	mss -= mptcp_adj_mss(tp, FALSE);
#endif /* MPTCP */
	tp->t_maxseg = mss;

	/*
	 * Calculate corrected value for sb_max; ensure to upgrade the
	 * numerator for large sb_max values else it will overflow.
	 */
	sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);

	/*
	 * If there's a pipesize (ie loopback), change the socket
	 * buffer to that size only if it's bigger than the current
	 * sockbuf size.  Make the socket buffers an integral
	 * number of mss units; if the mss is larger than
	 * the socket buffer, decrease the mss.
	 *
	 * NOTE(review): bufsize is unsigned and mss is a signed int, so
	 * the comparison below is done unsigned, and the rounding in the
	 * else branch divides by mss.  This relies on mss being positive
	 * here; no lower bound on the MTU-derived value is visible in
	 * this function -- TODO confirm it is guaranteed elsewhere.
	 */
#if RTV_SPIPE
	bufsize = rt->rt_rmx.rmx_sendpipe;
	if (bufsize < so->so_snd.sb_hiwat)
#endif
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		/* Round the buffer up to a whole number of segments */
		bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
		if (bufsize > sb_max_corrected)
			bufsize = sb_max_corrected;
		(void)sbreserve(&so->so_snd, bufsize);
	}
	/* Store again: mss may have been reduced to the send buffer size */
	tp->t_maxseg = mss;

#if RTV_RPIPE
	bufsize = rt->rt_rmx.rmx_recvpipe;
	if (bufsize < so->so_rcv.sb_hiwat)
#endif
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		/* Round the buffer up to a whole number of segments */
		bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
		if (bufsize > sb_max_corrected)
			bufsize = sb_max_corrected;
		(void)sbreserve(&so->so_rcv, bufsize);
	}

	set_tcp_stream_priority(so);

	if (rt->rt_rmx.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * slow-start threshold, but set the threshold to
		 * no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
		tcpstat.tcps_usedssthresh++;
	} else {
		tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	}

	/*
	 * Set the slow-start flight size depending on whether this
	 * is a local network or not.
	 */
	if (CC_ALGO(tp)->cwnd_init != NULL)
		CC_ALGO(tp)->cwnd_init(tp);

	tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);

	/* Route locked during lookup above */
	RT_UNLOCK(rt);
}
5663
5664 /*
5665  * Determine the MSS option to send on an outgoing SYN.
5666  */
5667 int
5668 tcp_mssopt(tp)
5669         struct tcpcb *tp;
5670 {
5671         struct rtentry *rt;
5672         int mss;
5673 #if INET6
5674         int isipv6;
5675         int min_protoh;
5676 #endif
5677
5678 #if INET6
5679         isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5680         min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5681                             : sizeof (struct tcpiphdr);
5682 #else
5683 #define min_protoh  (sizeof (struct tcpiphdr))
5684 #endif
5685
5686 #if INET6
5687         if (isipv6)
5688                 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
5689         else
5690 #endif /* INET6 */
5691         rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
5692         if (rt == NULL) {
5693                 return (
5694 #if INET6
5695                         isipv6 ? tcp_v6mssdflt :
5696 #endif /* INET6 */
5697                         tcp_mssdflt);
5698         }
5699         /*
5700          * Slower link window correction:
5701          * If a value is specificied for slowlink_wsize use it for PPP links
5702          * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
5703          * it is the default value adversized by pseudo-devices over ppp.
5704          */
5705         if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5706             rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
5707                 tp->t_flags |= TF_SLOWLINK;
5708         }
5709
5710 #if INET6
5711         mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5712 #else
5713         mss = tcp_maxmtu(rt);
5714 #endif
5715         /* Route locked during lookup above */
5716         RT_UNLOCK(rt);
5717
5718 #if NECP
5719         // At this point, the mss is just the MTU. Adjust if necessary.
5720         mss = necp_socket_get_effective_mtu(tp->t_inpcb, mss);
5721 #endif /* NECP */
5722
5723         return (mss - min_protoh);
5724 }
5725
5726 /*
5727  * On a partial ack arrives, force the retransmission of the
5728  * next unacknowledged segment.  Do not clear tp->t_dupacks.
5729  * By setting snd_nxt to th_ack, this forces retransmission timer to
5730  * be started again.
5731  */
5732 static void
5733 tcp_newreno_partial_ack(tp, th)
5734         struct tcpcb *tp;
5735         struct tcphdr *th;
5736 {
5737                 tcp_seq onxt = tp->snd_nxt;
5738                 u_int32_t  ocwnd = tp->snd_cwnd;
5739                 tp->t_timer[TCPT_REXMT] = 0;
5740                 tp->t_timer[TCPT_PTO] = 0;
5741                 tp->t_rtttime = 0;
5742                 tp->snd_nxt = th->th_ack;
5743                 /*
5744                  * Set snd_cwnd to one segment beyond acknowledged offset
5745                  * (tp->snd_una has not yet been updated when this function
5746                  *  is called)
5747                  */
5748                 tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
5749                 tp->t_flags |= TF_ACKNOW;
5750                 (void) tcp_output(tp);
5751                 tp->snd_cwnd = ocwnd;
5752                 if (SEQ_GT(onxt, tp->snd_nxt))
5753                         tp->snd_nxt = onxt;
5754                 /*
5755                  * Partial window deflation.  Relies on fact that tp->snd_una
5756                  * not updated yet.
5757                  */
5758                 if (tp->snd_cwnd > BYTES_ACKED(th, tp))
5759                         tp->snd_cwnd -= BYTES_ACKED(th, tp);
5760                 else
5761                         tp->snd_cwnd = 0;
5762                 tp->snd_cwnd += tp->t_maxseg;
5763
5764 }
5765
/*
 * Drop a random TCP connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return 0, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * The listening TCP socket "head" must be locked.  Note that the lock
 * on head is released and re-acquired inside this function once a
 * victim has been removed from the incomplete queue.
 *
 * Returns 1 when a connection was dropped (head's so_incqlen and so_qlen
 * are decremented), 0 when nothing was dropped.
 */
static int
tcp_dropdropablreq(struct socket *head)
{
	struct socket *so, *sonext;
	unsigned int i, j, qlen;
	/* State kept across calls: random seed and drop-rate bookkeeping */
	static u_int32_t rnd = 0;
	static u_int64_t old_runtime;
	static unsigned int cur_cnt, old_cnt;
	u_int64_t now_sec;
	struct inpcb *inp = NULL;
	struct tcpcb *tp;

	/* Only listening sockets with a non-empty incomplete queue qualify */
	if ((head->so_options & SO_ACCEPTCONN) == 0)
		return (0);

	if (TAILQ_EMPTY(&head->so_incomp))
		return (0);

	/*
	 * Check if there is any socket in the incomp queue
	 * that is closed because of a reset from the peer and is
	 * waiting to be garbage collected. If so, pick that as
	 * the victim
	 */
	TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
		inp = sotoinpcb(so);
		tp = intotcpcb(inp);
		if (tp != NULL && tp->t_state == TCPS_CLOSED &&
		    so->so_head != NULL &&
		    (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
		    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) {
			/*
			 * The listen socket is already locked but we
			 * can lock this socket here without lock ordering
			 * issues because it is in the incomp queue and
			 * is not visible to others.
			 */
			if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
				/* Take a use count for the duration of this function */
				so->so_usecount++;
				goto found_victim;
			} else {
				continue;
			}
		}
	}

	so = TAILQ_FIRST(&head->so_incomp);

	/*
	 * When the clock has advanced since the last update, convert the
	 * calls counted since then into a per-elapsed-unit figure and
	 * restart the count.
	 */
	now_sec = net_uptime();
	if ((i = (now_sec - old_runtime)) != 0) {
		old_runtime = now_sec;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}


	qlen = head->so_incqlen;
	/* Seed the generator on first use */
	if (rnd == 0)
		rnd = RandomULong();

	/*
	 * Once the current or previous call count exceeds the queue
	 * length, skip a pseudo-random number (0..qlen) of entries instead
	 * of starting at the head of the queue.  Walking off the end
	 * leaves so == NULL, which results in returning 0 below.
	 */
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so)
			so = TAILQ_NEXT(so, so_list);
	}
	/* Find a connection that is not already closing (or being served) */
	while (so) {
		inp = (struct inpcb *)so->so_pcb;

		sonext = TAILQ_NEXT(so, so_list);

		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0)
			!= WNT_STOPUSING) {
			/*
			 * Avoid the issue of a socket being accepted
			 * by one input thread and being dropped by
			 * another input thread. If we can't get a hold
			 * on this mutex, then grab the next socket in
			 * line.
			 */
			if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
				so->so_usecount++;
				/*
				 * A use count of exactly 2 here means only the
				 * incomp-queue reference plus the one just taken.
				 */
				if ((so->so_usecount == 2) &&
				    (so->so_state & SS_INCOMP) &&
				    !(so->so_flags & SOF_INCOMP_INPROGRESS))  {
					break;
				} else {
					/*
					 * don't use if being accepted or
					 * used in any other way
					 */
					in_pcb_checkstate(inp, WNT_RELEASE, 1);
					tcp_unlock(so, 1, 0);
				}
			} else {
				/*
				 * do not try to lock the inp in
				 * in_pcb_checkstate because the lock
				 * is already held in some other thread.
				 * Only drop the inp_wntcnt reference.
				 */
				in_pcb_checkstate(inp, WNT_RELEASE, 1);
			}
		}
		so = sonext;

	}
	/* Nothing suitable on the queue */
	if (so == NULL) {
		return (0);
	}

	/* Makes sure socket is still in the right state to be discarded */

	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		tcp_unlock(so, 1, 0);
		return (0);
	}

found_victim:
	/* Reached with the victim's inpcb mutex held and one extra use count */
	if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
		/* do not discard: that socket is being accepted */
		tcp_unlock(so, 1, 0);
		return (0);
	}

	/* Detach the victim from the listener, then release the listener lock */
	TAILQ_REMOVE(&head->so_incomp, so, so_list);
	tcp_unlock(head, 0, 0);

	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	tp = sototcpcb(so);
	so->so_flags |= SOF_OVERFLOW;
	so->so_head = NULL;

	tcp_close(tp);
	if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
		/*
		 * Some one has a wantcnt on this pcb. Since WNT_ACQUIRE
		 * doesn't require a lock, it could have happened while
		 * we are holding the lock. This pcb will have to
		 * be garbage collected later.
		 * Release the reference held for so_incomp queue
		 */
		so->so_usecount--;
		tcp_unlock(so, 1, 0);
	} else {
		/*
		 * Unlock this socket and leave the reference on.
		 * We need to acquire the pcbinfo lock in order to
		 * fully dispose it off
		 */
		tcp_unlock(so, 0, 0);

		lck_rw_lock_exclusive(tcbinfo.ipi_lock);

		tcp_lock(so, 0, 0);
		/* Release the reference held for so_incomp queue */
		so->so_usecount--;

		if (so->so_usecount != 1 ||
		    (inp->inp_wantcnt > 0 &&
		    inp->inp_wantcnt != WNT_STOPUSING)) {
			/*
			 * There is an extra wantcount or usecount
			 * that must have been added when the socket
			 * was unlocked. This socket will have to be
			 * garbage collected later
			 */
			tcp_unlock(so, 1, 0);
		} else {

			/* Drop the reference held for this function */
			so->so_usecount--;

			in_pcbdispose(inp);
		}
		lck_rw_done(tcbinfo.ipi_lock);
	}
	tcpstat.tcps_drops++;

	/* Re-take the listener lock and account for the removed connection */
	tcp_lock(head, 0, 0);
	head->so_incqlen--;
	head->so_qlen--;
	return(1);
}
5961
5962 /* Set background congestion control on a socket */
5963 void
5964 tcp_set_background_cc(struct socket *so)
5965 {
5966         tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
5967 }
5968
5969 /* Set foreground congestion control on a socket */
5970 void
5971 tcp_set_foreground_cc(struct socket *so)
5972 {
5973         if (tcp_use_newreno)
5974                 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
5975         else
5976                 tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
5977 }
5978
5979 static void
5980 tcp_set_new_cc(struct socket *so, uint16_t cc_index)
5981 {
5982         struct inpcb *inp = sotoinpcb(so);
5983         struct tcpcb *tp = intotcpcb(inp);
5984         u_char old_cc_index = 0;
5985         if (tp->tcp_cc_index != cc_index) {
5986
5987                 old_cc_index = tp->tcp_cc_index;
5988
5989                 if (CC_ALGO(tp)->cleanup != NULL)
5990                         CC_ALGO(tp)->cleanup(tp);
5991                 tp->tcp_cc_index = cc_index;
5992
5993                 tcp_cc_allocate_state(tp);
5994
5995                 if (CC_ALGO(tp)->switch_to != NULL)
5996                         CC_ALGO(tp)->switch_to(tp, old_cc_index);
5997
5998                 tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
5999         }
6000 }
6001
6002 void
6003 tcp_set_recv_bg(struct socket *so)
6004 {
6005         if (!IS_TCP_RECV_BG(so))
6006                 so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG;
6007
6008         /* Unset Large Receive Offload on background sockets */
6009         so_set_lro(so, SO_TC_BK);
6010 }
6011
6012 void
6013 tcp_clear_recv_bg(struct socket *so)
6014 {
6015         if (IS_TCP_RECV_BG(so))
6016                 so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
6017
6018         /*
6019          * Set/unset use of Large Receive Offload depending on
6020          * the traffic class
6021          */
6022         so_set_lro(so, so->so_traffic_class);
6023 }
6024
6025 void
6026 inp_fc_unthrottle_tcp(struct inpcb *inp)
6027 {
6028         struct tcpcb *tp = inp->inp_ppcb;
6029         /*
6030          * Back off the slow-start threshold and enter
6031          * congestion avoidance phase
6032          */
6033         if (CC_ALGO(tp)->pre_fr != NULL)
6034                 CC_ALGO(tp)->pre_fr(tp);
6035
6036         tp->snd_cwnd = tp->snd_ssthresh;
6037         tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
6038         /*
6039          * Restart counting for ABC as we changed the
6040          * congestion window just now.
6041          */
6042         tp->t_bytes_acked = 0;
6043
6044         /* Reset retransmit shift as we know that the reason
6045          * for delay in sending a packet is due to flow
6046          * control on the outgoing interface. There is no need
6047          * to backoff retransmit timer.
6048          */
6049         tp->t_rxtshift = 0;
6050         tp->t_rtttime = 0;
6051
6052         /*
6053          * Start the output stream again. Since we are
6054          * not retransmitting data, do not reset the
6055          * retransmit timer or rtt calculation.
6056          */
6057         tcp_output(tp);
6058 }
6059
6060 static int
6061 tcp_getstat SYSCTL_HANDLER_ARGS
6062 {
6063 #pragma unused(oidp, arg1, arg2)
6064
6065         int error;
6066
6067         proc_t caller = PROC_NULL;
6068         proc_t caller_parent = PROC_NULL;
6069         char command_name[MAXCOMLEN + 1] = "";
6070         char parent_name[MAXCOMLEN + 1] = "";
6071
6072         if ((caller = proc_self()) != PROC_NULL) {
6073                 /* get process name */
6074                 strlcpy(command_name, caller->p_comm, sizeof(command_name));
6075
6076                 /* get parent process name if possible */
6077                 if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
6078                         strlcpy(parent_name, caller_parent->p_comm,
6079                             sizeof(parent_name));
6080                         proc_rele(caller_parent);
6081                 }
6082
6083                 if ((escape_str(command_name, strlen(command_name),
6084                     sizeof(command_name)) == 0) &&
6085                     (escape_str(parent_name, strlen(parent_name),
6086                     sizeof(parent_name)) == 0)) {
6087                         kern_asl_msg(LOG_DEBUG, "messagetracer",
6088                             5,
6089                             "com.apple.message.domain",
6090                             "com.apple.kernel.tcpstat", /* 1 */
6091                             "com.apple.message.signature",
6092                             "tcpstat", /* 2 */
6093                             "com.apple.message.signature2", command_name, /* 3 */
6094                             "com.apple.message.signature3", parent_name, /* 4 */
6095                             "com.apple.message.summarize", "YES", /* 5 */
6096                             NULL);
6097                 }
6098         }
6099         if (caller != PROC_NULL)
6100                 proc_rele(caller);
6101
6102         if (req->oldptr == 0) {
6103                 req->oldlen= (size_t)sizeof(struct tcpstat);
6104         }
6105
6106         error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
6107
6108         return (error);
6109
6110 }
6111
6112 /*
6113  * Checksum extended TCP header and data.
6114  */
6115 int
6116 tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
6117 {
6118         struct ifnet *ifp = m->m_pkthdr.rcvif;
6119
6120         switch (af) {
6121         case AF_INET: {
6122                 struct ip *ip = mtod(m, struct ip *);
6123                 struct ipovly *ipov = (struct ipovly *)ip;
6124
6125                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6126                         return (0);
6127
6128                 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6129                     (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6130                     (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6131                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6132                                 th->th_sum = m->m_pkthdr.csum_rx_val;
6133                         } else {
6134                                 uint16_t sum = m->m_pkthdr.csum_rx_val;
6135                                 uint16_t start = m->m_pkthdr.csum_rx_start;
6136
6137                                 /*
6138                                  * Perform 1's complement adjustment of octets
6139                                  * that got included/excluded in the hardware-
6140                                  * calculated checksum value.  Ignore cases
6141                                  * where the value includes or excludes the IP
6142                                  * header span, as the sum for those octets
6143                                  * would already be 0xffff and thus no-op.
6144                                  */
6145                                 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6146                                     start != 0 && (off - start) != off) {
6147 #if BYTE_ORDER != BIG_ENDIAN
6148                                         if (start < off) {
6149                                                 HTONS(ip->ip_len);
6150                                                 HTONS(ip->ip_off);
6151                                         }
6152 #endif
6153                                         /* callee folds in sum */
6154                                         sum = m_adj_sum16(m, start, off, sum);
6155 #if BYTE_ORDER != BIG_ENDIAN
6156                                         if (start < off) {
6157                                                 NTOHS(ip->ip_off);
6158                                                 NTOHS(ip->ip_len);
6159                                         }
6160 #endif
6161                                 }
6162
6163                                 /* callee folds in sum */
6164                                 th->th_sum = in_pseudo(ip->ip_src.s_addr,
6165                                     ip->ip_dst.s_addr,
6166                                     sum + htonl(tlen + IPPROTO_TCP));
6167                         }
6168                         th->th_sum ^= 0xffff;
6169                 } else {
6170                         uint16_t ip_sum;
6171                         int len;
6172                         char b[9];
6173
6174                         bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
6175                         bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
6176                         ip_sum = ipov->ih_len;
6177                         ipov->ih_len = (u_short)tlen;
6178 #if BYTE_ORDER != BIG_ENDIAN
6179                         HTONS(ipov->ih_len);
6180 #endif
6181                         len = sizeof (struct ip) + tlen;
6182                         th->th_sum = in_cksum(m, len);
6183                         bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
6184                         ipov->ih_len = ip_sum;
6185
6186                         tcp_in_cksum_stats(len);
6187                 }
6188                 break;
6189         }
6190 #if INET6
6191         case AF_INET6: {
6192                 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
6193
6194                 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
6195                         return (0);
6196
6197                 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
6198                     (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
6199                     (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
6200                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
6201                                 th->th_sum = m->m_pkthdr.csum_rx_val;
6202                         } else {
6203                                 uint16_t sum = m->m_pkthdr.csum_rx_val;
6204                                 uint16_t start = m->m_pkthdr.csum_rx_start;
6205
6206                                 /*
6207                                  * Perform 1's complement adjustment of octets
6208                                  * that got included/excluded in the hardware-
6209                                  * calculated checksum value.
6210                                  */
6211                                 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
6212                                     start != off) {
6213                                         uint16_t s, d;
6214
6215                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
6216                                                 s = ip6->ip6_src.s6_addr16[1];
6217                                                 ip6->ip6_src.s6_addr16[1] = 0 ;
6218                                         }
6219                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
6220                                                 d = ip6->ip6_dst.s6_addr16[1];
6221                                                 ip6->ip6_dst.s6_addr16[1] = 0;
6222                                         }
6223
6224                                         /* callee folds in sum */
6225                                         sum = m_adj_sum16(m, start, off, sum);
6226
6227                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
6228                                                 ip6->ip6_src.s6_addr16[1] = s;
6229                                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
6230                                                 ip6->ip6_dst.s6_addr16[1] = d;
6231                                 }
6232
6233                                 th->th_sum = in6_pseudo(
6234                                     &ip6->ip6_src, &ip6->ip6_dst,
6235                                     sum + htonl(tlen + IPPROTO_TCP));
6236                         }
6237                         th->th_sum ^= 0xffff;
6238                 } else {
6239                         tcp_in6_cksum_stats(tlen);
6240                         th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
6241                 }
6242                 break;
6243         }
6244 #endif /* INET6 */
6245         default:
6246                 VERIFY(0);
6247                 /* NOTREACHED */
6248         }
6249
6250         if (th->th_sum != 0) {
6251                 tcpstat.tcps_rcvbadsum++;
6252                 IF_TCP_STATINC(ifp, badformat);
6253                 return (-1);
6254         }
6255
6256         return (0);
6257 }
6258
/*
 * Register the "stats" node (TCPCTL_STATS) under _net_inet_tcp: a
 * read-only, struct-typed sysctl whose requests are served by
 * tcp_getstat.
 */
SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat,
    "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
6262
6263 static int
6264 sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
6265 {
6266 #pragma unused(arg1, arg2)
6267
6268         int error, val = tcprexmtthresh;
6269
6270         error = sysctl_handle_int(oidp, &val, 0, req);
6271         if (error || !req->newptr)
6272                 return (error);
6273
6274         /*
6275          * Constrain the number of duplicate ACKs
6276          * to consider for TCP fast retransmit
6277          * to either 2 or 3
6278          */
6279
6280         if (val < 2 || val > 3)
6281                 return (EINVAL);
6282
6283          tcprexmtthresh = val;
6284
6285         return (0);
6286 }
6287
/*
 * Register the "rexmt_thresh" node under _net_inet_tcp: a read/write
 * integer sysctl handled by sysctl_rexmtthresh.  &tcprexmtthresh is
 * passed as arg1, but the handler marks arg1 unused and accesses
 * tcprexmtthresh directly.
 */
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW |
        CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I",
        "Duplicate ACK Threshold for Fast Retransmit");
6291