bsd/netinet/tcp_subr.c

   1 /*
   2  * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_subr.c  8.2 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/callout.h>
  73 #include <sys/kernel.h>
  74 #include <sys/sysctl.h>
  75 #include <sys/malloc.h>
  76 #include <sys/mbuf.h>
  77 #include <sys/domain.h>
  78 #include <sys/proc.h>
  79 #include <sys/kauth.h>
  80 #include <sys/socket.h>
  81 #include <sys/socketvar.h>
  82 #include <sys/protosw.h>
  83 #include <sys/random.h>
  84 #include <sys/syslog.h>
  85 #include <sys/mcache.h>
  86 #include <kern/locks.h>
  87 #include <kern/zalloc.h>
  88
  89 #include <net/route.h>
  90 #include <net/if.h>
  91
  92 #define tcp_minmssoverload fring
  93 #define _IP_VHL
  94 #include <netinet/in.h>
  95 #include <netinet/in_systm.h>
  96 #include <netinet/ip.h>
  97 #include <netinet/ip_icmp.h>
  98 #if INET6
  99 #include <netinet/ip6.h>
 100 #endif
 101 #include <netinet/in_pcb.h>
 102 #if INET6
 103 #include <netinet6/in6_pcb.h>
 104 #endif
 105 #include <netinet/in_var.h>
 106 #include <netinet/ip_var.h>
 107 #include <netinet/icmp_var.h>
 108 #if INET6
 109 #include <netinet6/ip6_var.h>
 110 #endif
 111 #include <netinet/tcp.h>
 112 #include <netinet/tcp_fsm.h>
 113 #include <netinet/tcp_seq.h>
 114 #include <netinet/tcp_timer.h>
 115 #include <netinet/tcp_var.h>
 116 #include <netinet/tcp_cc.h>
 117 #include <kern/thread_call.h>
 118
 119 #if INET6
 120 #include <netinet6/tcp6_var.h>
 121 #endif
 122 #include <netinet/tcpip.h>
 123 #if TCPDEBUG
 124 #include <netinet/tcp_debug.h>
 125 #endif
 126 #include <netinet6/ip6protosw.h>
 127
 128 #if IPSEC
 129 #include <netinet6/ipsec.h>
 130 #if INET6
 131 #include <netinet6/ipsec6.h>
 132 #endif
 133 #endif /*IPSEC*/
 134
 135 #undef tcp_minmssoverload
 136
 137 #if CONFIG_MACF_NET
 138 #include <security/mac_framework.h>
 139 #endif /* MAC_NET */
 140
 141 #include <libkern/crypto/md5.h>
 142 #include <sys/kdebug.h>
 143 #include <mach/sdt.h>
 144
 145 #define DBG_FNC_TCP_CLOSE       NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
 146
 147 extern int tcp_lq_overflow;
 148
 149 /* temporary: for testing */
 150 #if IPSEC
 151 extern int ipsec_bypass;
 152 #endif
 153
 154 int     tcp_mssdflt = TCP_MSS;
 155 SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
 156     &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
 157
 158 #if INET6
 159 int     tcp_v6mssdflt = TCP6_MSS;
 160 SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
 161         CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt , 0,
 162         "Default TCP Maximum Segment Size for IPv6");
 163 #endif
 164
 165 /*
 166  * Minimum MSS we accept and use. This prevents DoS attacks where
 167  * we are forced to a ridiculous low MSS like 20 and send hundreds
 168  * of packets instead of one. The effect scales with the available
 169  * bandwidth and quickly saturates the CPU and network interface
 170  * with packet generation and sending. Set to zero to disable MINMSS
 171  * checking. This setting prevents us from sending too small packets.
 172  */
 173 int     tcp_minmss = TCP_MINMSS;
 174 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
 175     &tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
 176
 177 /*
 178  * Number of TCP segments per second we accept from remote host
 179  * before we start to calculate average segment size. If average
 180  * segment size drops below the minimum TCP MSS we assume a DoS
 181  * attack and reset+drop the connection. Care has to be taken not to
 182  * set this value too small to not kill interactive type connections
 183  * (telnet, SSH) which send many small packets.
 184  */
 185 #ifdef FIX_WORKAROUND_FOR_3894301
 186 __private_extern__ int     tcp_minmssoverload = TCP_MINMSSOVERLOAD;
 187 #else
 188 __private_extern__ int     tcp_minmssoverload = 0;
 189 #endif
 190 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW | CTLFLAG_LOCKED,
 191     &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
 192     "be under the MINMSS Size");
 193
 194 static int      tcp_do_rfc1323 = 1;
 195 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW | CTLFLAG_LOCKED,
 196     &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
 197
 198 // Not used
 199 static int      tcp_do_rfc1644 = 0;
 200 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW | CTLFLAG_LOCKED,
 201     &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
 202
 203 static int      do_tcpdrain = 0;
 204 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, &do_tcpdrain, 0,
 205      "Enable tcp_drain routine for extra help when low on mbufs");
 206
 207 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
 208     &tcbinfo.ipi_count, 0, "Number of active PCBs");
 209
 210 static int      icmp_may_rst = 1;
 211 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp_may_rst, 0,
 212     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 213
 214 static int      tcp_strict_rfc1948 = 0;
 215 SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED,
 216     &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
 217
 218 static int      tcp_isn_reseed_interval = 0;
 219 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW | CTLFLAG_LOCKED,
 220     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 221 static int      tcp_background_io_enabled = 1;
 222 SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW | CTLFLAG_LOCKED,
 223     &tcp_background_io_enabled, 0, "Background IO Enabled");
 224
 225 int     tcp_TCPTV_MIN = 100;    /* 100ms minimum RTT */
 226 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
 227     &tcp_TCPTV_MIN, 0, "min rtt value allowed");
 228
 229 int tcp_rexmt_slop = TCPTV_REXMTSLOP;
 230 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW,
 231         &tcp_rexmt_slop, 0, "Slop added to retransmit timeout");
 232
 233 __private_extern__ int tcp_use_randomport = 0;
 234 SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
 235     &tcp_use_randomport, 0, "Randomize TCP port numbers");
 236
 237 extern struct tcp_cc_algo tcp_cc_newreno;
 238 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets, CTLFLAG_RD | CTLFLAG_LOCKED,
 239         &tcp_cc_newreno.num_sockets, 0, "Number of sockets using newreno");
 240
 241 extern struct tcp_cc_algo tcp_cc_ledbat;
 242 SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, CTLFLAG_RD | CTLFLAG_LOCKED,
 243         &tcp_cc_ledbat.num_sockets, 0, "Number of sockets using background transport");
 244
 245 static void     tcp_cleartaocache(void);
 246 static void     tcp_notify(struct inpcb *, int);
 247 static void     tcp_cc_init(void);
 248
 249 struct zone     *sack_hole_zone;
 250 struct zone     *tcp_reass_zone;
 251
 252 /* The array containing pointers to currently implemented TCP CC algorithms */
 253 struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT];
 254
 255 extern unsigned int total_mb_cnt;
 256 extern unsigned int total_cl_cnt;
 257 extern int sbspace_factor;
 258 extern int tcp_sockthreshold;
 259 extern int slowlink_wsize;      /* window correction for slow links */
 260 extern int path_mtu_discovery;
 261
 262
 263 /*
 264  * Target size of TCP PCB hash tables. Must be a power of two.
 265  *
 266  * Note that this can be overridden by the kernel environment
 267  * variable net.inet.tcp.tcbhashsize
 268  */
 269 #ifndef TCBHASHSIZE
 270 #define TCBHASHSIZE     CONFIG_TCBHASHSIZE
 271 #endif
 272
 273 __private_extern__ int  tcp_tcbhashsize = TCBHASHSIZE;
 274 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
 275      &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 276
 277 /*
 278  * This is the actual shape of what we allocate using the zone
 279  * allocator.  Doing it this way allows us to protect both structures
 280  * using the same generation count, and also eliminates the overhead
 281  * of allocating tcpcbs separately.  By hiding the structure here,
 282  * we avoid changing most of the rest of the code (although it needs
 283  * to be changed, eventually, for greater efficiency).
 284  */
 285 #define ALIGNMENT       32
 286 struct  inp_tp {
 287         struct  inpcb   inp;
 288         struct  tcpcb   tcb __attribute__((aligned(ALIGNMENT)));
 289 };
 290 #undef ALIGNMENT
 291
 292 extern struct   inpcbhead       time_wait_slots[];
 293 extern struct tcptimerlist tcp_timer_list;
 294
 295 int  get_inpcb_str_size(void);
 296 int  get_tcp_str_size(void);
 297
 298 static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
 299
 300 static lck_attr_t *tcp_uptime_mtx_attr = NULL;          /* mutex attributes */
 301 static lck_grp_t *tcp_uptime_mtx_grp = NULL;            /* mutex group definition */
 302 static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL;  /* mutex group attributes */
 303
 304
 305 int  get_inpcb_str_size(void)
 306 {
 307         return sizeof(struct inpcb);
 308 }
 309
 310
 311 int  get_tcp_str_size(void)
 312 {
 313         return sizeof(struct tcpcb);
 314 }
 315
 316 int     tcp_freeq(struct tcpcb *tp);
 317
 318 /*
 319  * Initialize TCP congestion control algorithms.
 320  */
 321
 322 void
 323 tcp_cc_init(void)
 324 {
 325         bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list));
 326         tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno;
 327         tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat;
 328 }
 329
 330 /*
 331  * Tcp initialization
 332  */
 333 void
 334 tcp_init()
 335 {
 336         vm_size_t       str_size;
 337         int i;
 338         struct inpcbinfo *pcbinfo;
 339
 340         tcp_ccgen = 1;
 341         tcp_cleartaocache();
 342
 343         tcp_keepinit = TCPTV_KEEP_INIT;
 344         tcp_keepidle = TCPTV_KEEP_IDLE;
 345         tcp_keepintvl = TCPTV_KEEPINTVL;
 346         tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 347         tcp_msl = TCPTV_MSL;
 348
 349         microuptime(&tcp_uptime);
 350         read_random(&tcp_now, sizeof(tcp_now));
 351         tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */
 352
 353         LIST_INIT(&tcb);
 354         tcbinfo.listhead = &tcb;
 355         pcbinfo = &tcbinfo;
 356         if (!powerof2(tcp_tcbhashsize)) {
 357                 printf("WARNING: TCB hash size not a power of 2\n");
 358                 tcp_tcbhashsize = 512; /* safe default */
 359         }
 360         tcbinfo.hashsize = tcp_tcbhashsize;
 361         tcbinfo.hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.hashmask);
 362         tcbinfo.porthashbase = hashinit(tcp_tcbhashsize, M_PCB,
 363                                         &tcbinfo.porthashmask);
 364         str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t));
 365         tcbinfo.ipi_zone = (void *) zinit(str_size, 120000*str_size, 8192, "tcpcb");
 366         zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE);
 367         zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE);
 368
 369         str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t));
 370         sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone");
 371         zone_change(sack_hole_zone, Z_CALLERACCT, FALSE);
 372         zone_change(sack_hole_zone, Z_EXPAND, TRUE);
 373
 374         tcp_reass_maxseg = nmbclusters / 16;
 375         str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t));
 376         tcp_reass_zone = zinit(str_size, (tcp_reass_maxseg + 1) * str_size,
 377                 0, "tcp_reass_zone");
 378         if (tcp_reass_zone == NULL) {
 379                 panic("%s: failed allocating tcp_reass_zone", __func__);
 380                 /* NOTREACHED */
 381         }
 382         zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE);
 383         zone_change(tcp_reass_zone, Z_EXPAND, TRUE);
 384
 385 #if INET6
 386 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
 387 #else /* INET6 */
 388 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
 389 #endif /* INET6 */
 390         if (max_protohdr < TCP_MINPROTOHDR)
 391                 max_protohdr = TCP_MINPROTOHDR;
 392         if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
 393                 panic("tcp_init");
 394 #undef TCP_MINPROTOHDR
 395
 396         /*
 397          * allocate lock group attribute and group for tcp pcb mutexes
 398          */
 399         pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init();
 400         pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr);
 401
 402         /*
 403          * allocate the lock attribute for tcp pcb mutexes
 404          */
 405         pcbinfo->mtx_attr = lck_attr_alloc_init();
 406
 407         if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) {
 408                 printf("tcp_init: mutex not alloced!\n");
 409                 return; /* pretty much dead if this fails... */
 410         }
 411
 412         for (i=0; i < N_TIME_WAIT_SLOTS; i++) {
 413              LIST_INIT(&time_wait_slots[i]);
 414         }
 415
 416         bzero(&tcp_timer_list, sizeof(tcp_timer_list));
 417         LIST_INIT(&tcp_timer_list.lhead);
 418         /*
 419          * allocate lock group attribute, group and attribute for the tcp timer list
 420          */
 421         tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init();
 422         tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr);
 423         tcp_timer_list.mtx_attr = lck_attr_alloc_init();
 424         if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) {
 425                 panic("failed to allocate memory for tcp_timer_list.mtx\n");
 426         };
 427         tcp_timer_list.fast_quantum = TCP_FASTTIMER_QUANTUM;
 428         tcp_timer_list.slow_quantum = TCP_SLOWTIMER_QUANTUM;
 429         if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) {
 430                 panic("failed to allocate call entry 1 in tcp_init\n");
 431         }
 432
 433         /*
 434          * allocate lock group attribute, group and attribute for tcp_uptime_lock
 435          */
 436         tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init();
 437         tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr);
 438         tcp_uptime_mtx_attr = lck_attr_alloc_init();
 439         tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr);
 440
 441         /* Initialize TCP congestion control algorithms list */
 442         tcp_cc_init();
 443 }
 444
 445 /*
 446  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 447  * tcp_template used to store this data in mbufs, but we now recopy it out
 448  * of the tcpcb each time to conserve mbufs.
 449  */
 450 void
 451 tcp_fillheaders(tp, ip_ptr, tcp_ptr)
 452         struct tcpcb *tp;
 453         void *ip_ptr;
 454         void *tcp_ptr;
 455 {
 456         struct inpcb *inp = tp->t_inpcb;
 457         struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
 458
 459 #if INET6
 460         if ((inp->inp_vflag & INP_IPV6) != 0) {
 461                 struct ip6_hdr *ip6;
 462
 463                 ip6 = (struct ip6_hdr *)ip_ptr;
 464                 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 465                         (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
 466                 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 467                         (IPV6_VERSION & IPV6_VERSION_MASK);
 468                 ip6->ip6_nxt = IPPROTO_TCP;
 469                 ip6->ip6_plen = sizeof(struct tcphdr);
 470                 ip6->ip6_src = inp->in6p_laddr;
 471                 ip6->ip6_dst = inp->in6p_faddr;
 472                 tcp_hdr->th_sum = in6_cksum_phdr(&inp->in6p_laddr,
 473                     &inp->in6p_faddr, htonl(sizeof(struct tcphdr)),
 474                     htonl(IPPROTO_TCP));
 475         } else
 476 #endif
 477         {
 478         struct ip *ip = (struct ip *) ip_ptr;
 479
 480         ip->ip_vhl = IP_VHL_BORING;
 481         ip->ip_tos = 0;
 482         ip->ip_len = 0;
 483         ip->ip_id = 0;
 484         ip->ip_off = 0;
 485         ip->ip_ttl = 0;
 486         ip->ip_sum = 0;
 487         ip->ip_p = IPPROTO_TCP;
 488         ip->ip_src = inp->inp_laddr;
 489         ip->ip_dst = inp->inp_faddr;
 490         tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 491                 htons(sizeof(struct tcphdr) + IPPROTO_TCP));
 492         }
 493
 494         tcp_hdr->th_sport = inp->inp_lport;
 495         tcp_hdr->th_dport = inp->inp_fport;
 496         tcp_hdr->th_seq = 0;
 497         tcp_hdr->th_ack = 0;
 498         tcp_hdr->th_x2 = 0;
 499         tcp_hdr->th_off = 5;
 500         tcp_hdr->th_flags = 0;
 501         tcp_hdr->th_win = 0;
 502         tcp_hdr->th_urp = 0;
 503 }
 504
 505 /*
 506  * Create template to be used to send tcp packets on a connection.
 507  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
 508  * use for this function is in keepalives, which use tcp_respond.
 509  */
 510 struct tcptemp *
 511 tcp_maketemplate(tp)
 512         struct tcpcb *tp;
 513 {
 514         struct mbuf *m;
 515         struct tcptemp *n;
 516
 517         m = m_get(M_DONTWAIT, MT_HEADER);
 518         if (m == NULL)
 519                 return (0);
 520         m->m_len = sizeof(struct tcptemp);
 521         n = mtod(m, struct tcptemp *);
 522
 523         tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
 524         return (n);
 525 }
 526
 527 /*
 528  * Send a single message to the TCP at address specified by
 529  * the given TCP/IP header.  If m == 0, then we make a copy
 530  * of the tcpiphdr at ti and send directly to the addressed host.
 531  * This is used to force keep alive messages out using the TCP
 532  * template for a connection.  If flags are given then we send
 533  * a message back to the TCP which originated the * segment ti,
 534  * and discard the mbuf containing it and any other attached mbufs.
 535  *
 536  * In any case the ack and sequence number of the transmitted
 537  * segment are as specified by the parameters.
 538  *
 539  * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 540  */
 541 void
 542 tcp_respond(
 543         struct tcpcb *tp,
 544         void *ipgen,
 545         register struct tcphdr *th,
 546         register struct mbuf *m,
 547         tcp_seq ack,
 548         tcp_seq seq,
 549         int flags,
 550         unsigned int ifscope,
 551         unsigned int nocell
 552         )
 553 {
 554         register int tlen;
 555         int win = 0;
 556         struct route *ro = 0;
 557         struct route sro;
 558         struct ip *ip;
 559         struct tcphdr *nth;
 560 #if INET6
 561         struct route_in6 *ro6 = 0;
 562         struct route_in6 sro6;
 563         struct ip6_hdr *ip6;
 564         int isipv6;
 565 #endif /* INET6 */
 566         unsigned int outif;
 567
 568 #if INET6
 569         isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
 570         ip6 = ipgen;
 571 #endif /* INET6 */
 572         ip = ipgen;
 573
 574         if (tp) {
 575                 if (!(flags & TH_RST)) {
 576                         win = tcp_sbspace(tp);
 577                         if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale)
 578                                 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
 579                 }
 580 #if INET6
 581                 if (isipv6)
 582                         ro6 = &tp->t_inpcb->in6p_route;
 583                 else
 584 #endif /* INET6 */
 585                 ro = &tp->t_inpcb->inp_route;
 586         } else {
 587 #if INET6
 588                 if (isipv6) {
 589                         ro6 = &sro6;
 590                         bzero(ro6, sizeof *ro6);
 591                 } else
 592 #endif /* INET6 */
 593                 {
 594                         ro = &sro;
 595                         bzero(ro, sizeof *ro);
 596                 }
 597         }
 598         if (m == 0) {
 599                 m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
 600                 if (m == NULL)
 601                         return;
 602                 tlen = 0;
 603                 m->m_data += max_linkhdr;
 604 #if INET6
 605                 if (isipv6) {
 606                         bcopy((caddr_t)ip6, mtod(m, caddr_t),
 607                               sizeof(struct ip6_hdr));
 608                         ip6 = mtod(m, struct ip6_hdr *);
 609                         nth = (struct tcphdr *)(ip6 + 1);
 610                 } else
 611 #endif /* INET6 */
 612                 {
 613                         bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 614                         ip = mtod(m, struct ip *);
 615                         nth = (struct tcphdr *)(ip + 1);
 616                 }
 617                 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 618                 flags = TH_ACK;
 619         } else {
 620                 m_freem(m->m_next);
 621                 m->m_next = 0;
 622                 m->m_data = (caddr_t)ipgen;
 623                 /* m_len is set later */
 624                 tlen = 0;
 625 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 626 #if INET6
 627                 if (isipv6) {
 628                         xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 629                         nth = (struct tcphdr *)(ip6 + 1);
 630                 } else
 631 #endif /* INET6 */
 632               {
 633                 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
 634                 nth = (struct tcphdr *)(ip + 1);
 635               }
 636                 if (th != nth) {
 637                         /*
 638                          * this is usually a case when an extension header
 639                          * exists between the IPv6 header and the
 640                          * TCP header.
 641                          */
 642                         nth->th_sport = th->th_sport;
 643                         nth->th_dport = th->th_dport;
 644                 }
 645                 xchg(nth->th_dport, nth->th_sport, n_short);
 646 #undef xchg
 647         }
 648 #if INET6
 649         if (isipv6) {
 650                 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
 651                                                 tlen));
 652                 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 653         } else
 654 #endif
 655       {
 656         tlen += sizeof (struct tcpiphdr);
 657         ip->ip_len = tlen;
 658         ip->ip_ttl = ip_defttl;
 659       }
 660         m->m_len = tlen;
 661         m->m_pkthdr.len = tlen;
 662         m->m_pkthdr.rcvif = 0;
 663 #if CONFIG_MACF_NET
 664         if (tp != NULL && tp->t_inpcb != NULL) {
 665                 /*
 666                  * Packet is associated with a socket, so allow the
 667                  * label of the response to reflect the socket label.
 668                  */
 669                 mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
 670         } else {
 671                 /*
 672                  * Packet is not associated with a socket, so possibly
 673                  * update the label in place.
 674                  */
 675                 mac_netinet_tcp_reply(m);
 676         }
 677 #endif
 678
 679         nth->th_seq = htonl(seq);
 680         nth->th_ack = htonl(ack);
 681         nth->th_x2 = 0;
 682         nth->th_off = sizeof (struct tcphdr) >> 2;
 683         nth->th_flags = flags;
 684         if (tp)
 685                 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 686         else
 687                 nth->th_win = htons((u_short)win);
 688         nth->th_urp = 0;
 689 #if INET6
 690         if (isipv6) {
 691                 nth->th_sum = 0;
 692                 nth->th_sum = in6_cksum_phdr(&ip6->ip6_src,
 693                     &ip6->ip6_dst, htons((u_short)(tlen - sizeof(struct ip6_hdr))),
 694                                 htonl(IPPROTO_TCP));
 695                 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
 696                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 697                 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
 698                                                ro6 && ro6->ro_rt ?
 699                                                ro6->ro_rt->rt_ifp :
 700                                                NULL);
 701         } else
 702 #endif /* INET6 */
 703         {
 704                 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 705                 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 706                 m->m_pkthdr.csum_flags = CSUM_TCP;
 707                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 708         }
 709 #if TCPDEBUG
 710         if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 711                 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 712 #endif
 713 #if IPSEC
 714         if (ipsec_bypass == 0 && ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
 715                 m_freem(m);
 716                 return;
 717         }
 718 #endif
 719
 720         if (tp != NULL)
 721                 set_packet_tclass(m, tp->t_inpcb->inp_socket, MBUF_TC_UNSPEC, isipv6);
 722
 723 #if INET6
 724         if (isipv6) {
 725                 struct ip6_out_args ip6oa = { ifscope, nocell };
 726
 727                 (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
 728                     NULL, &ip6oa);
 729                 if (ro6->ro_rt != NULL) {
 730                         if (ro6 == &sro6) {
 731                                 rtfree(ro6->ro_rt);
 732                                 ro6->ro_rt = NULL;
 733                         } else if ((outif = ro6->ro_rt->rt_ifp->if_index) !=
 734                             tp->t_inpcb->in6p_last_outif) {
 735                                 tp->t_inpcb->in6p_last_outif = outif;
 736                         }
 737                 }
 738         } else
 739 #endif /* INET6 */
 740         {
 741                 struct ip_out_args ipoa = { ifscope, nocell };
 742
 743                 if (ro != &sro) {
 744                         /* Copy the cached route and take an extra reference */
 745                         inp_route_copyout(tp->t_inpcb, &sro);
 746                 }
 747                 /*
 748                  * For consistency, pass a local route copy.
 749                  */
 750                 (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
 751
 752                 if (ro != &sro) {
 753                         if (sro.ro_rt != NULL &&
 754                             (outif = sro.ro_rt->rt_ifp->if_index) !=
 755                             tp->t_inpcb->inp_last_outif)
 756                                 tp->t_inpcb->inp_last_outif = outif;
 757                         /* Synchronize cached PCB route */
 758                         inp_route_copyin(tp->t_inpcb, &sro);
 759                 } else if (sro.ro_rt != NULL) {
 760                         rtfree(sro.ro_rt);
 761                 }
 762         }
 763 }
 764
 765 /*
 766  * Create a new TCP control block, making an
 767  * empty reassembly queue and hooking it to the argument
 768  * protocol control block.  The `inp' parameter must have
 769  * come from the zone allocator set up in tcp_init().
 770  */
 771 struct tcpcb *
 772 tcp_newtcpcb(inp)
 773         struct inpcb *inp;
 774 {
 775         struct inp_tp *it;
 776         register struct tcpcb *tp;
 777         register struct socket *so = inp->inp_socket;
 778 #if INET6
 779         int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 780 #endif /* INET6 */
 781
 782         calculate_tcp_clock();
 783
 784         if (so->cached_in_sock_layer == 0) {
 785              it = (struct inp_tp *)inp;
 786              tp = &it->tcb;
 787         }
 788         else
 789              tp = (struct tcpcb *) inp->inp_saved_ppcb;
 790
 791         bzero((char *) tp, sizeof(struct tcpcb));
 792         LIST_INIT(&tp->t_segq);
 793         tp->t_maxseg = tp->t_maxopd =
 794 #if INET6
 795                 isipv6 ? tcp_v6mssdflt :
 796 #endif /* INET6 */
 797                 tcp_mssdflt;
 798
 799         if (tcp_do_rfc1323)
 800                 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 801         tp->sack_enable = tcp_do_sack;
 802         TAILQ_INIT(&tp->snd_holes);
 803         tp->t_inpcb = inp;      /* XXX */
 804         /*
 805          * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 806          * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 807          * reasonable initial retransmit time.
 808          */
 809         tp->t_srtt = TCPTV_SRTTBASE;
 810         tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 811         tp->t_rttmin = tcp_TCPTV_MIN;
 812         tp->t_rxtcur = TCPTV_RTOBASE;
 813
 814         /* Initialize congestion control algorithm for this connection
 815          * to newreno by default
 816          */
 817         tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
 818         if (CC_ALGO(tp)->init != NULL) {
 819                 CC_ALGO(tp)->init(tp);
 820         }
 821
 822         tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 823         tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 824         tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 825         tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 826         tp->t_rcvtime = tcp_now;
 827         tp->t_bw_rtttime = 0;
 828         tp->tentry.timer_start = tcp_now;
 829         tp->t_persist_timeout = tcp_max_persist_timeout;
 830         tp->t_persist_stop = 0;
 831         tp->t_flagsext |= TF_RCVUNACK_WAITSS;
 832         /*
 833          * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 834          * because the socket may be bound to an IPv6 wildcard address,
 835          * which may match an IPv4-mapped IPv6 address.
 836          */
 837         inp->inp_ip_ttl = ip_defttl;
 838         inp->inp_ppcb = (caddr_t)tp;
 839         return (tp);            /* XXX */
 840 }
 841
 842 /*
 843  * Drop a TCP connection, reporting
 844  * the specified error.  If connection is synchronized,
 845  * then send a RST to peer.
 846  */
 847 struct tcpcb *
 848 tcp_drop(tp, errno)
 849         register struct tcpcb *tp;
 850         int errno;
 851 {
 852         struct socket *so = tp->t_inpcb->inp_socket;
 853 #if CONFIG_DTRACE
 854         struct inpcb *inp = tp->t_inpcb;
 855 #endif /* CONFIG_DTRACE */
 856
 857         if (TCPS_HAVERCVDSYN(tp->t_state)) {
 858                 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
 859                         struct tcpcb *, tp, int32_t, TCPS_CLOSED);
 860                 tp->t_state = TCPS_CLOSED;
 861                 (void) tcp_output(tp);
 862                 tcpstat.tcps_drops++;
 863         } else
 864                 tcpstat.tcps_conndrops++;
 865         if (errno == ETIMEDOUT && tp->t_softerror)
 866                 errno = tp->t_softerror;
 867         so->so_error = errno;
 868         return (tcp_close(tp));
 869 }
 870
/*
 * Close a TCP control block:
 *      discard all space held by the tcp
 *      discard internet protocol block
 *      wake up any sleepers
 *
 * Before tearing the connection down, the smoothed RTT, RTT variance and
 * slow-start threshold are folded into the metrics of the cached route
 * (when enough RTT samples were collected and the route is usable).
 *
 * Every return path of this function returns NULL.  The function returns
 * early, without detaching anything, when inp_ppcb is already NULL or
 * when TF_CLOSING/TF_SENDINPROG is set on the tcpcb (in which case
 * TF_CLOSING is set to request a later retry).
 */
struct tcpcb *
tcp_close(tp)
        register struct tcpcb *tp;
{
        struct inpcb *inp = tp->t_inpcb;
        struct socket *so = inp->inp_socket;
#if INET6
        int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
        struct rtentry *rt;
        int dosavessthresh;

        if ( inp->inp_ppcb == NULL) /* tcp_close was called previously, bail */
                return(NULL);

        /* Timers are cancelled even if the close ends up being deferred below. */
        tcp_canceltimers(tp);
        KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0);

        /*
         * If another thread for this tcp is currently in ip (indicated by
         * the TF_SENDINPROG flag), defer the cleanup until after it returns
         * back to tcp.  This is done to serialize the close until after all
         * pending output is finished, in order to avoid having the PCB be
         * detached and the cached route cleaned, only for ip to cache the
         * route back into the PCB again.  Note that we've cleared all the
         * timers at this point.  Set TF_CLOSING to indicate to tcp_output()
         * that it should call us again once it returns from ip; at that
         * point both flags should be cleared and we can proceed further
         * with the cleanup.
         */
        if (tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) {
                tp->t_flags |= TF_CLOSING;
                return (NULL);
        }

        /* Let the congestion control algorithm release its per-connection state. */
        if (CC_ALGO(tp)->cleanup != NULL) {
                CC_ALGO(tp)->cleanup(tp);
        }

        /* Pick the cached route matching the address family of the PCB. */
#if INET6
        rt = isipv6 ? inp->in6p_route.ro_rt : inp->inp_route.ro_rt;
#else
        rt = inp->inp_route.ro_rt;
#endif
        /* The route lock, if taken here, is dropped at no_valid_rt below. */
        if (rt != NULL)
                RT_LOCK_SPIN(rt);

        /*
         * If we got enough samples through the srtt filter,
         * save the rtt and rttvar in the routing entry.
         * 'Enough' is arbitrarily defined as the 16 samples.
         * 16 samples is enough for the srtt filter to converge
         * to within 5% of the correct value; fewer samples and
         * we could save a very bogus rtt.
         *
         * Don't update the default route's characteristics and don't
         * update anything that the user "locked".
         */
        if (tp->t_rttupdated >= 16) {
                register u_int32_t i = 0;

#if INET6
                if (isipv6) {
                        struct sockaddr_in6 *sin6;

                        /*
                         * IPv6: only a missing route or an unspecified
                         * destination key is rejected here; unlike the IPv4
                         * branch, RTF_UP and the route generation are not
                         * examined.
                         */
                        if (rt == NULL)
                                goto no_valid_rt;
                        sin6 = (struct sockaddr_in6 *)rt_key(rt);
                        if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
                                goto no_valid_rt;
                }
                else
#endif /* INET6 */
                /*
                 * IPv4: skip the update when there is no route, the route
                 * is down, its key is INADDR_ANY, or its generation does
                 * not match the current route_generation.
                 */
                if (rt == NULL || !(rt->rt_flags & RTF_UP) ||
                    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr ==
                    INADDR_ANY || rt->generation_id != route_generation) {
                        /*
                         * NOTE(review): states at or beyond CLOSE_WAIT are
                         * rewritten to TCPS_CLOSING on this path only; the
                         * reason is not evident from this function.
                         */
                        if (tp->t_state >= TCPS_CLOSE_WAIT) {
                                DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
                                        struct tcpcb *, tp, int32_t, TCPS_CLOSING);
                                tp->t_state = TCPS_CLOSING;
                        }
                        goto no_valid_rt;
                }

                /* From here on rt is non-NULL and locked. */
                RT_LOCK_ASSERT_HELD(rt);
                if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
                        /* Convert srtt from TCP units to route metric units. */
                        i = tp->t_srtt *
                            (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
                        if (rt->rt_rmx.rmx_rtt && i)
                                /*
                                 * filter this update to half the old & half
                                 * the new values, converting scale.
                                 * See route.h and tcp_var.h for a
                                 * description of the scaling constants.
                                 */
                                rt->rt_rmx.rmx_rtt =
                                    (rt->rt_rmx.rmx_rtt + i) / 2;
                        else
                                rt->rt_rmx.rmx_rtt = i;
                        tcpstat.tcps_cachedrtt++;
                }
                if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
                        /* Same averaging as above, applied to the variance. */
                        i = tp->t_rttvar *
                            (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
                        if (rt->rt_rmx.rmx_rttvar && i)
                                rt->rt_rmx.rmx_rttvar =
                                    (rt->rt_rmx.rmx_rttvar + i) / 2;
                        else
                                rt->rt_rmx.rmx_rttvar = i;
                        tcpstat.tcps_cachedrttvar++;
                }
                /*
                 * The old comment here said:
                 * update the pipelimit (ssthresh) if it has been updated
                 * already or if a pipesize was specified & the threshold
                 * got below half the pipesize.  I.e., wait for bad news
                 * before we start updating, then update on both good
                 * and bad news.
                 *
                 * But we want to save the ssthresh even if no pipesize is
                 * specified explicitly in the route, because such
                 * connections still have an implicit pipesize specified
                 * by the global tcp_sendspace.  In the absence of a reliable
                 * way to calculate the pipesize, it will have to do.
                 */
                i = tp->snd_ssthresh;
                if (rt->rt_rmx.rmx_sendpipe != 0)
                        dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
                else
                        dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
                /*
                 * NOTE(review): when dosavessthresh is true the update is
                 * performed without consulting the RTV_SSTHRESH lock bit.
                 */
                if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
                     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
                    || dosavessthresh) {
                        /*
                         * convert the limit from user data bytes to
                         * packets then to packet data bytes.
                         */
                        i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
                        if (i < 2)
                                i = 2;
                        i *= (u_int32_t)(tp->t_maxseg +
#if INET6
                                      (isipv6 ? sizeof (struct ip6_hdr) +
                                               sizeof (struct tcphdr) :
#endif
                                       sizeof (struct tcpiphdr)
#if INET6
                                       )
#endif
                                      );
                        if (rt->rt_rmx.rmx_ssthresh)
                                rt->rt_rmx.rmx_ssthresh =
                                    (rt->rt_rmx.rmx_ssthresh + i) / 2;
                        else
                                rt->rt_rmx.rmx_ssthresh = i;
                        tcpstat.tcps_cachedssthresh++;
                }
        }

        /*
         * Mark route for deletion if no information is cached.
         * This step is skipped on the paths that jump to no_valid_rt.
         */
        if (rt != NULL && (so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow) {
                if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
                    rt->rt_rmx.rmx_rtt == 0) {
                        rt->rt_flags |= RTF_DELCLONE;
                }
        }

no_valid_rt:
        /* Drop the route lock taken above, if any. */
        if (rt != NULL)
                RT_UNLOCK(rt);

        /* free the reassembly queue, if any */
        (void) tcp_freeq(tp);

        tcp_free_sackholes(tp);

        /* Free the packet list */
        if (tp->t_pktlist_head != NULL)
                m_freem_list(tp->t_pktlist_head);
        TCP_PKTLIST_CLEAR(tp);

#ifdef __APPLE__
        /* Remember the tcpcb on the PCB when the socket is cached in the socket layer. */
        if (so->cached_in_sock_layer)
            inp->inp_saved_ppcb = (caddr_t) tp;
#endif
        /* Issue a wakeup before detach so that we don't miss
         * a wakeup
         */
        sodisconnectwakeup(so);

        /* Detach through the routine matching the socket's address family. */
#if INET6
        if (INP_CHECK_SOCKAF(so, AF_INET6))
                in6_pcbdetach(inp);
        else
#endif /* INET6 */
        in_pcbdetach(inp);

        /* Call soisdisconnected after detach because it might unlock the socket */
        soisdisconnected(so);
        tcpstat.tcps_closed++;
        KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0);
        return(NULL);
}
1082
1083 int
1084 tcp_freeq(tp)
1085         struct tcpcb *tp;
1086 {
1087
1088         register struct tseg_qent *q;
1089         int rv = 0;
1090
1091         while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1092                 LIST_REMOVE(q, tqe_q);
1093                 m_freem(q->tqe_m);
1094                 zfree(tcp_reass_zone, q);
1095                 tcp_reass_qsize--;
1096                 rv = 1;
1097         }
1098         return (rv);
1099 }
1100
1101 void
1102 tcp_drain()
1103 {
1104         if (do_tcpdrain)
1105         {
1106                 struct inpcb *inpb;
1107                 struct tcpcb *tcpb;
1108                 struct tseg_qent *te;
1109
1110         /*
1111          * Walk the tcpbs, if existing, and flush the reassembly queue,
1112          * if there is one...
1113          * XXX: The "Net/3" implementation doesn't imply that the TCP
1114          *      reassembly queue should be flushed, but in a situation
1115          *      where we're really low on mbufs, this is potentially
1116          *      usefull.
1117          */
1118                 if (!lck_rw_try_lock_exclusive(tcbinfo.mtx)) /* do it next time if the lock is in use */
1119                         return;
1120
1121                 for (inpb = LIST_FIRST(tcbinfo.listhead); inpb;
1122                         inpb = LIST_NEXT(inpb, inp_list)) {
1123                                 if ((tcpb = intotcpcb(inpb))) {
1124                                         while ((te = LIST_FIRST(&tcpb->t_segq))
1125                                                != NULL) {
1126                                         LIST_REMOVE(te, tqe_q);
1127                                         m_freem(te->tqe_m);
1128                                         zfree(tcp_reass_zone, te);
1129                                         tcp_reass_qsize--;
1130                                 }
1131                         }
1132                 }
1133                 lck_rw_done(tcbinfo.mtx);
1134
1135         }
1136 }
1137
1138 /*
1139  * Notify a tcp user of an asynchronous error;
1140  * store error as soft error, but wake up user
1141  * (for now, won't do anything until can select for soft error).
1142  *
1143  * Do not wake up user since there currently is no mechanism for
1144  * reporting soft errors (yet - a kqueue filter may be added).
1145  */
1146 static void
1147 tcp_notify(inp, error)
1148         struct inpcb *inp;
1149         int error;
1150 {
1151         struct tcpcb *tp;
1152
1153         if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD))
1154                 return; /* pcb is gone already */
1155
1156         tp = (struct tcpcb *)inp->inp_ppcb;
1157
1158         /*
1159          * Ignore some errors if we are hooked up.
1160          * If connection hasn't completed, has retransmitted several times,
1161          * and receives a second error, give up now.  This is better
1162          * than waiting a long time to establish a connection that
1163          * can never complete.
1164          */
1165         if (tp->t_state == TCPS_ESTABLISHED &&
1166              (error == EHOSTUNREACH || error == ENETUNREACH ||
1167               error == EHOSTDOWN)) {
1168                 return;
1169         } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1170             tp->t_softerror)
1171                 tcp_drop(tp, error);
1172         else
1173                 tp->t_softerror = error;
1174 #if 0
1175         wakeup((caddr_t) &so->so_timeo);
1176         sorwakeup(so);
1177         sowwakeup(so);
1178 #endif
1179 }
1180
1181 /*
1182  * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
1183  * The otcpcb data structure is passed to user space and must not change.
1184  */
1185 static void
1186 tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
1187 {
1188         int i;
1189
1190         otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first;
1191         otp->t_dupacks = tp->t_dupacks;
1192         for (i = 0; i < TCPT_NTIMERS_EXT; i++)
1193                 otp->t_timer[i] = tp->t_timer[i];
1194         otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))(uintptr_t)tp->t_inpcb;
1195         otp->t_state = tp->t_state;
1196         otp->t_flags = tp->t_flags;
1197         otp->t_force = tp->t_force;
1198         otp->snd_una = tp->snd_una;
1199         otp->snd_max = tp->snd_max;
1200         otp->snd_nxt = tp->snd_nxt;
1201         otp->snd_up = tp->snd_up;
1202         otp->snd_wl1 = tp->snd_wl1;
1203         otp->snd_wl2 = tp->snd_wl2;
1204         otp->iss = tp->iss;
1205         otp->irs = tp->irs;
1206         otp->rcv_nxt = tp->rcv_nxt;
1207         otp->rcv_adv = tp->rcv_adv;
1208         otp->rcv_wnd = tp->rcv_wnd;
1209         otp->rcv_up = tp->rcv_up;
1210         otp->snd_wnd = tp->snd_wnd;
1211         otp->snd_cwnd = tp->snd_cwnd;
1212         otp->snd_ssthresh = tp->snd_ssthresh;
1213         otp->t_maxopd = tp->t_maxopd;
1214         otp->t_rcvtime = tp->t_rcvtime;
1215         otp->t_starttime = tp->t_starttime;
1216         otp->t_rtttime = tp->t_rtttime;
1217         otp->t_rtseq = tp->t_rtseq;
1218         otp->t_rxtcur = tp->t_rxtcur;
1219         otp->t_maxseg = tp->t_maxseg;
1220         otp->t_srtt = tp->t_srtt;
1221         otp->t_rttvar = tp->t_rttvar;
1222         otp->t_rxtshift = tp->t_rxtshift;
1223         otp->t_rttmin = tp->t_rttmin;
1224         otp->t_rttupdated = tp->t_rttupdated;
1225         otp->max_sndwnd = tp->max_sndwnd;
1226         otp->t_softerror = tp->t_softerror;
1227         otp->t_oobflags = tp->t_oobflags;
1228         otp->t_iobc = tp->t_iobc;
1229         otp->snd_scale = tp->snd_scale;
1230         otp->rcv_scale = tp->rcv_scale;
1231         otp->request_r_scale = tp->request_r_scale;
1232         otp->requested_s_scale = tp->requested_s_scale;
1233         otp->ts_recent = tp->ts_recent;
1234         otp->ts_recent_age = tp->ts_recent_age;
1235         otp->last_ack_sent = tp->last_ack_sent;
1236         otp->cc_send = tp->cc_send;
1237         otp->cc_recv = tp->cc_recv;
1238         otp->snd_recover = tp->snd_recover;
1239         otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1240         otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1241         otp->t_badrxtwin = tp->t_badrxtwin;
1242 }
1243
1244 static int
1245 tcp_pcblist SYSCTL_HANDLER_ARGS
1246 {
1247 #pragma unused(oidp, arg1, arg2)
1248         int error, i, n;
1249         struct inpcb *inp, **inp_list;
1250         inp_gen_t gencnt;
1251         struct xinpgen xig;
1252         int slot;
1253
1254         /*
1255          * The process of preparing the TCB list is too time-consuming and
1256          * resource-intensive to repeat twice on every request.
1257          */
1258         lck_rw_lock_shared(tcbinfo.mtx);
1259         if (req->oldptr == USER_ADDR_NULL) {
1260                 n = tcbinfo.ipi_count;
1261                 req->oldidx = 2 * (sizeof xig)
1262                         + (n + n/8) * sizeof(struct xtcpcb);
1263                 lck_rw_done(tcbinfo.mtx);
1264                 return 0;
1265         }
1266
1267         if (req->newptr != USER_ADDR_NULL) {
1268                 lck_rw_done(tcbinfo.mtx);
1269                 return EPERM;
1270         }
1271
1272         /*
1273          * OK, now we're committed to doing something.
1274          */
1275         gencnt = tcbinfo.ipi_gencnt;
1276         n = tcbinfo.ipi_count;
1277
1278         bzero(&xig, sizeof(xig));
1279         xig.xig_len = sizeof xig;
1280         xig.xig_count = n;
1281         xig.xig_gen = gencnt;
1282         xig.xig_sogen = so_gencnt;
1283         error = SYSCTL_OUT(req, &xig, sizeof xig);
1284         if (error) {
1285                 lck_rw_done(tcbinfo.mtx);
1286                 return error;
1287         }
1288         /*
1289          * We are done if there is no pcb
1290          */
1291         if (n == 0) {
1292                 lck_rw_done(tcbinfo.mtx);
1293                 return 0;
1294         }
1295
1296         inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1297         if (inp_list == 0) {
1298                 lck_rw_done(tcbinfo.mtx);
1299                 return ENOMEM;
1300         }
1301
1302         for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
1303              inp = LIST_NEXT(inp, inp_list)) {
1304 #ifdef __APPLE__
1305                 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1306 #else
1307                 if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
1308 #endif
1309                         inp_list[i++] = inp;
1310         }
1311
1312         for (slot = 0; slot < N_TIME_WAIT_SLOTS; slot++) {
1313                 struct inpcb *inpnxt;
1314
1315                 for (inp = time_wait_slots[slot].lh_first; inp && i < n; inp = inpnxt) {
1316                         inpnxt = inp->inp_list.le_next;
1317                         if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1318                                 inp_list[i++] = inp;
1319                 }
1320         }
1321
1322         n = i;
1323
1324         error = 0;
1325         for (i = 0; i < n; i++) {
1326                 inp = inp_list[i];
1327                 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1328                         struct xtcpcb xt;
1329                         caddr_t inp_ppcb;
1330
1331                         bzero(&xt, sizeof(xt));
1332                         xt.xt_len = sizeof xt;
1333                         /* XXX should avoid extra copy */
1334                         inpcb_to_compat(inp, &xt.xt_inp);
1335                         inp_ppcb = inp->inp_ppcb;
1336                         if (inp_ppcb != NULL) {
1337                                 tcpcb_to_otcpcb((struct tcpcb *)inp_ppcb,
1338                                     &xt.xt_tp);
1339                         } else {
1340                                 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
1341                         }
1342                         if (inp->inp_socket)
1343                                 sotoxsocket(inp->inp_socket, &xt.xt_socket);
1344                         error = SYSCTL_OUT(req, &xt, sizeof xt);
1345                 }
1346         }
1347         if (!error) {
1348                 /*
1349                  * Give the user an updated idea of our state.
1350                  * If the generation differs from what we told
1351                  * her before, she knows that something happened
1352                  * while we were processing this request, and it
1353                  * might be necessary to retry.
1354                  */
1355                 bzero(&xig, sizeof(xig));
1356                 xig.xig_len = sizeof xig;
1357                 xig.xig_gen = tcbinfo.ipi_gencnt;
1358                 xig.xig_sogen = so_gencnt;
1359                 xig.xig_count = tcbinfo.ipi_count;
1360                 error = SYSCTL_OUT(req, &xig, sizeof xig);
1361         }
1362         FREE(inp_list, M_TEMP);
1363         lck_rw_done(tcbinfo.mtx);
1364         return error;
1365 }
1366
1367 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1368             tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1369
1370 #if !CONFIG_EMBEDDED
1371
1372 static void
1373 tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
1374 {
1375         int i;
1376
1377         otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first;
1378         otp->t_dupacks = tp->t_dupacks;
1379         for (i = 0; i < TCPT_NTIMERS_EXT; i++)
1380                 otp->t_timer[i] = tp->t_timer[i];
1381         otp->t_state = tp->t_state;
1382         otp->t_flags = tp->t_flags;
1383         otp->t_force = tp->t_force;
1384         otp->snd_una = tp->snd_una;
1385         otp->snd_max = tp->snd_max;
1386         otp->snd_nxt = tp->snd_nxt;
1387         otp->snd_up = tp->snd_up;
1388         otp->snd_wl1 = tp->snd_wl1;
1389         otp->snd_wl2 = tp->snd_wl2;
1390         otp->iss = tp->iss;
1391         otp->irs = tp->irs;
1392         otp->rcv_nxt = tp->rcv_nxt;
1393         otp->rcv_adv = tp->rcv_adv;
1394         otp->rcv_wnd = tp->rcv_wnd;
1395         otp->rcv_up = tp->rcv_up;
1396         otp->snd_wnd = tp->snd_wnd;
1397         otp->snd_cwnd = tp->snd_cwnd;
1398         otp->snd_ssthresh = tp->snd_ssthresh;
1399         otp->t_maxopd = tp->t_maxopd;
1400         otp->t_rcvtime = tp->t_rcvtime;
1401         otp->t_starttime = tp->t_starttime;
1402         otp->t_rtttime = tp->t_rtttime;
1403         otp->t_rtseq = tp->t_rtseq;
1404         otp->t_rxtcur = tp->t_rxtcur;
1405         otp->t_maxseg = tp->t_maxseg;
1406         otp->t_srtt = tp->t_srtt;
1407         otp->t_rttvar = tp->t_rttvar;
1408         otp->t_rxtshift = tp->t_rxtshift;
1409         otp->t_rttmin = tp->t_rttmin;
1410         otp->t_rttupdated = tp->t_rttupdated;
1411         otp->max_sndwnd = tp->max_sndwnd;
1412         otp->t_softerror = tp->t_softerror;
1413         otp->t_oobflags = tp->t_oobflags;
1414         otp->t_iobc = tp->t_iobc;
1415         otp->snd_scale = tp->snd_scale;
1416         otp->rcv_scale = tp->rcv_scale;
1417         otp->request_r_scale = tp->request_r_scale;
1418         otp->requested_s_scale = tp->requested_s_scale;
1419         otp->ts_recent = tp->ts_recent;
1420         otp->ts_recent_age = tp->ts_recent_age;
1421         otp->last_ack_sent = tp->last_ack_sent;
1422         otp->cc_send = tp->cc_send;
1423         otp->cc_recv = tp->cc_recv;
1424         otp->snd_recover = tp->snd_recover;
1425         otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1426         otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1427         otp->t_badrxtwin = tp->t_badrxtwin;
1428 }
1429
1430
1431 static int
1432 tcp_pcblist64 SYSCTL_HANDLER_ARGS
1433 {
1434 #pragma unused(oidp, arg1, arg2)
1435         int error, i, n;
1436         struct inpcb *inp, **inp_list;
1437         inp_gen_t gencnt;
1438         struct xinpgen xig;
1439         int slot;
1440
1441         /*
1442          * The process of preparing the TCB list is too time-consuming and
1443          * resource-intensive to repeat twice on every request.
1444          */
1445         lck_rw_lock_shared(tcbinfo.mtx);
1446         if (req->oldptr == USER_ADDR_NULL) {
1447                 n = tcbinfo.ipi_count;
1448                 req->oldidx = 2 * (sizeof xig)
1449                         + (n + n/8) * sizeof(struct xtcpcb64);
1450                 lck_rw_done(tcbinfo.mtx);
1451                 return 0;
1452         }
1453
1454         if (req->newptr != USER_ADDR_NULL) {
1455                 lck_rw_done(tcbinfo.mtx);
1456                 return EPERM;
1457         }
1458
1459         /*
1460          * OK, now we're committed to doing something.
1461          */
1462         gencnt = tcbinfo.ipi_gencnt;
1463         n = tcbinfo.ipi_count;
1464
1465         bzero(&xig, sizeof(xig));
1466         xig.xig_len = sizeof xig;
1467         xig.xig_count = n;
1468         xig.xig_gen = gencnt;
1469         xig.xig_sogen = so_gencnt;
1470         error = SYSCTL_OUT(req, &xig, sizeof xig);
1471         if (error) {
1472                 lck_rw_done(tcbinfo.mtx);
1473                 return error;
1474         }
1475         /*
1476          * We are done if there is no pcb
1477          */
1478         if (n == 0) {
1479                 lck_rw_done(tcbinfo.mtx);
1480                 return 0;
1481         }
1482
1483         inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1484         if (inp_list == 0) {
1485                 lck_rw_done(tcbinfo.mtx);
1486                 return ENOMEM;
1487         }
1488
1489         for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
1490              inp = LIST_NEXT(inp, inp_list)) {
1491 #ifdef __APPLE__
1492                 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1493 #else
1494                 if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
1495 #endif
1496                         inp_list[i++] = inp;
1497         }
1498
1499         for (slot = 0; slot < N_TIME_WAIT_SLOTS; slot++) {
1500                 struct inpcb *inpnxt;
1501
1502                 for (inp = time_wait_slots[slot].lh_first; inp && i < n; inp = inpnxt) {
1503                         inpnxt = inp->inp_list.le_next;
1504                         if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1505                                 inp_list[i++] = inp;
1506                 }
1507         }
1508
1509         n = i;
1510
1511         error = 0;
1512         for (i = 0; i < n; i++) {
1513                 inp = inp_list[i];
1514                 if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1515                                         struct xtcpcb64 xt;
1516
1517                                         bzero(&xt, sizeof(xt));
1518                                         xt.xt_len = sizeof xt;
1519                                         inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
1520                                         xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb;
1521                                         if (inp->inp_ppcb != NULL)
1522                                                 tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt);
1523                                         if (inp->inp_socket)
1524                                                 sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket);
1525                                         error = SYSCTL_OUT(req, &xt, sizeof xt);
1526                 }
1527         }
1528         if (!error) {
1529                         /*
1530                          * Give the user an updated idea of our state.
1531                          * If the generation differs from what we told
1532                          * her before, she knows that something happened
1533                          * while we were processing this request, and it
1534                          * might be necessary to retry.
1535                          */
1536                         bzero(&xig, sizeof(xig));
1537                         xig.xig_len = sizeof xig;
1538                         xig.xig_gen = tcbinfo.ipi_gencnt;
1539                         xig.xig_sogen = so_gencnt;
1540                         xig.xig_count = tcbinfo.ipi_count;
1541                         error = SYSCTL_OUT(req, &xig, sizeof xig);
1542         }
1543         FREE(inp_list, M_TEMP);
1544         lck_rw_done(tcbinfo.mtx);
1545         return error;
1546 }
1547
1548 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1549             tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
1550
1551 #endif /* !CONFIG_EMBEDDED */
1552
1553 static int
1554 tcp_pcblist_n SYSCTL_HANDLER_ARGS
1555 {
1556 #pragma unused(oidp, arg1, arg2)
1557         int error = 0;
1558
1559         error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
1560
1561         return error;
1562 }
1563
1564
1565 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1566             tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
1567
1568
1569 void
1570 tcp_ctlinput(cmd, sa, vip)
1571         int cmd;
1572         struct sockaddr *sa;
1573         void *vip;
1574 {
1575         tcp_seq icmp_tcp_seq;
1576         struct ip *ip = vip;
1577         struct tcphdr *th;
1578         struct in_addr faddr;
1579         struct inpcb *inp;
1580         struct tcpcb *tp;
1581
1582         void (*notify)(struct inpcb *, int) = tcp_notify;
1583
1584         struct icmp *icp;
1585
1586         faddr = ((struct sockaddr_in *)sa)->sin_addr;
1587         if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1588                 return;
1589
1590         if (cmd == PRC_MSGSIZE)
1591                 notify = tcp_mtudisc;
1592         else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1593                 cmd == PRC_UNREACH_PORT) && ip)
1594                 notify = tcp_drop_syn_sent;
1595         else if (PRC_IS_REDIRECT(cmd)) {
1596                 ip = 0;
1597                 notify = in_rtchange;
1598         } else if (cmd == PRC_HOSTDEAD)
1599                 ip = 0;
1600         /* Source quench is deprecated */
1601         else if (cmd == PRC_QUENCH)
1602                 return;
1603         else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
1604                 return;
1605         if (ip) {
1606                 icp = (struct icmp *)((caddr_t)ip
1607                                 - offsetof(struct icmp, icmp_ip));
1608                 th = (struct tcphdr *)((caddr_t)ip
1609                                + (IP_VHL_HL(ip->ip_vhl) << 2));
1610                 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
1611                     ip->ip_src, th->th_sport, 0, NULL);
1612                 if (inp != NULL && inp->inp_socket != NULL) {
1613                         tcp_lock(inp->inp_socket, 1, 0);
1614                         if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1615                                 tcp_unlock(inp->inp_socket, 1, 0);
1616                                 return;
1617                         }
1618                         icmp_tcp_seq = htonl(th->th_seq);
1619                         tp = intotcpcb(inp);
1620                         if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
1621                             SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
1622                                 if (cmd == PRC_MSGSIZE) {
1623
1624                                         /*
1625                                          * MTU discovery:
1626                                          * If we got a needfrag and there is a host route to the
1627                                          * original destination, and the MTU is not locked, then
1628                                          * set the MTU in the route to the suggested new value
1629                                          * (if given) and then notify as usual.  The ULPs will
1630                                          * notice that the MTU has changed and adapt accordingly.
1631                                          * If no new MTU was suggested, then we guess a new one
1632                                          * less than the current value.  If the new MTU is
1633                                          * unreasonably small (defined by sysctl tcp_minmss), then
1634                                          * we reset the MTU to the interface value and enable the
1635                                          * lock bit, indicating that we are no longer doing MTU
1636                                          * discovery.
1637                                          */
1638                                         struct rtentry *rt;
1639                                         int mtu;
1640                                         struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET,
1641                                                                                 0 , { 0 }, { 0,0,0,0,0,0,0,0 } };
1642                                         icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
1643
1644                                         rt = rtalloc1((struct sockaddr *)&icmpsrc, 0,
1645                                             RTF_CLONING | RTF_PRCLONING);
1646                                         if (rt != NULL) {
1647                                                 RT_LOCK(rt);
1648                                                 if ((rt->rt_flags & RTF_HOST) &&
1649                                                     !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
1650                                                         mtu = ntohs(icp->icmp_nextmtu);
1651                                                         if (!mtu)
1652                                                                 mtu = ip_next_mtu(rt->rt_rmx.
1653                                                                     rmx_mtu, 1);
1654 #if DEBUG_MTUDISC
1655                                                         printf("MTU for %s reduced to %d\n",
1656                                                             inet_ntop(AF_INET,
1657                                                             &icmpsrc.sin_addr, ipv4str,
1658                                                             sizeof (ipv4str)), mtu);
1659 #endif
1660                                                         if (mtu < max(296, (tcp_minmss +
1661                                                             sizeof (struct tcpiphdr)))) {
1662                                                                 /* rt->rt_rmx.rmx_mtu =
1663                                                                         rt->rt_ifp->if_mtu; */
1664                                                                 rt->rt_rmx.rmx_locks |= RTV_MTU;
1665                                                         } else if (rt->rt_rmx.rmx_mtu > mtu) {
1666                                                                 rt->rt_rmx.rmx_mtu = mtu;
1667                                                         }
1668                                                 }
1669                                                 RT_UNLOCK(rt);
1670                                                 rtfree(rt);
1671                                         }
1672                                 }
1673
1674                                 (*notify)(inp, inetctlerrmap[cmd]);
1675                         }
1676                         tcp_unlock(inp->inp_socket, 1, 0);
1677                 }
1678         } else
1679                 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
1680 }
1681
1682 #if INET6
1683 void
1684 tcp6_ctlinput(cmd, sa, d)
1685         int cmd;
1686         struct sockaddr *sa;
1687         void *d;
1688 {
1689         struct tcphdr th;
1690         void (*notify)(struct inpcb *, int) = tcp_notify;
1691         struct ip6_hdr *ip6;
1692         struct mbuf *m;
1693         struct ip6ctlparam *ip6cp = NULL;
1694         const struct sockaddr_in6 *sa6_src = NULL;
1695         int off;
1696         struct tcp_portonly {
1697                 u_int16_t th_sport;
1698                 u_int16_t th_dport;
1699         } *thp;
1700
1701         if (sa->sa_family != AF_INET6 ||
1702             sa->sa_len != sizeof(struct sockaddr_in6))
1703                 return;
1704
1705         if (cmd == PRC_MSGSIZE)
1706                 notify = tcp_mtudisc;
1707         else if (!PRC_IS_REDIRECT(cmd) &&
1708                  ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1709                 return;
1710         /* Source quench is deprecated */
1711         else if (cmd == PRC_QUENCH)
1712                 return;
1713
1714         /* if the parameter is from icmp6, decode it. */
1715         if (d != NULL) {
1716                 ip6cp = (struct ip6ctlparam *)d;
1717                 m = ip6cp->ip6c_m;
1718                 ip6 = ip6cp->ip6c_ip6;
1719                 off = ip6cp->ip6c_off;
1720                 sa6_src = ip6cp->ip6c_src;
1721         } else {
1722                 m = NULL;
1723                 ip6 = NULL;
1724                 off = 0;        /* fool gcc */
1725                 sa6_src = &sa6_any;
1726         }
1727
1728         if (ip6) {
1729                 /*
1730                  * XXX: We assume that when IPV6 is non NULL,
1731                  * M and OFF are valid.
1732                  */
1733
1734                 /* check if we can safely examine src and dst ports */
1735                 if (m->m_pkthdr.len < off + sizeof(*thp))
1736                         return;
1737
1738                 bzero(&th, sizeof(th));
1739                 m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
1740
1741                 in6_pcbnotify(&tcbinfo, sa, th.th_dport,
1742                     (struct sockaddr *)ip6cp->ip6c_src,
1743                     th.th_sport, cmd, NULL, notify);
1744         } else {
1745                 in6_pcbnotify(&tcbinfo, sa, 0,
1746                     (struct sockaddr *)(size_t)sa6_src, 0, cmd, NULL, notify);
1747         }
1748 }
1749 #endif /* INET6 */
1750
1751
1752 /*
1753  * Following is where TCP initial sequence number generation occurs.
1754  *
1755  * There are two places where we must use initial sequence numbers:
1756  * 1.  In SYN-ACK packets.
1757  * 2.  In SYN packets.
1758  *
1759  * The ISNs in SYN-ACK packets have no monotonicity requirement,
1760  * and should be as unpredictable as possible to avoid the possibility
1761  * of spoofing and/or connection hijacking.  To satisfy this
1762  * requirement, SYN-ACK ISNs are generated via the arc4random()
1763  * function.  If exact RFC 1948 compliance is requested via sysctl,
1764  * these ISNs will be generated just like those in SYN packets.
1765  *
1766  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1767  * depends on this property.  In addition, these ISNs should be
1768  * unguessable so as to prevent connection hijacking.  To satisfy
1769  * the requirements of this situation, the algorithm outlined in
1770  * RFC 1948 is used to generate sequence numbers.
1771  *
1772  * For more information on the theory of operation, please see
1773  * RFC 1948.
1774  *
1775  * Implementation details:
1776  *
1777  * Time is based off the system timer, and is corrected so that it
1778  * increases by one megabyte per second.  This allows for proper
1779  * recycling on high speed LANs while still leaving over an hour
1780  * before rollover.
1781  *
1782  * Two sysctls control the generation of ISNs:
1783  *
1784  * net.inet.tcp.isn_reseed_interval controls the number of seconds
1785  * between seeding of isn_secret.  This is normally set to zero,
1786  * as reseeding should not be necessary.
1787  *
1788  * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
1789  * strictly.  When strict compliance is requested, reseeding is
1790  * disabled and SYN-ACKs will be generated in the same manner as
1791  * SYNs.  Strict mode is disabled by default.
1792  *
1793  */
1794
1795 #define ISN_BYTES_PER_SECOND 1048576
1796
1797 tcp_seq
1798 tcp_new_isn(tp)
1799         struct tcpcb *tp;
1800 {
1801         u_int32_t md5_buffer[4];
1802         tcp_seq new_isn;
1803         struct timeval timenow;
1804         u_char isn_secret[32];
1805         int isn_last_reseed = 0;
1806         MD5_CTX isn_ctx;
1807
1808         /* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
1809         if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT))
1810            && tcp_strict_rfc1948 == 0)
1811 #ifdef __APPLE__
1812                 return random();
1813 #else
1814                 return arc4random();
1815 #endif
1816         getmicrotime(&timenow);
1817
1818         /* Seed if this is the first use, reseed if requested. */
1819         if ((isn_last_reseed == 0) ||
1820             ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
1821              (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
1822                 < (u_int)timenow.tv_sec))) {
1823 #ifdef __APPLE__
1824                 read_random(&isn_secret, sizeof(isn_secret));
1825 #else
1826                 read_random_unlimited(&isn_secret, sizeof(isn_secret));
1827 #endif
1828                 isn_last_reseed = timenow.tv_sec;
1829         }
1830
1831         /* Compute the md5 hash and return the ISN. */
1832         MD5Init(&isn_ctx);
1833         MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
1834         MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
1835 #if INET6
1836         if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
1837                 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
1838                           sizeof(struct in6_addr));
1839                 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
1840                           sizeof(struct in6_addr));
1841         } else
1842 #endif
1843         {
1844                 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
1845                           sizeof(struct in_addr));
1846                 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
1847                           sizeof(struct in_addr));
1848         }
1849         MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
1850         MD5Final((u_char *) &md5_buffer, &isn_ctx);
1851         new_isn = (tcp_seq) md5_buffer[0];
1852         new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
1853         return new_isn;
1854 }
1855
1856
1857 /*
1858  * When a specific ICMP unreachable message is received and the
1859  * connection state is SYN-SENT, drop the connection.  This behavior
1860  * is controlled by the icmp_may_rst sysctl.
1861  */
1862 void
1863 tcp_drop_syn_sent(inp, errno)
1864         struct inpcb *inp;
1865         int errno;
1866 {
1867         struct tcpcb *tp = intotcpcb(inp);
1868
1869         if (tp && tp->t_state == TCPS_SYN_SENT)
1870                 tcp_drop(tp, errno);
1871 }
1872
1873 /*
1874  * When `need fragmentation' ICMP is received, update our idea of the MSS
1875  * based on the new value in the route.  Also nudge TCP to send something,
1876  * since we know the packet we just sent was dropped.
1877  * This duplicates some code in the tcp_mss() function in tcp_input.c.
1878  */
1879 void
1880 tcp_mtudisc(
1881         struct inpcb *inp,
1882         __unused int errno
1883 )
1884 {
1885         struct tcpcb *tp = intotcpcb(inp);
1886         struct rtentry *rt;
1887         struct rmxp_tao *taop;
1888         struct socket *so = inp->inp_socket;
1889         int offered;
1890         int mss;
1891 #if INET6
1892         int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1893 #endif /* INET6 */
1894
1895         if (tp) {
1896 #if INET6
1897                 if (isipv6)
1898                         rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
1899                 else
1900 #endif /* INET6 */
1901                 rt = tcp_rtlookup(inp, IFSCOPE_NONE);
1902                 if (!rt || !rt->rt_rmx.rmx_mtu) {
1903                         tp->t_maxopd = tp->t_maxseg =
1904 #if INET6
1905                                 isipv6 ? tcp_v6mssdflt :
1906 #endif /* INET6 */
1907                                 tcp_mssdflt;
1908
1909                         /* Route locked during lookup above */
1910                         if (rt != NULL)
1911                                 RT_UNLOCK(rt);
1912                         return;
1913                 }
1914                 taop = rmx_taop(rt->rt_rmx);
1915                 offered = taop->tao_mssopt;
1916                 mss = rt->rt_rmx.rmx_mtu -
1917 #if INET6
1918                         (isipv6 ?
1919                          sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1920 #endif /* INET6 */
1921                          sizeof(struct tcpiphdr)
1922 #if INET6
1923                          )
1924 #endif /* INET6 */
1925                         ;
1926
1927                 /* Route locked during lookup above */
1928                 RT_UNLOCK(rt);
1929
1930                 if (offered)
1931                         mss = min(mss, offered);
1932                 /*
1933                  * XXX - The above conditional probably violates the TCP
1934                  * spec.  The problem is that, since we don't know the
1935                  * other end's MSS, we are supposed to use a conservative
1936                  * default.  But, if we do that, then MTU discovery will
1937                  * never actually take place, because the conservative
1938                  * default is much less than the MTUs typically seen
1939                  * on the Internet today.  For the moment, we'll sweep
1940                  * this under the carpet.
1941                  *
1942                  * The conservative default might not actually be a problem
1943                  * if the only case this occurs is when sending an initial
1944                  * SYN with options and data to a host we've never talked
1945                  * to before.  Then, they will reply with an MSS value which
1946                  * will get recorded and the new parameters should get
1947                  * recomputed.  For Further Study.
1948                  */
1949                 if (tp->t_maxopd <= mss)
1950                         return;
1951                 tp->t_maxopd = mss;
1952
1953                 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1954                     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1955                         mss -= TCPOLEN_TSTAMP_APPA;
1956
1957                 if (so->so_snd.sb_hiwat < mss)
1958                         mss = so->so_snd.sb_hiwat;
1959
1960                 tp->t_maxseg = mss;
1961
1962                 /*
1963                  * Reset the slow-start flight size as it may depends on the new MSS
1964                  */
1965                 if (CC_ALGO(tp)->cwnd_init != NULL)
1966                         CC_ALGO(tp)->cwnd_init(tp);
1967                 tcpstat.tcps_mturesent++;
1968                 tp->t_rtttime = 0;
1969                 tp->snd_nxt = tp->snd_una;
1970                 tcp_output(tp);
1971         }
1972 }
1973
1974 /*
1975  * Look-up the routing entry to the peer of this inpcb.  If no route
1976  * is found and it cannot be allocated the return NULL.  This routine
1977  * is called by TCP routines that access the rmx structure and by tcp_mss
1978  * to get the interface MTU.  If a route is found, this routine will
1979  * hold the rtentry lock; the caller is responsible for unlocking.
1980  */
1981 struct rtentry *
1982 tcp_rtlookup(inp, input_ifscope)
1983         struct inpcb *inp;
1984         unsigned int input_ifscope;
1985 {
1986         struct route *ro;
1987         struct rtentry *rt;
1988         struct tcpcb *tp;
1989
1990         lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
1991
1992         ro = &inp->inp_route;
1993         if ((rt = ro->ro_rt) != NULL)
1994                 RT_LOCK(rt);
1995
1996         if (rt == NULL || !(rt->rt_flags & RTF_UP) ||
1997             rt->generation_id != route_generation) {
1998                 /* No route yet, so try to acquire one */
1999                 if (inp->inp_faddr.s_addr != INADDR_ANY) {
2000                         unsigned int ifscope;
2001
2002                         ro->ro_dst.sa_family = AF_INET;
2003                         ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
2004                         ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
2005                                 inp->inp_faddr;
2006
2007                         /*
2008                          * If the socket was bound to an interface, then
2009                          * the bound-to-interface takes precedence over
2010                          * the inbound interface passed in by the caller
2011                          * (if we get here as part of the output path then
2012                          * input_ifscope is IFSCOPE_NONE).
2013                          */
2014                         ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2015                             inp->inp_boundif : input_ifscope;
2016
2017                         if (rt != NULL)
2018                                 RT_UNLOCK(rt);
2019                         rtalloc_scoped(ro, ifscope);
2020                         if ((rt = ro->ro_rt) != NULL)
2021                                 RT_LOCK(rt);
2022                 }
2023         }
2024
2025         /*
2026          * Update MTU discovery determination. Don't do it if:
2027          *      1) it is disabled via the sysctl
2028          *      2) the route isn't up
2029          *      3) the MTU is locked (if it is, then discovery has been
2030          *         disabled)
2031          */
2032
2033          tp = intotcpcb(inp);
2034
2035         if (!path_mtu_discovery || ((rt != NULL) &&
2036             (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2037                 tp->t_flags &= ~TF_PMTUD;
2038         else
2039                 tp->t_flags |= TF_PMTUD;
2040
2041 #if CONFIG_IFEF_NOWINDOWSCALE
2042         if (tcp_obey_ifef_nowindowscale &&
2043             tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL &&
2044             (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) {
2045                 /* Window scaling is enabled on this interface */
2046                 tp->t_flags &= ~TF_REQ_SCALE;
2047         }
2048 #endif
2049
2050         if (rt != NULL && rt->rt_ifp != NULL) {
2051                 somultipages(inp->inp_socket,
2052                     (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2053                 tcp_set_tso(tp, rt->rt_ifp);
2054         }
2055
2056         /*
2057          * Caller needs to call RT_UNLOCK(rt).
2058          */
2059         return rt;
2060 }
2061
2062 #if INET6
2063 struct rtentry *
2064 tcp_rtlookup6(inp, input_ifscope)
2065         struct inpcb *inp;
2066         unsigned int input_ifscope;
2067 {
2068         struct route_in6 *ro6;
2069         struct rtentry *rt;
2070         struct tcpcb *tp;
2071
2072         lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2073
2074         ro6 = &inp->in6p_route;
2075         if ((rt = ro6->ro_rt) != NULL)
2076                 RT_LOCK(rt);
2077
2078         if (rt == NULL || !(rt->rt_flags & RTF_UP) ||
2079             rt->generation_id != route_generation) {
2080                 /* No route yet, so try to acquire one */
2081                 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
2082                         struct sockaddr_in6 *dst6;
2083                         unsigned int ifscope;
2084
2085                         dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
2086                         dst6->sin6_family = AF_INET6;
2087                         dst6->sin6_len = sizeof(*dst6);
2088                         dst6->sin6_addr = inp->in6p_faddr;
2089
2090                         /*
2091                          * If the socket was bound to an interface, then
2092                          * the bound-to-interface takes precedence over
2093                          * the inbound interface passed in by the caller
2094                          * (if we get here as part of the output path then
2095                          * input_ifscope is IFSCOPE_NONE).
2096                          */
2097                         ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2098                             inp->inp_boundif : input_ifscope;
2099
2100                         if (rt != NULL)
2101                                 RT_UNLOCK(rt);
2102                         rtalloc_scoped((struct route *)ro6, ifscope);
2103                         if ((rt = ro6->ro_rt) != NULL)
2104                                 RT_LOCK(rt);
2105                 }
2106         }
2107         /*
2108          * Update path MTU Discovery determination
2109          * while looking up the route:
2110          *  1) we have a valid route to the destination
2111          *  2) the MTU is not locked (if it is, then discovery has been
2112          *    disabled)
2113          */
2114
2115
2116          tp = intotcpcb(inp);
2117
2118         /*
2119          * Update MTU discovery determination. Don't do it if:
2120          *      1) it is disabled via the sysctl
2121          *      2) the route isn't up
2122          *      3) the MTU is locked (if it is, then discovery has been
2123          *         disabled)
2124          */
2125
2126         if (!path_mtu_discovery || ((rt != NULL) &&
2127             (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2128                 tp->t_flags &= ~TF_PMTUD;
2129         else
2130                 tp->t_flags |= TF_PMTUD;
2131
2132 #if CONFIG_IFEF_NOWINDOWSCALE
2133         if (tcp_obey_ifef_nowindowscale &&
2134             tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL &&
2135             (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) {
2136                 /* Window scaling is not enabled on this interface */
2137                 tp->t_flags &= ~TF_REQ_SCALE;
2138         }
2139 #endif
2140
2141         if (rt != NULL && rt->rt_ifp != NULL) {
2142                 somultipages(inp->inp_socket,
2143                     (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2144                 tcp_set_tso(tp, rt->rt_ifp);
2145         }
2146
2147         /*
2148          * Caller needs to call RT_UNLOCK(rt).
2149          */
2150         return rt;
2151 }
2152 #endif /* INET6 */
2153
2154 #if IPSEC
2155 /* compute ESP/AH header size for TCP, including outer IP header. */
2156 size_t
2157 ipsec_hdrsiz_tcp(tp)
2158         struct tcpcb *tp;
2159 {
2160         struct inpcb *inp;
2161         struct mbuf *m;
2162         size_t hdrsiz;
2163         struct ip *ip;
2164 #if INET6
2165         struct ip6_hdr *ip6 = NULL;
2166 #endif /* INET6 */
2167         struct tcphdr *th;
2168
2169         if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
2170                 return 0;
2171         MGETHDR(m, M_DONTWAIT, MT_DATA);        /* MAC-OK */
2172         if (!m)
2173                 return 0;
2174
2175 #if INET6
2176         if ((inp->inp_vflag & INP_IPV6) != 0) {
2177                 ip6 = mtod(m, struct ip6_hdr *);
2178                 th = (struct tcphdr *)(ip6 + 1);
2179                 m->m_pkthdr.len = m->m_len =
2180                         sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
2181                 tcp_fillheaders(tp, ip6, th);
2182                 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2183         } else
2184 #endif /* INET6 */
2185       {
2186         ip = mtod(m, struct ip *);
2187         th = (struct tcphdr *)(ip + 1);
2188         m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
2189         tcp_fillheaders(tp, ip, th);
2190         hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2191       }
2192         m_free(m);
2193         return hdrsiz;
2194 }
2195 #endif /*IPSEC*/
2196
2197 /*
2198  * Return a pointer to the cached information about the remote host.
2199  * The cached information is stored in the protocol specific part of
2200  * the route metrics.
2201  */
2202 struct rmxp_tao *
2203 tcp_gettaocache(inp)
2204         struct inpcb *inp;
2205 {
2206         struct rtentry *rt;
2207         struct rmxp_tao *taop;
2208
2209 #if INET6
2210         if ((inp->inp_vflag & INP_IPV6) != 0)
2211                 rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2212         else
2213 #endif /* INET6 */
2214         rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2215
2216         /* Make sure this is a host route and is up. */
2217         if (rt == NULL ||
2218             (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) {
2219                 /* Route locked during lookup above */
2220                 if (rt != NULL)
2221                         RT_UNLOCK(rt);
2222                 return NULL;
2223         }
2224
2225         taop = rmx_taop(rt->rt_rmx);
2226         /* Route locked during lookup above */
2227         RT_UNLOCK(rt);
2228         return (taop);
2229 }
2230
/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is intentionally empty: we assume that the routing
 * tables are initialized at the same time as TCP, so there is
 * nothing left over in the cache.
 */
static void
tcp_cleartaocache(void)
{
	/* Nothing to do. */
}
2243
2244 int
2245 tcp_lock(struct socket *so, int refcount, void *lr)
2246 {
2247         void *lr_saved;
2248
2249         if (lr == NULL)
2250                 lr_saved = __builtin_return_address(0);
2251         else
2252                 lr_saved = lr;
2253
2254         if (so->so_pcb != NULL) {
2255                 lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
2256         } else  {
2257                 panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n",
2258                     so, lr_saved, solockhistory_nr(so));
2259                 /* NOTREACHED */
2260         }
2261
2262         if (so->so_usecount < 0) {
2263                 panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n",
2264                     so, so->so_pcb, lr_saved, so->so_usecount, solockhistory_nr(so));
2265                 /* NOTREACHED */
2266         }
2267         if (refcount)
2268                 so->so_usecount++;
2269         so->lock_lr[so->next_lock_lr] = lr_saved;
2270         so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
2271         return (0);
2272 }
2273
2274 int
2275 tcp_unlock(struct socket *so, int refcount, void *lr)
2276 {
2277         void *lr_saved;
2278
2279         if (lr == NULL)
2280                 lr_saved = __builtin_return_address(0);
2281         else
2282                 lr_saved = lr;
2283
2284 #ifdef MORE_TCPLOCK_DEBUG
2285         printf("tcp_unlock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n",
2286             so, so->so_pcb, &((struct inpcb *)so->so_pcb)->inpcb_mtx,
2287             so->so_usecount, lr_saved);
2288 #endif
2289         if (refcount)
2290                 so->so_usecount--;
2291
2292         if (so->so_usecount < 0) {
2293                 panic("tcp_unlock: so=%p usecount=%x lrh= %s\n",
2294                     so, so->so_usecount, solockhistory_nr(so));
2295                 /* NOTREACHED */
2296         }
2297         if (so->so_pcb == NULL) {
2298                 panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n",
2299                     so, so->so_usecount, lr_saved, solockhistory_nr(so));
2300                 /* NOTREACHED */
2301         } else {
2302                 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2303                     LCK_MTX_ASSERT_OWNED);
2304                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
2305                 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
2306                 lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
2307         }
2308         return (0);
2309 }
2310
2311 lck_mtx_t *
2312 tcp_getlock(
2313         struct socket *so,
2314         __unused int locktype)
2315 {
2316         struct inpcb *inp = sotoinpcb(so);
2317
2318         if (so->so_pcb)  {
2319                 if (so->so_usecount < 0)
2320                         panic("tcp_getlock: so=%p usecount=%x lrh= %s\n",
2321                             so, so->so_usecount, solockhistory_nr(so));
2322                 return(&inp->inpcb_mtx);
2323         }
2324         else {
2325                 panic("tcp_getlock: so=%p NULL so_pcb %s\n",
2326                     so, solockhistory_nr(so));
2327                 return (so->so_proto->pr_domain->dom_mtx);
2328         }
2329 }
2330
2331 int32_t
2332 tcp_sbspace(struct tcpcb *tp)
2333 {
2334         struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv;
2335         int32_t space, newspace;
2336
2337         space =  ((int32_t) imin((sb->sb_hiwat - sb->sb_cc),
2338                 (sb->sb_mbmax - sb->sb_mbcnt)));
2339         if (space < 0)
2340                 space = 0;
2341
2342         /* Avoid increasing window size if the current window
2343          * is already very low, we could be in "persist" mode and
2344          * we could break some apps (see rdar://5409343)
2345          */
2346
2347         if (space < tp->t_maxseg)
2348                 return space;
2349
2350         /* Clip window size for slower link */
2351
2352         if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 )
2353                 return imin(space, slowlink_wsize);
2354
2355         /*
2356          * Check for ressources constraints before over-ajusting the amount of space we can
2357          * advertise in the TCP window size updates.
2358          */
2359
2360         if (sbspace_factor && (tp->t_inpcb->inp_pcbinfo->ipi_count < tcp_sockthreshold) &&
2361                     (total_mb_cnt / 8) < (mbstat.m_clusters / sbspace_factor)) {
2362                 if (space < (int32_t)(sb->sb_maxused - sb->sb_cc)) {/* make sure we don't constrain the window if we have enough ressources */
2363                         space = (int32_t) imax((sb->sb_maxused - sb->sb_cc), tp->rcv_maxbyps);
2364                 }
2365                 newspace = (int32_t) imax(((int32_t)sb->sb_maxused - sb->sb_cc), (int32_t)tp->rcv_maxbyps);
2366
2367                 if (newspace > space)
2368                         space = newspace;
2369         }
2370         return space;
2371 }
2372 /*
2373  * Checks TCP Segment Offloading capability for a given connection and interface pair.
2374  */
2375 void
2376 tcp_set_tso(tp, ifp)
2377         struct tcpcb *tp;
2378         struct ifnet *ifp;
2379 {
2380 #if INET6
2381         struct inpcb *inp = tp->t_inpcb;
2382         int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
2383
2384         if (isipv6) {
2385                 if (ifp && ifp->if_hwassist & IFNET_TSO_IPV6) {
2386                         tp->t_flags |= TF_TSO;
2387                         if (ifp->if_tso_v6_mtu != 0)
2388                                 tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
2389                         else
2390                                 tp->tso_max_segment_size = TCP_MAXWIN;
2391                 } else
2392                                 tp->t_flags &= ~TF_TSO;
2393
2394         } else
2395 #endif /* INET6 */
2396
2397         {
2398                 if (ifp && ifp->if_hwassist & IFNET_TSO_IPV4) {
2399                         tp->t_flags |= TF_TSO;
2400                         if (ifp->if_tso_v4_mtu != 0)
2401                                 tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
2402                         else
2403                                 tp->tso_max_segment_size = TCP_MAXWIN;
2404                 } else
2405                                 tp->t_flags &= ~TF_TSO;
2406         }
2407 }
2408
/*
 * Convert a timeval-like value (anything with tv_sec and tv_usec members)
 * into a count: tv_sec times TCP_RETRANSHZ plus tv_usec divided by
 * TCP_RETRANSHZ_TO_USEC.
 *
 * The argument is expanded twice, so it must not have side effects.
 */
#define TIMEVAL_TO_TCPHZ(_tv_)                                          \
        (((_tv_).tv_sec * TCP_RETRANSHZ) +                              \
            ((_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC))
2410
2411 /* Function to calculate the tcp clock. The tcp clock will get updated
2412  * at the boundaries of the tcp layer. This is done at 3 places:
2413  * 1. Right before processing an input tcp packet
2414  * 2. Whenever a connection wants to access the network using tcp_usrreqs
2415  * 3. When a tcp timer fires or before tcp slow timeout
2416  *
2417  */
2418
2419 void
2420 calculate_tcp_clock()
2421 {
2422         struct timeval tv = tcp_uptime;
2423         struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC};
2424         struct timeval now, hold_now;
2425         uint32_t incr = 0;
2426
2427         timevaladd(&tv, &interval);
2428         microuptime(&now);
2429         if (timevalcmp(&now, &tv, >)) {
2430                 /* time to update the clock */
2431                 lck_spin_lock(tcp_uptime_lock);
2432                 if (timevalcmp(&tcp_uptime, &now, >=)) {
2433                         /* clock got updated while we were waiting for the lock */
2434                         lck_spin_unlock(tcp_uptime_lock);
2435                         return;
2436                         }
2437
2438                 microuptime(&now);
2439                 hold_now = now;
2440                 tv = tcp_uptime;
2441                 timevalsub(&now, &tv);
2442
2443                 incr = TIMEVAL_TO_TCPHZ(now);
2444                 if (incr > 0) {
2445                         tcp_uptime = hold_now;
2446                         tcp_now += incr;
2447                 }
2448
2449                 lck_spin_unlock(tcp_uptime_lock);
2450         }
2451         return;
2452 }
2453
2454 /* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */