/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <kern/locks.h>

#include <kern/cpu_number.h>	/* before tcp_seq.h, for tcp_random18() */

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#if INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#if INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>

extern void postevent(struct socket *, struct sockbuf *, int);

#define DBG_FNC_TCP_FAST	NETDBG_CODE(DBG_NETTCP, (5 << 8))
#define DBG_FNC_TCP_SLOW	NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1)

static int	background_io_trigger = 5;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW,
    &background_io_trigger, 0, "Background IO Trigger Setting");
/*
 * NOTE - WARNING
 *
 * These sysctls expose timer values to userland in milliseconds, while the
 * kernel keeps them internally in TCP_RETRANSHZ ticks; the conversions below
 * must stay in sync with that clock rate.
 */
static int
sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, s, tt;

	tt = *(int *)oidp->oid_arg1;
	s = tt * 1000 / TCP_RETRANSHZ;

	error = sysctl_handle_int(oidp, &s, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = s * TCP_RETRANSHZ / 1000;
	if (tt < 1)
		return (EINVAL);

	*(int *)oidp->oid_arg1 = tt;
	return (0);
}

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

static int	always_keepalive = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

/*
 * See tcp_syn_backoff[] for interval values between SYN retransmits;
 * the value set below defines the number of retransmits before we
 * disable the timestamp and window scaling options during subsequent
 * SYN retransmits.  Setting it to 0 disables dropping those two
 * options.
 */
static int tcp_broken_peer_syn_rxmit_thres = 7;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW,
    &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before "
    "TCP disables rfc1323 and rfc1644 during the rest of attempts");

int	tcp_pmtud_black_hole_detect = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW,
    &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection");

int	tcp_pmtud_black_hole_mss = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW,
    &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS");

static int	tcp_keepcnt = TCPTV_KEEPCNT;		/* max idle probes */
static int	tcp_gc_done = FALSE;	/* performed garbage collection of "used" sockets */
int	tcp_maxpersistidle;			/* max idle time in persist */
int	tcp_maxidle;

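/*
 * TIME_WAIT pcbs are parked on a wheel of N_TIME_WAIT_SLOTS lists;
 * cur_tw_slot is the slot that tcp_slowtimo() will service on its next pass.
 */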
struct inpcbhead	time_wait_slots[N_TIME_WAIT_SLOTS];
int		cur_tw_slot = 0;

u_int32_t	*delack_bitmask;

void	add_to_time_wait_locked(struct tcpcb *tp);
void	add_to_time_wait(struct tcpcb *tp);

static void tcp_garbage_collect(struct inpcb *, int);

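/*
 * Move a closing pcb from the main TCP pcb list onto the time-wait wheel.
 * The caller must already hold the pcbinfo list lock exclusively.
 */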
void add_to_time_wait_locked(struct tcpcb *tp)
{
	int tw_slot;
	struct inpcbinfo *pcbinfo = &tcbinfo;

	/* pcb list should be locked when we get here */
	lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE);

	LIST_REMOVE(tp->t_inpcb, inp_list);

	if (tp->t_timer[TCPT_2MSL] <= 0)
		tp->t_timer[TCPT_2MSL] = 1;

	/*
	 * Because we're pulling this pcb out of the main TCP pcb list,
	 * we need to rescale the TCPT_2MSL timer value from tcp_now's
	 * TCP_RETRANSHZ granularity down to tcp_slowtimo's coarser
	 * PR_SLOWHZ granularity.
	 */
	tp->t_timer[TCPT_2MSL] = (tp->t_timer[TCPT_2MSL] / TCP_RETRANSHZ) * PR_SLOWHZ;
	tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ;

	tp->t_rcvtime += tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1);

	tw_slot = (tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot;
	if (tw_slot >= N_TIME_WAIT_SLOTS)
		tw_slot -= N_TIME_WAIT_SLOTS;

	LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
}

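/*
 * Unlocked variant: take the pcbinfo list lock exclusively first, dropping
 * and re-acquiring the socket lock if necessary to avoid a lock-ordering
 * deadlock, then move the pcb onto the time-wait wheel.
 */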
void add_to_time_wait(struct tcpcb *tp)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;

	if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
		tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
		lck_rw_lock_exclusive(pcbinfo->mtx);
		tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
	}
	add_to_time_wait_locked(tp);
	lck_rw_done(pcbinfo->mtx);
}

/*
 * Fast timeout routine for processing delayed acks
 */
void
tcp_fasttimo(void *arg)
{
#pragma unused(arg)
	struct inpcb *inp;
	register struct tcpcb *tp;
	struct socket *so;
#if TCPDEBUG
	int ostate;
#endif

	struct inpcbinfo *pcbinfo = &tcbinfo;

	int delack_done = 0;

	KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_START, 0,0,0,0,0);

	lck_rw_lock_shared(pcbinfo->mtx);

	/* Walk the list of valid tcpcbs and send ACKs on the ones with the DELACK bit set */

	LIST_FOREACH(inp, &tcb, inp_list) {

		so = inp->inp_socket;

		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;

		tcp_lock(so, 1, 0);

		if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) {
			tcp_unlock(so, 1, 0);
			continue;
		}

		tp = intotcpcb(inp);

		if (tp == 0 || tp->t_state == TCPS_LISTEN) {
			tcp_unlock(so, 1, 0);
			continue;
		}

		/* Only run the retransmit timer in that case */
		if (tp->t_timer[0] && --tp->t_timer[0] == 0) {
			tp = tcp_timers(tp, 0);
			if (tp == NULL)
				goto tpgone;
		}

		/* TCP pcb timers following the tcp_now clock rate */

		tp->t_rcvtime++;
		tp->t_starttime++;
		if (tp->t_rtttime)
			tp->t_rtttime++;

		/*
		 * Process delayed acks (if enabled) according to PR_FASTHZ, not the retrans timer
		 */

		if (tcp_delack_enabled && (tcp_now % (TCP_RETRANSHZ/PR_FASTHZ)) && tp->t_flags & TF_DELACK) {
			delack_done++;
			tp->t_flags &= ~TF_DELACK;
			tp->t_flags |= TF_ACKNOW;
			tcpstat.tcps_delack++;
			tp->t_unacksegs = 0;
			(void) tcp_output(tp);
		}
tpgone:
		tcp_unlock(so, 1, 0);
	}
	KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_done, 0, tcpstat.tcps_delack,0,0);
	lck_rw_done(pcbinfo->mtx);

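	/* Advance the TCP clock and re-arm so we run TCP_RETRANSHZ times per second. */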
	tcp_now++;
	timeout(tcp_fasttimo, 0, hz/TCP_RETRANSHZ);
}

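/*
 * Reclaim a socket/pcb that is no longer referenced: detach it from its
 * protocol state if needed and dispose of it once its use count drops to
 * zero.  Called from tcp_slowtimo() for both the main pcb list and the
 * time-wait slots; istimewait tells us which list is being swept.
 */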
static void
tcp_garbage_collect(struct inpcb *inp, int istimewait)
{
	struct socket *so;
	struct tcpcb *tp;

	so = inp->inp_socket;
	tp = intotcpcb(inp);

	/*
	 * Skip if still in use or busy; it would have been more efficient
	 * if we were to test so_usecount against 0, but this isn't possible
	 * due to the current implementation of tcp_dropdropablreq() where
	 * overflow sockets that are eligible for garbage collection have
	 * their usecounts set to 1.
	 */
	if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(inp->inpcb_mtx))
		return;

	/* Check again under the lock */
	if (so->so_usecount > 1) {
		lck_mtx_unlock(inp->inpcb_mtx);
		return;
	}

	/*
	 * Overflowed socket dropped from the listening queue?  Do this
	 * only if we are called to clean up the time wait slots, since
	 * tcp_dropdropablreq() considers a socket to have been fully
	 * dropped after add_to_time_wait() is finished.
	 * Also handle the case of connections getting closed by the peer
	 * while still in the queue, as seen with rdar://6422317.
	 */
	if (so->so_usecount == 1 &&
	    ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
	    ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && (so->so_head != NULL)
	    && ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
	    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {

		if (inp->inp_state != INPCB_STATE_DEAD) {
			/* Become a regular mutex */
			lck_mtx_convert_spin(inp->inpcb_mtx);
#if INET6
			if (INP_CHECK_SOCKAF(so, AF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		so->so_usecount--;
		lck_mtx_unlock(inp->inpcb_mtx);
		return;
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		lck_mtx_unlock(inp->inpcb_mtx);
		return;
	}

	/*
	 * We get here because the PCB is no longer searchable (WNT_STOPUSING);
	 * detach (if needed) and dispose if it is dead (usecount is 0).  This
	 * covers all cases, including overflow sockets and those that are
	 * considered as "embryonic", i.e. created by sonewconn() in the TCP
	 * input path and not yet committed.  For the former, we reduce the
	 * usecount to 0 as done by the code above.  For the latter, the
	 * usecount would have been reduced to 0 as part of calling soabort()
	 * when the socket is dropped at the end of tcp_input().
	 */
	if (so->so_usecount == 0) {
		/* Become a regular mutex */
		lck_mtx_convert_spin(inp->inpcb_mtx);
		if (inp->inp_state != INPCB_STATE_DEAD) {
#if INET6
			if (INP_CHECK_SOCKAF(so, AF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		in_pcbdispose(inp);
	} else {
		lck_mtx_unlock(inp->inpcb_mtx);
	}
}

static int bg_cnt = 0;
#define BG_COUNTER_MAX 3

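/*
 * Slow timeout routine, invoked via the protocol slow timer (PR_SLOWHZ ticks
 * per second): walks every tcpcb running the protocol timers other than the
 * retransmit timer (which tcp_fasttimo handles), services the current
 * time-wait slot, and garbage-collects dead pcbs.
 */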
void
tcp_slowtimo(void)
{
	struct inpcb *inp, *nxt;
	struct tcpcb *tp;
	struct socket *so;
	int i;
#if TCPDEBUG
	int ostate;
#endif

#if KDEBUG
	static int tws_checked = 0;
#endif

	struct inpcbinfo *pcbinfo = &tcbinfo;

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);

	tcp_maxidle = tcp_keepcnt * tcp_keepintvl;

	lck_rw_lock_shared(pcbinfo->mtx);

	bg_cnt++;

	LIST_FOREACH(inp, &tcb, inp_list) {

		so = inp->inp_socket;

		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;

		tcp_lock(so, 1, 0);

		if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) {
			tcp_unlock(so, 1, 0);
			continue;
		}

		tp = intotcpcb(inp);
		if (tp == 0 || tp->t_state == TCPS_LISTEN) {
			tcp_unlock(so, 1, 0);
			continue;
		}

#if TRAFFIC_MGT
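		/*
		 * Every few passes (bg_cnt vs. BG_COUNTER_MAX), look at how the
		 * bytes received since the last snapshot split between foreground
		 * and background sockets, and use the background_io_trigger
		 * percentage to decide whether background I/O on this socket
		 * should be suppressed or allowed again.
		 */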
		if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND &&
		    bg_cnt > BG_COUNTER_MAX) {
			u_int32_t	curr_recvtotal = tcpstat.tcps_rcvtotal;
			u_int32_t	curr_bg_recvtotal = tcpstat.tcps_bg_rcvtotal;
			u_int32_t	bg_recvdiff = curr_bg_recvtotal - tp->bg_recv_snapshot;
			u_int32_t	tot_recvdiff = curr_recvtotal - tp->tot_recv_snapshot;
			u_int32_t	fg_recv_change = tot_recvdiff - bg_recvdiff;
			u_int32_t	recv_change;

			if (!(so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_SUPPRESSED)) {
				if (tot_recvdiff)
					recv_change = (fg_recv_change * 100) / tot_recvdiff;
				else
					recv_change = 0;

				if (recv_change > background_io_trigger) {
					so->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BG_SUPPRESSED;
				}

				tp->tot_recv_snapshot = curr_recvtotal;
				tp->bg_recv_snapshot = curr_bg_recvtotal;
			}
			else {	// SUPPRESSED
				// this allows for bg traffic to subside before we start measuring total traffic change
				if (tot_recvdiff)
					recv_change = (bg_recvdiff * 100) / tot_recvdiff;
				else
					recv_change = 0;

				if (recv_change < background_io_trigger) {
					// Draconian for now: if there is any change at all, keep suppressed
					if (!tot_recvdiff) {
						so->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BG_SUPPRESSED;
						tp->t_unacksegs = 0;
						(void) tcp_output(tp);	// open window
					}
				}

				tp->tot_recv_snapshot = curr_recvtotal;
				tp->bg_recv_snapshot = curr_bg_recvtotal;
			}
		}
#endif /* TRAFFIC_MGT */

		for (i = 1; i < TCPT_NTIMERS; i++) {
			if (tp->t_timer[i] != 0) {
				tp->t_timer[i] -= TCP_RETRANSHZ/PR_SLOWHZ;
				if (tp->t_timer[i] <= 0) {
#if TCPDEBUG
					ostate = tp->t_state;
#endif

					tp->t_timer[i] = 0;	/* account for granularity change between tcp_now and slowtimo */
					tp = tcp_timers(tp, i);
					if (tp == NULL)
						goto tpgone;
#if TCPDEBUG
					if (tp->t_inpcb->inp_socket->so_options
					    & SO_DEBUG)
						tcp_trace(TA_USER, ostate, tp,
						    (void *)0,
						    (struct tcphdr *)0,
						    PRU_SLOWTIMO);
#endif
				}
			}
		}
tpgone:
		tcp_unlock(so, 1, 0);
	}

	if (bg_cnt > BG_COUNTER_MAX)
		bg_cnt = 0;

	/*
	 * Second part of tcp_slowtimo: garbage collect sockets/tcpcbs.
	 * We need to acquire the list lock exclusively to do this.
	 */
	if (lck_rw_lock_shared_to_exclusive(pcbinfo->mtx) == FALSE) {
		if (tcp_gc_done == TRUE) {	/* don't sweat it this time; cleanup was done last time */
			tcp_gc_done = FALSE;
			KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
			return;	/* Upgrade failed and lost lock - give up this time. */
		}
		lck_rw_lock_exclusive(pcbinfo->mtx);	/* Upgrade failed, lost lock; now take it again exclusively */
	}
	tcp_gc_done = TRUE;

	/*
	 * Process the items in the current time-wait slot
	 */
#if KDEBUG
	tws_checked = 0;
#endif
	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);

	LIST_FOREACH(inp, &time_wait_slots[cur_tw_slot], inp_list) {
#if KDEBUG
		tws_checked++;
#endif

		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;

		tcp_lock(inp->inp_socket, 1, 0);

		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
			goto twunlock;

		tp = intotcpcb(inp);
		if (tp == NULL)	/* tp already closed, remove from list */
			goto twunlock;

		if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
			tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
			tp->t_rcvtime += N_TIME_WAIT_SLOTS;
		}
		else
			tp->t_timer[TCPT_2MSL] = 0;

		if (tp->t_timer[TCPT_2MSL] == 0) {
			/* That pcb is ready for a close */
			tcp_free_sackholes(tp);
			tp = tcp_close(tp);
		}
twunlock:
		tcp_unlock(inp->inp_socket, 1, 0);
	}

	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
		tcp_garbage_collect(inp, 0);
	}

	/* Now cleanup the time wait ones */
	LIST_FOREACH_SAFE(inp, &time_wait_slots[cur_tw_slot], inp_list, nxt) {
		tcp_garbage_collect(inp, 1);
	}

	if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
		cur_tw_slot = 0;

	lck_rw_done(pcbinfo->mtx);
	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
}

/*
 * Cancel all timers for TCP tp.
 */
void
tcp_canceltimers(tp)
	struct tcpcb *tp;
{
	register int i;

	for (i = 0; i < TCPT_NTIMERS; i++)
		tp->t_timer[i] = 0;
}

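/*
 * Retransmit backoff multipliers, indexed by t_rxtshift.  SYN retransmits
 * back off more gently at first than retransmits on an established
 * connection.
 */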
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 */
struct tcpcb *
tcp_timers(tp, timer)
	register struct tcpcb *tp;
	int timer;
{
	register int rexmt;
	struct socket *so_tmp;
	struct tcptemp *t_template;
	int optlen = 0;

#if TCPDEBUG
	int ostate;
#endif

#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */

	so_tmp = tp->t_inpcb->inp_socket;

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
	 * delete connection control block.
	 * Otherwise (this case shouldn't happen) check again in a bit;
	 * we keep the socket in the main list in that case.
	 */
	case TCPT_2MSL:
		tcp_free_sackholes(tp);
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_state != TCPS_FIN_WAIT_2 &&
		    tp->t_rcvtime < tcp_maxidle) {
			tp->t_timer[TCPT_2MSL] = (u_int32_t)tcp_keepintvl;
		}
		else {
			tp = tcp_close(tp);
			return(tp);
		}
		break;

	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		tcp_free_sackholes(tp);
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			tcpstat.tcps_timeoutdrop++;
			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);
			postevent(so_tmp, 0, EV_TIMEOUT);
			break;
		}

		if (tp->t_rxtshift == 1) {
			/*
			 * first retransmit; record ssthresh and cwnd so they can
			 * be recovered if this turns out to be a "bad" retransmit.
			 * A retransmit is considered "bad" if an ACK for this
			 * segment is received within RTT/2 interval; the assumption
			 * here is that the ACK was already in flight.  See
			 * "On Estimating End-to-End Network Path Properties" by
			 * Allman and Paxson for more details.
			 */
			tp->snd_cwnd_prev = tp->snd_cwnd;
			tp->snd_ssthresh_prev = tp->snd_ssthresh;
			tp->snd_recover_prev = tp->snd_recover;
			if (IN_FASTRECOVERY(tp))
				tp->t_flags |= TF_WASFRECOVERY;
			else
				tp->t_flags &= ~TF_WASFRECOVERY;
			tp->t_badrxtwin = tcp_now + (tp->t_srtt >> (TCP_RTT_SHIFT));
		}
		tcpstat.tcps_rexmttimeo++;
		if (tp->t_state == TCPS_SYN_SENT)
			rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
		else
			rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
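		/* Clamp the backed-off timeout between t_rttmin and TCPTV_REXMTMAX before re-arming. */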
		TCPT_RANGESET(tp->t_rxtcur, rexmt,
		    tp->t_rttmin, TCPTV_REXMTMAX);
		tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

		/*
		 * Check for potential Path MTU Discovery Black Hole
		 */
		if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
			if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && (tp->t_rxtshift == 2)) {
				/*
				 * Enter Path MTU Black-hole Detection mechanism:
				 * - Disable Path MTU Discovery (IP "DF" bit).
				 * - Reduce MTU to a lower value than what we
				 *   negotiated with the peer.
				 */
				tp->t_flags &= ~TF_PMTUD;	/* Disable Path MTU Discovery for now */
				tp->t_flags |= TF_BLACKHOLE;	/* Record that we may have found a black hole */
				optlen = tp->t_maxopd - tp->t_maxseg;
				tp->t_pmtud_saved_maxopd = tp->t_maxopd;	/* Keep track of previous MSS */
				if (tp->t_maxopd > tcp_pmtud_black_hole_mss)
					tp->t_maxopd = tcp_pmtud_black_hole_mss;	/* Reduce the MSS to an intermediary value */
				else {
					tp->t_maxopd =	/* use the default MSS */
#if INET6
					    isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
					    tcp_mssdflt;
				}
				tp->t_maxseg = tp->t_maxopd - optlen;
			}
			/*
			 * If further retransmissions are still unsuccessful with a
			 * lowered MTU, this probably isn't a black hole after all,
			 * so restore the previous MSS and clear the blackhole
			 * detection flags.
			 */
			else {
				if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) {
					tp->t_flags |= TF_PMTUD;
					tp->t_flags &= ~TF_BLACKHOLE;
					optlen = tp->t_maxopd - tp->t_maxseg;
					tp->t_maxopd = tp->t_pmtud_saved_maxopd;
					tp->t_maxseg = tp->t_maxopd - optlen;
				}
			}
		}

		/*
		 * Disable rfc1323 and rfc1644 if we haven't got any response to
		 * our SYN (after we reach the threshold) to work around some
		 * broken terminal servers (most of which have hopefully been
		 * retired) that have bad VJ header compression code which
		 * trashes TCP segments containing unknown-to-them TCP options.
		 */
		if ((tp->t_state == TCPS_SYN_SENT) &&
		    (tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres))
			tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
		/*
		 * If losing, let the lower level know and try for
		 * a better route.  Also, if we backed off this far,
		 * our srtt estimate is probably bogus.  Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#if INET6
			if (isipv6)
				in6_losing(tp->t_inpcb);
			else
#endif /* INET6 */
				in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		tp->snd_nxt = tp->snd_una;
		/*
		 * Note:  We overload snd_recover to function also as the
		 * snd_last variable described in RFC 2582
		 */
		tp->snd_recover = tp->snd_max;
		/*
		 * Force a segment to be sent.
		 */
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If timing a segment in this window, stop the timer.
		 */
		tp->t_rtttime = 0;
		/*
		 * Close the congestion window down to one segment
		 * (we'll open it by one segment for each ack we get).
		 * Since we probably have a window's worth of unacked
		 * data accumulated, this "slow start" keeps us from
		 * dumping all that data as back-to-back packets (which
		 * might overwhelm an intermediate gateway).
		 *
		 * There are two phases to the opening: Initially we
		 * open by one mss on each ack.  This makes the window
		 * size increase exponentially with time.  If the
		 * window is larger than the path can handle, this
		 * exponential growth results in dropped packet(s)
		 * almost immediately.  To get more time between
		 * drops but still "push" the network to take advantage
		 * of improving conditions, we switch from exponential
		 * to linear window opening at some threshold size.
		 * For a threshold, we use half the current window
		 * size, truncated to a multiple of the mss.
		 *
		 * (the minimum cwnd that will give us exponential
		 * growth is 2 mss.  We don't allow the threshold
		 * to go below this.)
		 */
		if (tp->t_state >= TCPS_ESTABLISHED) {
			u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
			if (win < 2)
				win = 2;
			tp->snd_cwnd = tp->t_maxseg;
			tp->snd_ssthresh = win * tp->t_maxseg;
			tp->t_bytes_acked = 0;
			tp->t_dupacks = 0;
			tp->t_unacksegs = 0;
		}
		EXIT_FASTRECOVERY(tp);
		(void) tcp_output(tp);
		break;

	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed.  After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 */
		if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (tp->t_rcvtime >= tcp_maxpersistidle ||
		    tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
			tcpstat.tcps_persistdrop++;
			so_tmp = tp->t_inpcb->inp_socket;
			tp = tcp_drop(tp, ETIMEDOUT);
			postevent(so_tmp, 0, EV_TIMEOUT);
			break;
		}
		tcp_setpersist(tp);
		tp->t_force = 1;
		tp->t_unacksegs = 0;
		(void) tcp_output(tp);
		tp->t_force = 0;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		tcpstat.tcps_keeptimeo++;
		if (tp->t_state < TCPS_ESTABLISHED)
			goto dropit;
		if ((always_keepalive ||
		    tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
		    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
			if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle)
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			tcpstat.tcps_keepprobe++;
			t_template = tcp_maketemplate(tp);
			if (t_template) {
				unsigned int ifscope;

				if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
					ifscope = tp->t_inpcb->inp_boundif;
				else
					ifscope = IFSCOPE_NONE;

				tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0, ifscope);
				(void) m_free(dtom(t_template));
			}
			tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
		} else
			tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
		break;

#if TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	dropit:
		tcpstat.tcps_keepdrops++;
		tp = tcp_drop(tp, ETIMEDOUT);
		postevent(so_tmp, 0, EV_TIMEOUT);
		break;
	}
	return (tp);
}