bsd/netinet/tcp_timer.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *    may be used to endorse or promote products derived from this software
  46  *    without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  *      @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
  61  * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
  62  */
  63
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/kernel.h>
  68 #include <sys/mbuf.h>
  69 #include <sys/sysctl.h>
  70 #include <sys/socket.h>
  71 #include <sys/socketvar.h>
  72 #include <sys/protosw.h>
  73 #include <kern/locks.h>
  74
  75 #include <kern/cpu_number.h>    /* before tcp_seq.h, for tcp_random18() */
  76
  77 #include <net/route.h>
  78
  79 #include <netinet/in.h>
  80 #include <netinet/in_systm.h>
  81 #include <netinet/in_pcb.h>
  82 #if INET6
  83 #include <netinet6/in6_pcb.h>
  84 #endif
  85 #include <netinet/ip_var.h>
  86 #include <netinet/tcp.h>
  87 #include <netinet/tcp_fsm.h>
  88 #include <netinet/tcp_seq.h>
  89 #include <netinet/tcp_timer.h>
  90 #include <netinet/tcp_var.h>
  91 #include <netinet/tcpip.h>
  92 #if TCPDEBUG
  93 #include <netinet/tcp_debug.h>
  94 #endif
  95 #include <sys/kdebug.h>
  96
/*
 * Socket event notification routine defined outside this file.
 * tcp_timers() calls it with EV_TIMEOUT after dropping a connection.
 */
extern void postevent(struct socket *, struct sockbuf *,
                                               int);
/* kdebug codes logged at entry/exit of tcp_fasttimo() and tcp_slowtimo() */
#define DBG_FNC_TCP_FAST        NETDBG_CODE(DBG_NETTCP, (5 << 8))
#define DBG_FNC_TCP_SLOW        NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1)

/*
 * Threshold used by the TRAFFIC_MGT code in tcp_slowtimo(): it is compared
 * against a percentage (0-100) computed from the receive counters to decide
 * when a background socket becomes, or stops being, suppressed.
 */
static int      background_io_trigger = 5;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW,
    &background_io_trigger, 0, "Background IO Trigger Setting");
 105
 106 /*
 107  * NOTE - WARNING
 108  *
 109  *
 110  *
 111  *
 112  */
 113 static int
 114 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
 115 {
 116 #pragma unused(arg1, arg2)
 117         int error, s, tt;
 118
 119         tt = *(int *)oidp->oid_arg1;
 120         s = tt * 1000 / TCP_RETRANSHZ;;
 121
 122         error = sysctl_handle_int(oidp, &s, 0, req);
 123         if (error || !req->newptr)
 124                 return (error);
 125
 126         tt = s * TCP_RETRANSHZ / 1000;
 127         if (tt < 1)
 128                 return (EINVAL);
 129
 130         *(int *)oidp->oid_arg1 = tt;
 131         return (0);
 132 }
 133
/*
 * Timer tunables.  The four variables exported through sysctl_msec_to_ticks
 * are stored in TCP_RETRANSHZ ticks and presented to sysctl callers in
 * milliseconds.
 */
int     tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");

int     tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");

/* interval between keep-alive probes; also a factor of tcp_maxidle */
int     tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");

int     tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

/* when non-zero, the keep-alive timer probes even without SO_KEEPALIVE */
static int      always_keepalive = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");

/* max idle probes; multiplied by tcp_keepintvl to obtain tcp_maxidle */
static int      tcp_keepcnt = TCPTV_KEEPCNT;
/* TRUE once tcp_slowtimo() has performed garbage collection of "used" sockets */
static int      tcp_gc_done = FALSE;
/* max idle time in persist; checked by the TCPT_PERSIST case of tcp_timers() */
int     tcp_maxpersistidle;
/* recomputed on every tcp_slowtimo() pass as tcp_keepcnt * tcp_keepintvl */
int     tcp_maxidle;

/*
 * Time-wait wheel: one pcb list per slot.  tcp_slowtimo() services the
 * slot indexed by cur_tw_slot and then advances the index, wrapping at
 * N_TIME_WAIT_SLOTS.
 */
struct  inpcbhead       time_wait_slots[N_TIME_WAIT_SLOTS];
int             cur_tw_slot = 0;

/* NOTE(review): not referenced by any code visible in this file */
u_long          *delack_bitmask;

void    add_to_time_wait_locked(struct tcpcb *tp);
void    add_to_time_wait(struct tcpcb *tp) ;
 168
 169
 170 void    add_to_time_wait_locked(struct tcpcb *tp)
 171 {
 172         int             tw_slot;
 173     struct inpcbinfo *pcbinfo   = &tcbinfo;
 174
 175         /* pcb list should be locked when we get here */
 176         lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE);
 177
 178         LIST_REMOVE(tp->t_inpcb, inp_list);
 179
 180         if (tp->t_timer[TCPT_2MSL] <= 0)
 181             tp->t_timer[TCPT_2MSL] = 1;
 182
 183         /*
 184          * Because we're pulling this pcb out of the main TCP pcb list,
 185          * we need to recalculate the TCPT_2MSL timer value for tcp_slowtimo
 186          * higher timer granularity.
 187          */
 188
 189         tp->t_timer[TCPT_2MSL] = (tp->t_timer[TCPT_2MSL] / TCP_RETRANSHZ) * PR_SLOWHZ;
 190         tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ;
 191
 192         tp->t_rcvtime += tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1);
 193
 194         tw_slot = (tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot;
 195         if (tw_slot >= N_TIME_WAIT_SLOTS)
 196             tw_slot -= N_TIME_WAIT_SLOTS;
 197
 198         LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
 199 }
 200
 201 void    add_to_time_wait(struct tcpcb *tp)
 202 {
 203         struct inpcbinfo *pcbinfo               = &tcbinfo;
 204
 205         if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
 206                 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
 207                 lck_rw_lock_exclusive(pcbinfo->mtx);
 208                 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
 209         }
 210         add_to_time_wait_locked(tp);
 211         lck_rw_done(pcbinfo->mtx);
 212 }
 213
 214
 215
 216
 217 /*
 218  * Fast timeout routine for processing delayed acks
 219  */
 220 void
 221 tcp_fasttimo()
 222 {
 223     struct inpcb *inp;
 224     register struct tcpcb *tp;
 225     struct socket *so;
 226 #if TCPDEBUG
 227     int ostate;
 228 #endif
 229
 230
 231     struct inpcbinfo *pcbinfo   = &tcbinfo;
 232
 233     int delack_done = 0;
 234
 235     KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_START, 0,0,0,0,0);
 236
 237
 238     lck_rw_lock_shared(pcbinfo->mtx);
 239
 240     /* Walk the list of valid tcpcbs and send ACKS on the ones with DELACK bit set */
 241
 242     LIST_FOREACH(inp, &tcb, inp_list) {
 243
 244         so = inp->inp_socket;
 245
 246         if (so == &tcbinfo.nat_dummy_socket)
 247                 continue;
 248
 249         if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
 250                 continue;
 251
 252         tcp_lock(so, 1, 0);
 253
 254         if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING)  && so->so_usecount == 1) {
 255                 tcp_unlock(so, 1, 0);
 256                 continue;
 257         }
 258
 259         tp = intotcpcb(inp);
 260
 261         if (tp == 0 || tp->t_state == TCPS_LISTEN) {
 262                 tcp_unlock(so, 1, 0);
 263                 continue;
 264         }
 265
 266
 267         /* Only run the retransmit timer in that case */
 268         if (tp->t_timer[0] && --tp->t_timer[0] == 0) {
 269                 tp = tcp_timers(tp, 0);
 270                 if (tp == NULL)
 271                         goto tpgone;
 272         }
 273
 274         /* TCP pcb  timers following the tcp_now clock rate */
 275
 276         tp->t_rcvtime++;
 277         tp->t_starttime++;
 278         if (tp->t_rtttime)
 279                 tp->t_rtttime++;
 280
 281         /*
 282          * Process delayed acks (if enabled) according to PR_FASTHZ, not the retrans timer
 283          */
 284
 285         if (tcp_delack_enabled && (tcp_now % (TCP_RETRANSHZ/PR_FASTHZ)) && tp->t_flags & TF_DELACK) {
 286                 delack_done++;
 287                 tp->t_flags &= ~TF_DELACK;
 288                 tp->t_flags |= TF_ACKNOW;
 289                 tcpstat.tcps_delack++;
 290                 tp->t_unacksegs = 0;
 291                 (void) tcp_output(tp);
 292         }
 293 tpgone:
 294         tcp_unlock(so, 1, 0);
 295     }
 296     KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_done, 0, tcpstat.tcps_delack,0,0);
 297     lck_rw_done(pcbinfo->mtx);
 298
 299     tcp_now++;
 300     timeout(tcp_fasttimo, 0, hz/TCP_RETRANSHZ);
 301 }
 302
 303 void
 304 tcp_garbage_collect(inp, istimewait)
 305         struct inpcb *inp;
 306         int istimewait;
 307 {
 308         struct socket *so;
 309         struct tcpcb *tp;
 310
 311
 312                 if (inp->inp_socket == &tcbinfo.nat_dummy_socket)
 313                                 return;
 314
 315
 316                 if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if still in use */
 317                         return;
 318
 319                 so = inp->inp_socket;
 320                 tp = intotcpcb(inp);
 321
 322                 if ((so->so_usecount == 1) &&
 323                         (so->so_flags & SOF_OVERFLOW)) {
 324                                 in_pcbdetach(inp);
 325                                 so->so_usecount--;
 326                                 lck_mtx_unlock(inp->inpcb_mtx);
 327                                 return;
 328                 }
 329                 else {
 330                         if (inp->inp_wantcnt != WNT_STOPUSING) {
 331                                 lck_mtx_unlock(inp->inpcb_mtx);
 332                                 return;
 333                         }
 334                 }
 335
 336
 337                 if (so->so_usecount == 0)
 338                         in_pcbdispose(inp);
 339                 else {
 340                         /* Special case:
 341                          * - Check for embryonic socket stuck on listener queue (4023660)
 342                          * - overflowed socket dropped from the listening queue
 343                          * and dispose of remaining reference
 344                          */
 345                         if ((so->so_usecount == 1) &&
 346                           (((tp->t_state == TCPS_CLOSED) && (so->so_head != NULL) && (so->so_state & SS_INCOMP)) ||
 347                                 (istimewait && (so->so_flags & SOF_OVERFLOW)))) {
 348                                         so->so_usecount--;
 349                                         in_pcbdispose(inp);
 350                         } else
 351                                 lck_mtx_unlock(inp->inpcb_mtx);
 352                 }
 353 }
 354
 355 static int bg_cnt = 0;
 356 #define BG_COUNTER_MAX 3
 357
 358 void
 359 tcp_slowtimo()
 360 {
 361         struct inpcb *inp, *nxt;
 362         struct tcpcb *tp;
 363         struct socket *so;
 364         int i;
 365 #if TCPDEBUG
 366         int ostate;
 367 #endif
 368
 369         static int tws_checked = 0;
 370
 371         struct inpcbinfo *pcbinfo               = &tcbinfo;
 372
 373         KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);
 374
 375         tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
 376
 377         lck_rw_lock_shared(pcbinfo->mtx);
 378
 379         bg_cnt++;
 380
 381         LIST_FOREACH(inp, &tcb, inp_list) {
 382
 383                 so = inp->inp_socket;
 384
 385                 if (so == &tcbinfo.nat_dummy_socket)
 386                         continue;
 387
 388                 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
 389                         continue;
 390
 391                 tcp_lock(so, 1, 0);
 392
 393                 if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING)  && so->so_usecount == 1) {
 394                         tcp_unlock(so, 1, 0);
 395                         continue;
 396                 }
 397                 tp = intotcpcb(inp);
 398                 if (tp == 0 || tp->t_state == TCPS_LISTEN) {
 399                         tcp_unlock(so, 1, 0);
 400                         continue;
 401                 }
 402
 403                 tp = intotcpcb(inp);
 404
 405                 if (tp == 0 || tp->t_state == TCPS_LISTEN)
 406                         goto tpgone;
 407
 408 #if TRAFFIC_MGT
 409                 if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND &&
 410                         bg_cnt > BG_COUNTER_MAX) {
 411                         u_int32_t       curr_recvtotal = tcpstat.tcps_rcvtotal;
 412                         u_int32_t       curr_bg_recvtotal = tcpstat.tcps_bg_rcvtotal;
 413                         u_int32_t       bg_recvdiff = curr_bg_recvtotal - tp->bg_recv_snapshot;
 414                         u_int32_t       tot_recvdiff = curr_recvtotal - tp->tot_recv_snapshot;
 415                         u_int32_t       fg_recv_change = tot_recvdiff - bg_recvdiff;
 416                         u_int32_t       recv_change;
 417
 418                         if (!(so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_SUPPRESSED)) {
 419                                 if (tot_recvdiff)
 420                                         recv_change = (fg_recv_change * 100) / tot_recvdiff;
 421                                 else
 422                                         recv_change = 0;
 423
 424                                 if (recv_change > background_io_trigger) {
 425                                         so->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BG_SUPPRESSED;
 426                                 }
 427
 428                                 tp->tot_recv_snapshot = curr_recvtotal;
 429                                 tp->bg_recv_snapshot = curr_bg_recvtotal;
 430                         }
 431                         else {  // SUPPRESSED
 432                                 // this allows for bg traffic to subside before we start measuring total traffic change
 433                                 if (tot_recvdiff)
 434                                         recv_change = (bg_recvdiff * 100) / tot_recvdiff;
 435                                 else
 436                                         recv_change = 0;
 437
 438                                 if (recv_change < background_io_trigger) {
 439                                         // Draconian for now: if there is any change at all, keep suppressed
 440                                         if (!tot_recvdiff) {
 441                                                 so->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BG_SUPPRESSED;
 442                                                 tp->t_unacksegs = 0;
 443                                                 (void) tcp_output(tp);  // open window
 444                                         }
 445                                 }
 446
 447                                 tp->tot_recv_snapshot = curr_recvtotal;
 448                                 tp->bg_recv_snapshot = curr_bg_recvtotal;
 449                         }
 450                 }
 451 #endif /* TRAFFIC_MGT */
 452
 453                 for (i = 1; i < TCPT_NTIMERS; i++) {
 454                         if (tp->t_timer[i] != 0) {
 455                                 tp->t_timer[i] -= TCP_RETRANSHZ/PR_SLOWHZ;
 456                                 if (tp->t_timer[i] <=  0) {
 457 #if TCPDEBUG
 458                                         ostate = tp->t_state;
 459 #endif
 460
 461                                         tp->t_timer[i] = 0; /* account for granularity change between tcp_now and slowtimo */
 462                                         tp = tcp_timers(tp, i);
 463                                         if (tp == NULL)
 464                                                 goto tpgone;
 465 #if TCPDEBUG
 466                                         if (tp->t_inpcb->inp_socket->so_options
 467                                             & SO_DEBUG)
 468                                                 tcp_trace(TA_USER, ostate, tp,
 469                                                           (void *)0,
 470                                                           (struct tcphdr *)0,
 471                                                           PRU_SLOWTIMO);
 472 #endif
 473                                 }
 474                         }
 475                 }
 476 tpgone:
 477                 tcp_unlock(so, 1, 0);
 478         }
 479
 480         if (bg_cnt > 3)
 481                 bg_cnt = 0;
 482
 483         /* Second part of tcp_slowtimo: garbage collect socket/tcpcb
 484          * We need to acquire the list lock exclusively to do this
 485          */
 486
 487         if (lck_rw_lock_shared_to_exclusive(pcbinfo->mtx) == FALSE) {
 488                 if (tcp_gc_done == TRUE) {      /* don't sweat it this time. cleanup was done last time */
 489                         tcp_gc_done = FALSE;
 490                         KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
 491                         return; /* Upgrade failed and lost lock - give up this time. */
 492                 }
 493                 lck_rw_lock_exclusive(pcbinfo->mtx);    /* Upgrade failed, lost lock now take it again exclusive */
 494         }
 495         tcp_gc_done = TRUE;
 496
 497         /*
 498          * Process the items in the current time-wait slot
 499          */
 500 #if  KDEBUG
 501         tws_checked = 0;
 502 #endif
 503         KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);
 504
 505         LIST_FOREACH(inp, &time_wait_slots[cur_tw_slot], inp_list) {
 506 #if KDEBUG
 507                 tws_checked++;
 508 #endif
 509
 510                 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
 511                         continue;
 512
 513                 tcp_lock(inp->inp_socket, 1, 0);
 514
 515                 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
 516                         goto twunlock;
 517
 518                 tp = intotcpcb(inp);
 519                 if (tp == NULL)  /* tp already closed, remove from list */
 520                         goto twunlock;
 521
 522                 if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
 523                     tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
 524                     tp->t_rcvtime += N_TIME_WAIT_SLOTS;
 525                 }
 526                 else
 527                     tp->t_timer[TCPT_2MSL] = 0;
 528
 529                 if (tp->t_timer[TCPT_2MSL] == 0)  {
 530
 531                         /* That pcb is ready for a close */
 532                         tcp_free_sackholes(tp);
 533                         tp = tcp_close(tp);
 534                 }
 535 twunlock:
 536                 tcp_unlock(inp->inp_socket, 1, 0);
 537         }
 538
 539
 540         LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
 541                 tcp_garbage_collect(inp, 0);
 542         }
 543
 544         /* Now cleanup the time wait ones */
 545         LIST_FOREACH_SAFE(inp, &time_wait_slots[cur_tw_slot], inp_list, nxt) {
 546                 tcp_garbage_collect(inp, 1);
 547         }
 548
 549         if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
 550                 cur_tw_slot = 0;
 551
 552         lck_rw_done(pcbinfo->mtx);
 553         KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
 554 }
 555
 556 /*
 557  * Cancel all timers for TCP tp.
 558  */
 559 void
 560 tcp_canceltimers(tp)
 561         struct tcpcb *tp;
 562 {
 563         register int i;
 564
 565         for (i = 0; i < TCPT_NTIMERS; i++)
 566                 tp->t_timer[i] = 0;
 567 }
 568
/*
 * Retransmit backoff multipliers, indexed by tp->t_rxtshift.
 * tcp_timers() multiplies TCP_REXMTVAL(tp) by an entry of tcp_syn_backoff[]
 * while the connection is in SYN_SENT, and by an entry of tcp_backoff[]
 * in every other state.
 */
int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int     tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

/* used by the persist timer to bound how long an unresponsive peer is probed */
static int tcp_totbackoff = 511;        /* sum of tcp_backoff[] */
 576
 577 /*
 578  * TCP timer processing.
 579  */
 580 struct tcpcb *
 581 tcp_timers(tp, timer)
 582         register struct tcpcb *tp;
 583         int timer;
 584 {
 585         register int rexmt;
 586         struct socket *so_tmp;
 587         struct inpcbinfo *pcbinfo               = &tcbinfo;
 588         struct tcptemp *t_template;
 589
 590 #if TCPDEBUG
 591         int ostate;
 592 #endif
 593
 594 #if INET6
 595         int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
 596 #endif /* INET6 */
 597
 598         so_tmp = tp->t_inpcb->inp_socket;
 599
 600         switch (timer) {
 601
 602         /*
 603          * 2 MSL timeout in shutdown went off.  If we're closed but
 604          * still waiting for peer to close and connection has been idle
 605          * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
 606          * delete connection control block.
 607          * Otherwise, (this case shouldn't happen) check again in a bit
 608          * we keep the socket in the main list in that case.
 609          */
 610         case TCPT_2MSL:
 611                 tcp_free_sackholes(tp);
 612                 if (tp->t_state != TCPS_TIME_WAIT &&
 613                     tp->t_state != TCPS_FIN_WAIT_2 &&
 614                     tp->t_rcvtime < tcp_maxidle) {
 615                         tp->t_timer[TCPT_2MSL] = (unsigned long)tcp_keepintvl;
 616                 }
 617                 else {
 618                         tp = tcp_close(tp);
 619                         return(tp);
 620                 }
 621                 break;
 622
 623         /*
 624          * Retransmission timer went off.  Message has not
 625          * been acked within retransmit interval.  Back off
 626          * to a longer retransmit interval and retransmit one segment.
 627          */
 628         case TCPT_REXMT:
 629                 tcp_free_sackholes(tp);
 630                 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 631                         tp->t_rxtshift = TCP_MAXRXTSHIFT;
 632                         tcpstat.tcps_timeoutdrop++;
 633                         tp = tcp_drop(tp, tp->t_softerror ?
 634                             tp->t_softerror : ETIMEDOUT);
 635                         postevent(so_tmp, 0, EV_TIMEOUT);
 636                         break;
 637                 }
 638
 639                 if (tp->t_rxtshift == 1) {
 640                         /*
 641                          * first retransmit; record ssthresh and cwnd so they can
 642                          * be recovered if this turns out to be a "bad" retransmit.
 643                          * A retransmit is considered "bad" if an ACK for this
 644                          * segment is received within RTT/2 interval; the assumption
 645                          * here is that the ACK was already in flight.  See
 646                          * "On Estimating End-to-End Network Path Properties" by
 647                          * Allman and Paxson for more details.
 648                          */
 649                         tp->snd_cwnd_prev = tp->snd_cwnd;
 650                         tp->snd_ssthresh_prev = tp->snd_ssthresh;
 651                         tp->snd_recover_prev = tp->snd_recover;
 652                         if (IN_FASTRECOVERY(tp))
 653                                   tp->t_flags |= TF_WASFRECOVERY;
 654                         else
 655                                   tp->t_flags &= ~TF_WASFRECOVERY;
 656                         tp->t_badrxtwin = tcp_now  + (tp->t_srtt >> (TCP_RTT_SHIFT));
 657                 }
 658                 tcpstat.tcps_rexmttimeo++;
 659                 if (tp->t_state == TCPS_SYN_SENT)
 660                         rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
 661                 else
 662                         rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 663                 TCPT_RANGESET(tp->t_rxtcur, rexmt,
 664                         tp->t_rttmin, TCPTV_REXMTMAX);
 665                 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 666
 667                 /*
 668                  * Disable rfc1323 and rfc1644 if we havn't got any response to
 669                  * our third SYN to work-around some broken terminal servers
 670                  * (most of which have hopefully been retired) that have bad VJ
 671                  * header compression code which trashes TCP segments containing
 672                  * unknown-to-them TCP options.
 673                  */
 674                 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
 675                                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
 676                 /*
 677                  * If losing, let the lower level know and try for
 678                  * a better route.  Also, if we backed off this far,
 679                  * our srtt estimate is probably bogus.  Clobber it
 680                  * so we'll take the next rtt measurement as our srtt;
 681                  * move the current srtt into rttvar to keep the current
 682                  * retransmit times until then.
 683                  */
 684                 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 685 #if INET6
 686                         if (isipv6)
 687                                 in6_losing(tp->t_inpcb);
 688                         else
 689 #endif /* INET6 */
 690                         in_losing(tp->t_inpcb);
 691                         tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 692                         tp->t_srtt = 0;
 693                 }
 694                 tp->snd_nxt = tp->snd_una;
 695                 /*
 696                  * Note:  We overload snd_recover to function also as the
 697                  * snd_last variable described in RFC 2582
 698                  */
 699                 tp->snd_recover = tp->snd_max;
 700                 /*
 701                  * Force a segment to be sent.
 702                  */
 703                 tp->t_flags |= TF_ACKNOW;
 704                 /*
 705                  * If timing a segment in this window, stop the timer.
 706                  */
 707                 tp->t_rtttime = 0;
 708                 /*
 709                  * Close the congestion window down to one segment
 710                  * (we'll open it by one segment for each ack we get).
 711                  * Since we probably have a window's worth of unacked
 712                  * data accumulated, this "slow start" keeps us from
 713                  * dumping all that data as back-to-back packets (which
 714                  * might overwhelm an intermediate gateway).
 715                  *
 716                  * There are two phases to the opening: Initially we
 717                  * open by one mss on each ack.  This makes the window
 718                  * size increase exponentially with time.  If the
 719                  * window is larger than the path can handle, this
 720                  * exponential growth results in dropped packet(s)
 721                  * almost immediately.  To get more time between
 722                  * drops but still "push" the network to take advantage
 723                  * of improving conditions, we switch from exponential
 724                  * to linear window opening at some threshhold size.
 725                  * For a threshhold, we use half the current window
 726                  * size, truncated to a multiple of the mss.
 727                  *
 728                  * (the minimum cwnd that will give us exponential
 729                  * growth is 2 mss.  We don't allow the threshhold
 730                  * to go below this.)
 731                  */
 732                 if (tp->t_state >=  TCPS_ESTABLISHED) {
 733                         u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
 734                         if (win < 2)
 735                                 win = 2;
 736                         tp->snd_cwnd = tp->t_maxseg;
 737                         tp->snd_ssthresh = win * tp->t_maxseg;
 738                         tp->t_bytes_acked = 0;
 739                         tp->t_dupacks = 0;
 740                         tp->t_unacksegs = 0;
 741                 }
 742                 EXIT_FASTRECOVERY(tp);
 743                 (void) tcp_output(tp);
 744                 break;
 745
 746         /*
 747          * Persistance timer into zero window.
 748          * Force a byte to be output, if possible.
 749          */
 750         case TCPT_PERSIST:
 751                 tcpstat.tcps_persisttimeo++;
 752                 /*
 753                  * Hack: if the peer is dead/unreachable, we do not
 754                  * time out if the window is closed.  After a full
 755                  * backoff, drop the connection if the idle time
 756                  * (no responses to probes) reaches the maximum
 757                  * backoff that we would use if retransmitting.
 758                  */
 759                 if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 760                     (tp->t_rcvtime >= tcp_maxpersistidle ||
 761                     tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 762                         tcpstat.tcps_persistdrop++;
 763                         so_tmp = tp->t_inpcb->inp_socket;
 764                         tp = tcp_drop(tp, ETIMEDOUT);
 765                         postevent(so_tmp, 0, EV_TIMEOUT);
 766                         break;
 767                 }
 768                 tcp_setpersist(tp);
 769                 tp->t_force = 1;
 770                 tp->t_unacksegs = 0;
 771                 (void) tcp_output(tp);
 772                 tp->t_force = 0;
 773                 break;
 774
 775         /*
 776          * Keep-alive timer went off; send something
 777          * or drop connection if idle for too long.
 778          */
 779         case TCPT_KEEP:
 780                 tcpstat.tcps_keeptimeo++;
 781                 if (tp->t_state < TCPS_ESTABLISHED)
 782                         goto dropit;
 783                 if ((always_keepalive ||
 784                     tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
 785                     (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
 786                         if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (unsigned long)tcp_maxidle)
 787                                 goto dropit;
 788                         /*
 789                          * Send a packet designed to force a response
 790                          * if the peer is up and reachable:
 791                          * either an ACK if the connection is still alive,
 792                          * or an RST if the peer has closed the connection
 793                          * due to timeout or reboot.
 794                          * Using sequence number tp->snd_una-1
 795                          * causes the transmitted zero-length segment
 796                          * to lie outside the receive window;
 797                          * by the protocol spec, this requires the
 798                          * correspondent TCP to respond.
 799                          */
 800                         tcpstat.tcps_keepprobe++;
 801                         t_template = tcp_maketemplate(tp);
 802                         if (t_template) {
 803                                 tcp_respond(tp, t_template->tt_ipgen,
 804                                     &t_template->tt_t, (struct mbuf *)NULL,
 805                                     tp->rcv_nxt, tp->snd_una - 1, 0, NULL);
 806                                 (void) m_free(dtom(t_template));
 807                         }
 808                         tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
 809                 } else
 810                         tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
 811                 break;
 812
 813 #if TCPDEBUG
 814         if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 815                 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 816                           PRU_SLOWTIMO);
 817 #endif
 818         dropit:
 819                 tcpstat.tcps_keepdrops++;
 820                 tp = tcp_drop(tp, ETIMEDOUT);
 821                 postevent(so_tmp, 0, EV_TIMEOUT);
 822                 break;
 823         }
 824         return (tp);
 825 }