diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c
index 706ec823c640ea62fde4e844439b3bd3f499c6fd..b1ac3138b186b3b102ab02419ae4442d0c95eff5 100644
--- a/bsd/netinet/tcp_timer.c
+++ b/bsd/netinet/tcp_timer.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 #include <sys/mcache.h>
 #include <sys/queue.h>
 #include <kern/locks.h>
-
 #include <kern/cpu_number.h>   /* before tcp_seq.h, for tcp_random18() */
+#include <mach/boolean.h>
 
 #include <net/route.h>
+#include <net/if_var.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #endif
 #include <sys/kdebug.h>
 #include <mach/sdt.h>
+#include <netinet/mptcp_var.h>
 
 extern void postevent(struct socket *, struct sockbuf *,
                                                int);
@@ -121,6 +123,12 @@ extern void postevent(struct socket *, struct sockbuf *,
                panic("Bad link elm %p prev->next != elm", (elm));      \
 } while(0)
 
+/* tcp timer list */
+struct tcptimerlist tcp_timer_list;
+
+/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
+struct tcptailq tcp_tw_tailq;
+
 static int     background_io_trigger = 5;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED,
     &background_io_trigger, 0, "Background IO Trigger Setting");
@@ -158,6 +166,10 @@ int        tcp_keepintvl;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
 
+int    tcp_keepcnt;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+       &tcp_keepcnt, 0, "number of times to repeat keepalive");
+
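
Note: the new keepcnt knob rounds out the keepalive trio (idle time, probe interval, probe count). For orientation, a userland sketch of the per-socket equivalents; this assumes the TCP_KEEPINTVL and TCP_KEEPCNT socket options introduced alongside this sysctl, so treat the option names as an assumption rather than part of this diff:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    /* Sketch: probe an idle connection 4 times, 10 s apart, after 60 s. */
    static int
    enable_keepalive(int s)
    {
            int on = 1, idle = 60, intvl = 10, cnt = 4;

            if (setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof (on)) != 0)
                    return (-1);
            /* idle time before the first probe, in seconds (Darwin spelling) */
            if (setsockopt(s, IPPROTO_TCP, TCP_KEEPALIVE, &idle, sizeof (idle)) != 0)
                    return (-1);
            /* interval between probes, and how many to send before dropping */
            if (setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof (intvl)) != 0)
                    return (-1);
            return (setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof (cnt)));
    }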
 int    tcp_msl;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
@@ -195,6 +207,12 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CT
     &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before "
     "TCP disables rfc1323 and rfc1644 during the rest of attempts");
 
+/* A higher threshold on local connections for disabling RFC 1323 options */
+static int tcp_broken_peer_syn_rxmit_thres_local = 10;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local, 
+       CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0,
+       "Number of retransmitted SYNs before disabling RFC 1323 options on local connections");
+
 static int tcp_timer_advanced = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED,
     &tcp_timer_advanced, 0, "Number of times one of the timers was advanced");
@@ -212,12 +230,11 @@ int       tcp_pmtud_black_hole_mss = 1200 ;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED,
     &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS");
 
-static int     tcp_keepcnt = TCPTV_KEEPCNT;
-static int     tcp_gc_done = FALSE;    /* perfromed garbage collection of "used" sockets */
+/* performed garbage collection of "used" sockets */
+static boolean_t tcp_gc_done = FALSE;
+
        /* max idle probes */
 int    tcp_maxpersistidle;
-       /* max idle time in persist */
-int    tcp_maxidle;
 
 /* TCP delack timer is set to 100 ms. Since the processing of timer list in fast
  * mode will happen no faster than 100 ms, the delayed ack timer will fire somewhere
@@ -225,11 +242,12 @@ int       tcp_maxidle;
  */
 int    tcp_delack = TCP_RETRANSHZ / 10;
 
-struct inpcbhead       time_wait_slots[N_TIME_WAIT_SLOTS];
-int            cur_tw_slot = 0;
-
-/* tcp timer list */
-struct tcptimerlist tcp_timer_list;
+#if MPTCP
+/*
+ * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
+ */
+int    tcp_jack_rxmt = TCP_RETRANSHZ / 2;
+#endif /* MPTCP */
 
 /* The frequency of running through the TCP timer list in 
  * fast and slow mode can be configured.
@@ -247,6 +265,8 @@ static void tcp_sched_timerlist(uint32_t offset);
 static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index);
 static void tcp_sched_timers(struct tcpcb *tp);
 static inline void tcp_set_lotimer_index(struct tcpcb *);
+static void tcp_rexmt_save_state(struct tcpcb *tp);
+void tcp_remove_from_time_wait(struct inpcb *inp);
 
 /* Macro to compare two timers. If there is a reset of the sign bit, it is 
  * safe to assume that the timer has wrapped around. By doing signed comparison,
@@ -262,60 +282,79 @@ timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
 /* Returns true if the timer is on the timer list */
 #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
 
+/* Run the TCP timerlist at least once every hour */
+#define        TCP_TIMERLIST_MAX_OFFSET        (60 * 60 * TCP_RETRANSHZ)
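
Note: to make the wraparound argument concrete, here is a standalone rendering of timer_diff()'s signed-comparison idea; with 32-bit unsigned ticks the result stays correct as long as the two timers are less than 2^31 ticks apart:

    #include <stdint.h>

    /* Signed view of an unsigned tick difference: tolerates clock wrap. */
    static int32_t
    timer_diff_sketch(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2)
    {
            return ((int32_t)((t1 + toff1) - (t2 + toff2)));
    }

    /*
     * Example: if t1 wrapped to 5 while t2 sits at UINT32_MAX - 5, the
     * unsigned difference is huge, but the signed result is 11: t1 is
     * correctly seen as 11 ticks later than t2.
     */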
 
-void   add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
+static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
 void   add_to_time_wait(struct tcpcb *tp, uint32_t delay);
 
-static void tcp_garbage_collect(struct inpcb *, int);
+static boolean_t tcp_garbage_collect(struct inpcb *, int);
 
-void   add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) 
+/*
+ * Add to tcp timewait list, delay is given in milliseconds.
+ */
+static void
+add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
 {
-       int             tw_slot;
-       struct inpcbinfo *pcbinfo       = &tcbinfo;
+       struct inpcbinfo *pcbinfo = &tcbinfo;
+       struct inpcb *inp = tp->t_inpcb;
        uint32_t timer;
 
-       /* pcb list should be locked when we get here */        
-       lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE);
-
-       LIST_REMOVE(tp->t_inpcb, inp_list);
-
-       /* if (tp->t_timer[TCPT_2MSL] <= 0) 
-           tp->t_timer[TCPT_2MSL] = 1; */
-
-       /*
-        * Because we're pulling this pcb out of the main TCP pcb list,
-        * we need to recalculate the TCPT_2MSL timer value for tcp_slowtimo
-        * higher timer granularity.
-        */
+       /* pcb list should be locked when we get here */
+       lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
 
-       timer = (delay / TCP_RETRANSHZ) * PR_SLOWHZ;
-       tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ;
+       /* We may get here multiple times, so check */
+       if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
+               pcbinfo->ipi_twcount++;
+               inp->inp_flags2 |= INP2_TIMEWAIT;
+               
+               /* Remove from global inp list */
+               LIST_REMOVE(inp, inp_list);
+       } else {
+               TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
+       }
 
-       tp->t_rcvtime += timer & (N_TIME_WAIT_SLOTS - 1); 
+       /* Compute the time at which this socket can be closed */
+       timer = tcp_now + delay;
+       
+       /* We will use the TCPT_2MSL timer for tracking this delay */
 
-       tw_slot = (timer & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot; 
-       if (tw_slot >= N_TIME_WAIT_SLOTS)
-           tw_slot -= N_TIME_WAIT_SLOTS;
+       if (TIMER_IS_ON_LIST(tp))
+               tcp_remove_timer(tp);
+       tp->t_timer[TCPT_2MSL] = timer;
 
-       LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
+       TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
 }
 
-void   add_to_time_wait(struct tcpcb *tp, uint32_t delay) 
+void
+add_to_time_wait(struct tcpcb *tp, uint32_t delay)
 {
-       struct inpcbinfo *pcbinfo               = &tcbinfo;
-       
-       if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
+       struct inpcbinfo *pcbinfo = &tcbinfo;
+
+       if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
                tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
-               lck_rw_lock_exclusive(pcbinfo->mtx);
+               lck_rw_lock_exclusive(pcbinfo->ipi_lock);
                tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
        }
        add_to_time_wait_locked(tp, delay);
-       lck_rw_done(pcbinfo->mtx);
+       lck_rw_done(pcbinfo->ipi_lock);
+
+       inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
 }
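
Note: for orientation, a sketch of the typical call site; when a connection enters TIME_WAIT, tcp_input.c does essentially this (tcp_msl is kept in TCP_RETRANSHZ ticks, which equal milliseconds):

    add_to_time_wait(tp, 2 * tcp_msl);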
 
-static void
+/* If this is on time wait queue, remove it. */
+void
+tcp_remove_from_time_wait(struct inpcb *inp)
+{
+       struct tcpcb *tp = intotcpcb(inp);
+       if (inp->inp_flags2 & INP2_TIMEWAIT)
+               TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
+}
+
+static boolean_t
 tcp_garbage_collect(struct inpcb *inp, int istimewait)
 {
+       boolean_t active = FALSE;
        struct socket *so;
        struct tcpcb *tp;
 
@@ -329,13 +368,23 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait)
         * overflow sockets that are eligible for garbage collection have
         * their usecounts set to 1.
         */
-       if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(&inp->inpcb_mtx))
-               return;
+       if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
+               return (TRUE);
 
        /* Check again under the lock */
        if (so->so_usecount > 1) {
+               if (inp->inp_wantcnt == WNT_STOPUSING)
+                       active = TRUE;
                lck_mtx_unlock(&inp->inpcb_mtx);
-               return;
+               return (active);
+       }
+
+       if (istimewait &&
+               TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
+               tp->t_state != TCPS_CLOSED) {
+               /* Become a regular mutex */
+               lck_mtx_convert_spin(&inp->inpcb_mtx);
+               tcp_close(tp);
        }
 
        /*
@@ -343,42 +392,46 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait)
         * only if we are called to clean up the time wait slots, since
         * tcp_dropdropablreq() considers a socket to have been fully
         * dropped after add_to_time_wait() is finished.
-        * Also handle the case of connections getting closed by the peer while in the queue as
-        * seen with rdar://6422317
-        * 
+        * Also handle the case of connections getting closed by the peer
+        * while in the queue as seen with rdar://6422317
+        *
         */
-       if (so->so_usecount == 1 && 
+       if (so->so_usecount == 1 &&
            ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
-           ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && (so->so_head != NULL)
-                && ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
-                        (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
+           ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
+           (so->so_head != NULL) &&
+           ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
+           (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
 
                if (inp->inp_state != INPCB_STATE_DEAD) {
                        /* Become a regular mutex */
                        lck_mtx_convert_spin(&inp->inpcb_mtx);
 #if INET6
-                       if (INP_CHECK_SOCKAF(so, AF_INET6))
+                       if (SOCK_CHECK_DOM(so, PF_INET6))
                                in6_pcbdetach(inp);
                        else
 #endif /* INET6 */
-                       in_pcbdetach(inp);
+                               in_pcbdetach(inp);
                }
                so->so_usecount--;
+               if (inp->inp_wantcnt == WNT_STOPUSING)
+                       active = TRUE;
                lck_mtx_unlock(&inp->inpcb_mtx);
-               return;
+               return (active);
        } else if (inp->inp_wantcnt != WNT_STOPUSING) {
                lck_mtx_unlock(&inp->inpcb_mtx);
-               return;
+               return (FALSE);
        }
 
        /*
-        * We get here because the PCB is no longer searchable (WNT_STOPUSING);
-        * detach (if needed) and dispose if it is dead (usecount is 0).  This
-        * covers all cases, including overflow sockets and those that are
-        * considered as "embryonic", i.e. created by sonewconn() in TCP input
-        * path, and have not yet been committed.  For the former, we reduce
-        * the usecount to 0 as done by the code above.  For the latter, the
-        * usecount would have reduced to 0 as part calling soabort() when the
+        * We get here because the PCB is no longer searchable 
+        * (WNT_STOPUSING); detach (if needed) and dispose if it is dead 
+        * (usecount is 0).  This covers all cases, including overflow 
+        * sockets and those that are considered as "embryonic", 
+        * i.e. created by sonewconn() in TCP input path, and have 
+                        * not yet been committed.  For the former, we reduce the usecount
+                        * to 0 as done by the code above.  For the latter, the usecount
+                        * would have been reduced to 0 as part of calling soabort() when the
         * socket is dropped at the end of tcp_input().
         */
        if (so->so_usecount == 0) {
@@ -386,113 +439,114 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait)
                        struct tcpcb *, tp, int32_t, TCPS_CLOSED);
                /* Become a regular mutex */
                lck_mtx_convert_spin(&inp->inpcb_mtx);
+
+               /*
+                * If this tp still happens to be on the timer list, 
+                * take it out
+                */
+               if (TIMER_IS_ON_LIST(tp)) {
+                       tcp_remove_timer(tp);
+               }
+
                if (inp->inp_state != INPCB_STATE_DEAD) {
 #if INET6
-                       if (INP_CHECK_SOCKAF(so, AF_INET6))
+                       if (SOCK_CHECK_DOM(so, PF_INET6))
                                in6_pcbdetach(inp);
                        else
 #endif /* INET6 */
-                       in_pcbdetach(inp);
+                               in_pcbdetach(inp);
                }
                in_pcbdispose(inp);
-       } else {
-               lck_mtx_unlock(&inp->inpcb_mtx);
+               return (FALSE);
        }
+
+       lck_mtx_unlock(&inp->inpcb_mtx);
+       return (TRUE);
 }
 
+/*
+ * TCP garbage collector callback (inpcb_timer_func_t).
+ *
+ * Returns the number of pcbs that will need to be gc-ed soon;
+ * returning > 0 will keep the timer active.
+ */
 void
-tcp_slowtimo(void)
+tcp_gc(struct inpcbinfo *ipi)
 {
        struct inpcb *inp, *nxt;
-       struct tcpcb *tp;
+       struct tcpcb *tw_tp, *tw_ntp;
 #if TCPDEBUG
        int ostate;
 #endif
-
 #if  KDEBUG
        static int tws_checked = 0;
 #endif
 
-       struct inpcbinfo *pcbinfo               = &tcbinfo;
-
-       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);
-
-       tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
+       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
-       /* Update tcp_now here as it may get used while processing the slow timer */
+       /*
+        * Update tcp_now here as it may get used while
+        * processing the slow timer.
+        */
        calculate_tcp_clock();
 
-       /* Garbage collect socket/tcpcb: We need to acquire the list lock 
+       /*
+        * Garbage collect socket/tcpcb: We need to acquire the list lock
         * exclusively to do this
         */
 
-       if (lck_rw_try_lock_exclusive(pcbinfo->mtx) == FALSE) {
-               if (tcp_gc_done == TRUE) {      /* don't sweat it this time. cleanup was done last time */
+       if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
+               /* don't sweat it this time; cleanup was done last time */
+               if (tcp_gc_done == TRUE) {
                        tcp_gc_done = FALSE;
-                       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
-                       return; /* Upgrade failed and lost lock - give up this time. */
+                       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
+                           tws_checked, cur_tw_slot, 0, 0, 0);
+                       /* Lock upgrade failed, give up this round */
+                       atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
+                       return;
                }
-               lck_rw_lock_exclusive(pcbinfo->mtx);    /* Upgrade failed, lost lock now take it again exclusive */
+               /* Upgrade failed, lost lock now take it again exclusive */
+               lck_rw_lock_exclusive(ipi->ipi_lock);
        }
        tcp_gc_done = TRUE;
 
-       /*
-        * Process the items in the current time-wait slot
-        */
-#if  KDEBUG
-       tws_checked = 0;
-#endif
-       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);
-
-       LIST_FOREACH(inp, &time_wait_slots[cur_tw_slot], inp_list) {
-#if KDEBUG
-               tws_checked++;
-#endif
-
-               if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) 
-                       continue;
-
-               tcp_lock(inp->inp_socket, 1, 0);
-
-               if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) 
-                       goto twunlock;
-
-               tp = intotcpcb(inp);
-               if (tp == NULL)  /* tp already closed, remove from list */
-                       goto twunlock;
-
-               if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
-                   tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
-                   tp->t_rcvtime += N_TIME_WAIT_SLOTS;
-               }
-               else
-                   tp->t_timer[TCPT_2MSL] = 0;
-
-               if (tp->t_timer[TCPT_2MSL] == 0)  {
+       LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
+               if (tcp_garbage_collect(inp, 0))
+                       atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
+       }
 
-                       /* That pcb is ready for a close */     
-                       tcp_free_sackholes(tp);
-                       tp = tcp_close(tp);
+       /* Now cleanup the time wait ones */
+       TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
+               /*
+                * We check the timestamp here without holding the 
+                * socket lock for better performance. If there are
+                * any pcbs in time-wait, the timer will get rescheduled.
+                * Hence some error in this check can be tolerated.
+                *
+                * Sometimes a socket on time-wait queue can be closed if
+                * 2MSL timer expired but the application still has a
+                * usecount on it. 
+                */
+               if (tw_tp->t_state == TCPS_CLOSED ||  
+                   TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
+                       if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
+                               atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
                }
-twunlock:
-               tcp_unlock(inp->inp_socket, 1, 0);
        }
 
+       /* take into account pcbs that are still in time_wait_slots */
+       atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);
 
-       LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
-               tcp_garbage_collect(inp, 0);
-       }
+       lck_rw_done(ipi->ipi_lock);
 
-       /* Now cleanup the time wait ones */
-       LIST_FOREACH_SAFE(inp, &time_wait_slots[cur_tw_slot], inp_list, nxt) {
-               tcp_garbage_collect(inp, 1);
-       }
+       /* Clean up the socache while we are here */
+       if (so_cache_timer())
+               atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
 
-       if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
-               cur_tw_slot = 0;
-       
-       lck_rw_done(pcbinfo->mtx);
-       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
+       KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
+           cur_tw_slot, 0, 0, 0);
+
+       return;
 }
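
Note: the registration of this callback is outside the excerpt. Assuming the inpcb_timer_func_t plumbing named in the comment above (the field names are an assumption, not shown in this diff), the attach-time wiring would look roughly like:

    /* in tcp_init(): hand the garbage collector to the protocol's
     * pcbinfo so that inpcb_gc_sched() can schedule runs of it */
    tcbinfo.ipi_gc = tcp_gc;
    in_pcbinfo_attach(&tcbinfo);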
 
 /*
@@ -519,6 +573,41 @@ int        tcp_backoff[TCP_MAXRXTSHIFT + 1] =
 
 static int tcp_totbackoff = 511;       /* sum of tcp_backoff[] */
 
+static void tcp_rexmt_save_state(struct tcpcb *tp)
+{
+       u_int32_t fsize;
+       if (TSTMP_SUPPORTED(tp)) {
+               /*
+                * Since timestamps are supported on the connection, 
+                * we can do recovery as described in rfc 4015.
+                */
+               fsize = tp->snd_max - tp->snd_una;
+               tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
+               tp->snd_recover_prev = tp->snd_recover;
+       } else {
+               /*
+                * Timestamp option is not supported on this connection.
+                * Record ssthresh and cwnd so they can
+                * be recovered if this turns out to be a "bad" retransmit.
+                * A retransmit is considered "bad" if an ACK for this 
+                * segment is received within RTT/2 interval; the assumption
+                * here is that the ACK was already in flight.  See 
+                * "On Estimating End-to-End Network Path Properties" by
+                * Allman and Paxson for more details.
+                */
+               tp->snd_cwnd_prev = tp->snd_cwnd;
+               tp->snd_ssthresh_prev = tp->snd_ssthresh;
+               tp->snd_recover_prev = tp->snd_recover;
+               if (IN_FASTRECOVERY(tp))
+                       tp->t_flags |= TF_WASFRECOVERY;
+               else
+                       tp->t_flags &= ~TF_WASFRECOVERY;
+       }
+       tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
+       tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
+       tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
+}
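
Note: the save above is only half the story; if an ACK for the retransmitted data comes back quickly, the timeout is judged spurious and the saved state is rolled back (that path lives in tcp_input.c, not in this diff). A minimal sketch of what such a restore does with the fields recorded here:

    /* Sketch only: undo the congestion-state damage of a bad retransmit. */
    static void
    tcp_rexmt_undo_sketch(struct tcpcb *tp)
    {
            tp->snd_cwnd = tp->snd_cwnd_prev;
            tp->snd_ssthresh = tp->snd_ssthresh_prev;
            tp->snd_recover = tp->snd_recover_prev;
            if (tp->t_flags & TF_WASFRECOVERY)
                    ENTER_FASTRECOVERY(tp);  /* resume interrupted recovery */
    }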
+
 /*
  * TCP timer processing.
  */
@@ -528,7 +617,7 @@ tcp_timers(tp, timer)
        int timer;
 {
        register int rexmt;
-       struct socket *so_tmp;
+       struct socket *so;
        struct tcptemp *t_template;
        int optlen = 0;
        int idle_time = 0;
@@ -541,7 +630,7 @@ tcp_timers(tp, timer)
        int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
 #endif /* INET6 */
 
-       so_tmp = tp->t_inpcb->inp_socket;
+       so = tp->t_inpcb->inp_socket;
        idle_time = tcp_now - tp->t_rcvtime;
 
        switch (timer) {
@@ -558,10 +647,10 @@ tcp_timers(tp, timer)
                tcp_free_sackholes(tp);
                if (tp->t_state != TCPS_TIME_WAIT &&
                    tp->t_state != TCPS_FIN_WAIT_2 &&
-                   ((idle_time > 0) && (idle_time < tcp_maxidle))) {
-                       tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, (u_int32_t)tcp_keepintvl);
-               }
-               else {
+                   ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
+                       tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, 
+                               (u_int32_t)TCP_CONN_KEEPINTVL(tp));
+               } else {
                        tp = tcp_close(tp);
                        return(tp);
                }
@@ -573,7 +662,6 @@ tcp_timers(tp, timer)
         * to a longer retransmit interval and retransmit one segment.
         */
        case TCPT_REXMT:
-               tcp_free_sackholes(tp);
                /* Drop a connection in the retransmit timer
                 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times
                 * 2. If the time spent in this retransmission episode is more than
@@ -582,8 +670,8 @@ tcp_timers(tp, timer)
                 *    retransmitted the FIN 3 times without receiving an ack
                 */
                if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
-                       (tp->rxt_conndroptime > 0 && tp->rxt_start > 0 && 
-                       (tcp_now - tp->rxt_start) >= tp->rxt_conndroptime) ||
+                       (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 && 
+                       (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
                        ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
                        (tp->t_flags & TF_SENTFIN) != 0 &&
                        tp->t_rxtshift >= 4)) {
@@ -594,39 +682,51 @@ tcp_timers(tp, timer)
                                tcpstat.tcps_timeoutdrop++;
                        }
                        tp->t_rxtshift = TCP_MAXRXTSHIFT;
+                       postevent(so, 0, EV_TIMEOUT);                   
+                       soevent(so, 
+                           (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
                        tp = tcp_drop(tp, tp->t_softerror ?
                            tp->t_softerror : ETIMEDOUT);
-                       postevent(so_tmp, 0, EV_TIMEOUT);                       
+
                        break;
                }
 
-               if (tp->t_rxtshift == 1) {
-                       /*
-                        * first retransmit; record ssthresh and cwnd so they can
-                        * be recovered if this turns out to be a "bad" retransmit.
-                        * A retransmit is considered "bad" if an ACK for this 
-                        * segment is received within RTT/2 interval; the assumption
-                        * here is that the ACK was already in flight.  See 
-                        * "On Estimating End-to-End Network Path Properties" by
-                        * Allman and Paxson for more details.
-                        */
-                       tp->snd_cwnd_prev = tp->snd_cwnd;
-                       tp->snd_ssthresh_prev = tp->snd_ssthresh;
-                       tp->snd_recover_prev = tp->snd_recover;
-                       if (IN_FASTRECOVERY(tp))
-                                 tp->t_flags |= TF_WASFRECOVERY;
-                       else
-                                 tp->t_flags &= ~TF_WASFRECOVERY;
-                       tp->t_badrxtwin = tcp_now  + (tp->t_srtt >> (TCP_RTT_SHIFT)); 
+               tcpstat.tcps_rexmttimeo++;
 
-                       /* Set the time at which retransmission on this 
-                        * connection started
-                        */
-                       tp->rxt_start = tcp_now;
+               if (tp->t_rxtshift == 1 && 
+                       tp->t_state == TCPS_ESTABLISHED) {
+                       /* Set the time at which retransmission started. */
+                       tp->t_rxtstart = tcp_now;
+
+                       /* 
+                        * if this is the first retransmit timeout, save
+                        * the state so that we can recover if the timeout
+                        * is spurious.
+                        */ 
+                       tcp_rexmt_save_state(tp);
                }
-               tcpstat.tcps_rexmttimeo++;
-               if (tp->t_state == TCPS_SYN_SENT)
+#if MPTCP
+               if ((tp->t_rxtshift == mptcp_fail_thresh) &&
+                   (tp->t_state == TCPS_ESTABLISHED) &&
+                   (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
+                       mptcp_act_on_txfail(so);
+
+               }
+#endif /* MPTCP */
+
+               if (tp->t_adaptive_wtimo > 0 &&
+                       tp->t_rxtshift > tp->t_adaptive_wtimo &&
+                       TCPS_HAVEESTABLISHED(tp->t_state)) {
+                       /* Send an event to the application */
+                       soevent(so,
+                               (SO_FILT_HINT_LOCKED|
+                               SO_FILT_HINT_ADAPTIVE_WTIMO));
+               }
+
+               if (tp->t_state == TCPS_SYN_SENT) {
                        rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
+                       tp->t_stat.synrxtshift = tp->t_rxtshift;
+               }
                else
                        rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
                TCPT_RANGESET(tp->t_rxtcur, rexmt,
@@ -634,25 +734,33 @@ tcp_timers(tp, timer)
                        TCP_ADD_REXMTSLOP(tp));
                tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
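
Note: the schedule this produces is easy to tabulate. A standalone demo with assumed numbers (a 500 ms base RTO; the backoff table is the {1, 2, 4, 8, 16, 32, 64, ...} declared earlier in this file, whose sum of 511 matches tcp_totbackoff):

    #include <stdio.h>

    /* rexmtval and the 60 s ceiling are assumed stand-ins for
     * TCP_REXMTVAL(tp) and the TCPT_RANGESET clamp. */
    static const int backoff[] =
        { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

    int
    main(void)
    {
            int rexmtval = 500, rexmtmax = 60000;    /* milliseconds */

            for (int shift = 1; shift <= 12; shift++) {
                    int rexmt = rexmtval * backoff[shift];
                    if (rexmt > rexmtmax)
                            rexmt = rexmtmax;
                    printf("rxtshift %2d -> retransmit in %5d ms\n",
                        shift, rexmt);
            }
            return (0);
    }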
 
+               if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
+                       goto fc_output;
+
+               tcp_free_sackholes(tp);
                /*
                 * Check for potential Path MTU Discovery Black Hole 
                 */
 
                if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
-                       if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && (tp->t_rxtshift == 2)) {
+                       if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) &&
+                                (tp->t_rxtshift == 2)) {
                                /* 
                                 * Enter Path MTU Black-hole Detection mechanism:
                                 * - Disable Path MTU Discovery (IP "DF" bit).
                                 * - Reduce MTU to a lower value than what we negotiated with the peer.
                                 */
-
-                               tp->t_flags &= ~TF_PMTUD; /* Disable Path MTU Discovery for now */
-                               tp->t_flags |= TF_BLACKHOLE; /* Record that we may have found a black hole */
+                               /* Disable Path MTU Discovery for now */
+                               tp->t_flags &= ~TF_PMTUD;
+                               /* Record that we may have found a black hole */
+                               tp->t_flags |= TF_BLACKHOLE;
                                optlen = tp->t_maxopd - tp->t_maxseg;
-                               tp->t_pmtud_saved_maxopd = tp->t_maxopd; /* Keep track of previous MSS */
-                               if (tp->t_maxopd > tcp_pmtud_black_hole_mss)
-                                       tp->t_maxopd = tcp_pmtud_black_hole_mss; /* Reduce the MSS to intermediary value */
-                               else {
+                               /* Keep track of previous MSS */
+                               tp->t_pmtud_saved_maxopd = tp->t_maxopd;
+                               /* Reduce the MSS to intermediary value */
+                               if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
+                                       tp->t_maxopd = tcp_pmtud_black_hole_mss;
+                               } else {
                                        tp->t_maxopd =  /* use the default MSS */
 #if INET6
                                                isipv6 ? tcp_v6mssdflt :
@@ -662,7 +770,8 @@ tcp_timers(tp, timer)
                                tp->t_maxseg = tp->t_maxopd - optlen;
 
                                /*
-                                * Reset the slow-start flight size as it may depends on the new MSS
+                                * Reset the slow-start flight size 
+                                * as it may depend on the new MSS
                                 */
                                if (CC_ALGO(tp)->cwnd_init != NULL)
                                        CC_ALGO(tp)->cwnd_init(tp);
@@ -681,8 +790,9 @@ tcp_timers(tp, timer)
                                        tp->t_maxopd = tp->t_pmtud_saved_maxopd;
                                        tp->t_maxseg = tp->t_maxopd - optlen;
                                        /*
-                                       * Reset the slow-start flight size as it may depends on the new MSS
-                                       */
+                                        * Reset the slow-start flight size as it 
+                                        * may depend on the new MSS
+                                        */
                                        if (CC_ALGO(tp)->cwnd_init != NULL)
                                                CC_ALGO(tp)->cwnd_init(tp);
                                }
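
Note: worked numbers for the clamp above, with assumed values (an effective MSS of 1448 bytes plus 12 bytes of TCP options gives t_maxopd = 1460; the 1200-byte floor is the pmtud_blackhole_mss default from earlier in this file):

    #include <stdio.h>

    int
    main(void)
    {
            int t_maxopd = 1460, t_maxseg = 1448;   /* assumed negotiated values */
            int blackhole_mss = 1200;   /* net.inet.tcp.pmtud_blackhole_mss */
            int optlen = t_maxopd - t_maxseg;       /* 12 bytes of options */

            if (t_maxopd > blackhole_mss)
                    t_maxopd = blackhole_mss;       /* clamp, as in the diff */
            t_maxseg = t_maxopd - optlen;           /* payload shrinks to 1188 */
            printf("maxopd %d, maxseg %d\n", t_maxopd, t_maxseg);
            return (0);
    }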
@@ -696,10 +806,15 @@ tcp_timers(tp, timer)
                 * broken terminal servers (most of which have hopefully been
                 * retired) that have bad VJ header compression code which
                 * trashes TCP segments containing unknown-to-them TCP options.
+                * Do this only on non-local connections.
                 */
-               if ((tp->t_state == TCPS_SYN_SENT) &&
-                   (tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres))
+               if (tp->t_state == TCPS_SYN_SENT &&
+                   ((!(tp->t_flags & TF_LOCAL) &&
+                   tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) ||
+                   ((tp->t_flags & TF_LOCAL) && 
+                   tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local)))
                        tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
+
                /*
                 * If losing, let the lower level know and try for
                 * a better route.  Also, if we backed off this far,
@@ -733,12 +848,25 @@ tcp_timers(tp, timer)
                 */
                tp->t_rtttime = 0;
 
-               if (CC_ALGO(tp)->after_timeout != NULL)
+               EXIT_FASTRECOVERY(tp);
+
+               /* RFC 5681 says: when a TCP sender detects segment loss
+                * using the retransmission timer and the given segment has already
+                * been retransmitted by way of the retransmission timer at
+                * least once, the value of ssthresh is held constant
+                */
+               if (tp->t_rxtshift == 1 && 
+                       CC_ALGO(tp)->after_timeout != NULL)
                        CC_ALGO(tp)->after_timeout(tp);
 
-               tp->t_dupacks = 0;
-               EXIT_FASTRECOVERY(tp);
 
+               /* CWR notifications are to be sent on new data right after
+                * RTOs, Fast Retransmits and ECE notification receipts.
+                */
+               if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
+                       tp->ecn_flags |= TE_SENDCWR;
+               }
+fc_output:
                DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
                        struct tcpcb *, tp, struct tcphdr *, NULL,
                        int32_t, TCP_CC_REXMT_TIMEOUT);
@@ -766,11 +894,13 @@ tcp_timers(tp, timer)
                if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
                    (idle_time >= tcp_maxpersistidle ||
                    idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || 
-                   ((tp->t_persist_stop != 0) && (tp->t_persist_stop <= tcp_now))) {
+                   ((tp->t_persist_stop != 0) && 
+                       TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
                        tcpstat.tcps_persistdrop++;
-                       so_tmp = tp->t_inpcb->inp_socket;
+                       postevent(so, 0, EV_TIMEOUT);
+                       soevent(so,
+                           (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
                        tp = tcp_drop(tp, ETIMEDOUT);
-                       postevent(so_tmp, 0, EV_TIMEOUT);
                        break;
                }
                tcp_setpersist(tp);
@@ -785,12 +915,27 @@ tcp_timers(tp, timer)
         */
        case TCPT_KEEP:
                tcpstat.tcps_keeptimeo++;
+#if MPTCP
+               /*
+                * Regular TCP connections do not send keepalives after closing.
+                * MPTCP must not either, after sending Data FINs.
+                */
+               struct mptcb *mp_tp = tp->t_mptcb;
+               if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && 
+                   (mp_tp == NULL)) {
+                       goto dropit;
+               } else if (mp_tp != NULL) {
+                       if ((mptcp_ok_to_keepalive(mp_tp) == 0))
+                               goto dropit;
+               }
+#endif /* MPTCP */
                if (tp->t_state < TCPS_ESTABLISHED)
                        goto dropit;
                if ((always_keepalive ||
-                   tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
+                   (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
+                   (tp->t_flagsext & TF_DETECT_READSTALL)) &&
                    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
-                       if (idle_time >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle)
+                       if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
                                goto dropit;
                        /*
                         * Send a packet designed to force a response
@@ -810,7 +955,7 @@ tcp_timers(tp, timer)
                                unsigned int ifscope, nocell = 0;
 
                                if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
-                                       ifscope = tp->t_inpcb->inp_boundif;
+                                       ifscope = tp->t_inpcb->inp_boundifp->if_index;
                                else
                                        ifscope = IFSCOPE_NONE;
 
@@ -826,10 +971,34 @@ tcp_timers(tp, timer)
                                    tp->rcv_nxt, tp->snd_una - 1, 0, ifscope,
                                    nocell);
                                (void) m_free(dtom(t_template));
+                               if (tp->t_flagsext & TF_DETECT_READSTALL)
+                                       tp->t_rtimo_probes++;
                        }
-                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, tcp_keepintvl);
-               } else
-                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
+                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
+                               TCP_CONN_KEEPINTVL(tp));
+               } else {
+                       tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
+                               TCP_CONN_KEEPIDLE(tp));
+               }
+               if (tp->t_flagsext & TF_DETECT_READSTALL) {
+                       /* 
+                        * The keep alive packets sent to detect a read
+                        * stall did not get a response from the 
+                        * peer. Generate more keep-alives to confirm this.
+                        * If the number of probes sent reaches the limit,
+                        * generate an event.
+                        */
+                       if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
+                               /* Generate an event */
+                               soevent(so,
+                                       (SO_FILT_HINT_LOCKED|
+                                       SO_FILT_HINT_ADAPTIVE_RTIMO));
+                               tcp_keepalive_reset(tp);
+                       } else {
+                               tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
+                                       tp, TCP_REXMTVAL(tp));
+                       }
+               }
                break;
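
Note: from userland, the read-stall machinery above is driven per socket. Assuming the TCP_ADAPTIVE_READ_TIMEOUT option that fills in t_adaptive_rtimo (the option name is an assumption, not shown in this diff), a client would opt in like this:

    #include <stdio.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Sketch: ask for SO_FILT_HINT_ADAPTIVE_RTIMO after ~3 silent probes. */
    static void
    enable_read_stall_detection(int s)
    {
            int probes = 3;    /* becomes tp->t_adaptive_rtimo (assumed) */

            if (setsockopt(s, IPPROTO_TCP, TCP_ADAPTIVE_READ_TIMEOUT,
                &probes, sizeof (probes)) != 0)
                    perror("setsockopt");
    }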
        case TCPT_DELACK:
                if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
@@ -837,17 +1006,51 @@ tcp_timers(tp, timer)
                        tp->t_timer[TCPT_DELACK] = 0;
                        tp->t_flags |= TF_ACKNOW;
 
-                       /* If delayed ack timer fired while we are stretching acks, 
+                       /* If delayed ack timer fired while stretching acks
                         * go back to acking every other packet
                         */
                        if ((tp->t_flags & TF_STRETCHACK) != 0)
                                tcp_reset_stretch_ack(tp);
 
+                       /* If we are measuring inter packet arrival jitter for 
+                        * throttling a connection, this delayed ack might be 
+                        * the reason for accumulating some jitter. So let's
+                        * restart the measurement.
+                        */
+                       CLEAR_IAJ_STATE(tp);
+
                        tcpstat.tcps_delack++;
                        (void) tcp_output(tp);
                }
                break;
 
+#if MPTCP
+       case TCPT_JACK_RXMT:
+               if ((tp->t_state == TCPS_ESTABLISHED) &&
+                   (tp->t_mpflags & TMPF_PREESTABLISHED) &&
+                   (tp->t_mpflags & TMPF_JOINED_FLOW)) {
+                       if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
+                               tcpstat.tcps_timeoutdrop++;
+                               postevent(so, 0, EV_TIMEOUT);
+                               soevent(so, 
+                                   (SO_FILT_HINT_LOCKED|
+                                   SO_FILT_HINT_TIMEOUT));
+                               tp = tcp_drop(tp, tp->t_softerror ?
+                                   tp->t_softerror : ETIMEDOUT);
+                               break;
+                       }
+                       tcpstat.tcps_join_rxmts++;
+                       tp->t_flags |= TF_ACKNOW;
+
+                       /*
+                        * No backoff is implemented for simplicity for this 
+                        * corner case.
+                        */
+                       (void) tcp_output(tp);
+               }
+               break;
+#endif /* MPTCP */
+
 #if TCPDEBUG
        if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
                tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
@@ -855,8 +1058,10 @@ tcp_timers(tp, timer)
 #endif
        dropit:
                tcpstat.tcps_keepdrops++;
+               postevent(so, 0, EV_TIMEOUT);
+               soevent(so,
+                   (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
                tp = tcp_drop(tp, ETIMEDOUT);
-               postevent(so_tmp, 0, EV_TIMEOUT);
                break;
        }
        return (tp);
@@ -887,10 +1092,10 @@ tcp_remove_timer(struct tcpcb *tp)
        tp->t_flags &= ~(TF_TIMER_ONLIST);
 
        listp->entries--;
-       lck_mtx_unlock(listp->mtx);
 
        tp->tentry.le.le_next = NULL;
        tp->tentry.le.le_prev = NULL;
+       lck_mtx_unlock(listp->mtx);
 }
 
 /* Function to check if the timerlist needs to be rescheduled to run
@@ -904,16 +1109,18 @@ need_to_resched_timerlist(uint32_t runtime, uint16_t index) {
        int32_t diff;
        boolean_t is_fast;
 
-       if (runtime == 0 || index == TCPT_NONE)
+       if (index == TCPT_NONE)
                return FALSE;
        is_fast = !(IS_TIMER_SLOW(index));
 
        /* If the list is being processed then the state of the list is in flux.
         * In this case always acquire the lock and set the state correctly.
         */
-       if (listp->running) {
+       if (listp->running)
                return TRUE;
-       }
+
+       if (!listp->scheduled)
+               return (TRUE);
 
        diff = timer_diff(listp->runtime, 0, runtime, 0);
        if (diff <= 0) {
@@ -940,12 +1147,16 @@ tcp_sched_timerlist(uint32_t offset)
 
        lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);
 
+       offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
        listp->runtime = tcp_now + offset;
+       if (listp->runtime == 0)
+               listp->runtime++;
 
        clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ,
                &deadline);
 
        thread_call_enter_delayed(listp->call, deadline);
+       listp->scheduled = TRUE;
 }
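
Note: the runtime++ bump exists because 0 doubled as the old "not scheduled" sentinel, and a wrapping 32-bit tick clock can legitimately produce it. A standalone demonstration:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t now = UINT32_MAX - 99;     /* clock about to wrap */
            uint32_t offset = 100;
            uint32_t runtime = now + offset;    /* wraps to exactly 0 */

            printf("runtime = %u\n", runtime);  /* prints 0 */
            return (0);
    }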
 
 /* Function to run the timers for a connection.
@@ -985,11 +1196,9 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
          * with another thread that can cancel or reschedule the timer that is
          * about to run. Check if we need to run anything.
          */
-       index = tp->tentry.index;
-       timer_val = tp->t_timer[index];
-
-        if (index == TCPT_NONE || tp->tentry.runtime == 0) 
+       if ((index = tp->tentry.index) == TCPT_NONE)
                goto done;
+       timer_val = tp->t_timer[index];
 
        diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
        if (diff > 0) {
@@ -1032,8 +1241,8 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
        tp->tentry.index = lo_index;
        if (lo_index != TCPT_NONE) {
                tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
-       } else {
-               tp->tentry.runtime = 0;
+               if (tp->tentry.runtime == 0)
+                       tp->tentry.runtime++;
        }
 
        if (count > 0) {
@@ -1042,8 +1251,11 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
                        if (needtorun[i]) {
                                tp->t_timer[i] = 0;
                                tp = tcp_timers(tp, i);
-                               if (tp == NULL) 
+                               if (tp == NULL) {
+                                       offset = 0;
+                                       *(next_index) = TCPT_NONE;
                                        goto done;
+                               }
                        }
                }
                tcp_set_lotimer_index(tp);
@@ -1057,6 +1269,7 @@ tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
 done:
        if (tp != NULL && tp->tentry.index == TCPT_NONE) {
                tcp_remove_timer(tp);
+               offset = 0;
        }
         tcp_unlock(so, 1, 0);
         return offset;
@@ -1085,7 +1298,7 @@ tcp_run_timerlist(void * arg1, void * arg2) {
        LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
                uint32_t offset = 0;
                uint32_t runtime = te->runtime;
-               if (TSTMP_GT(runtime, tcp_now)) {
+               if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
                        offset = timer_diff(runtime, 0, tcp_now, 0);
                        if (next_timer == 0 || offset < next_timer) {
                                next_timer = offset;
@@ -1181,8 +1394,11 @@ tcp_run_timerlist(void * arg1, void * arg2) {
 
                tcp_sched_timerlist(next_timer);
        } else {
-               /* No need to reschedule this timer */
-               listp->runtime = 0;
+               /*
+                * No need to reschedule this timer, but always run
+                * periodically at a much higher granularity.
+                */
+               tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
        }
 
        listp->running = FALSE;
@@ -1199,7 +1415,7 @@ tcp_sched_timers(struct tcpcb *tp)
        struct tcptimerentry *te = &tp->tentry;
        uint16_t index = te->index;
        struct tcptimerlist *listp = &tcp_timer_list;
-       uint32_t offset = 0;
+       int32_t offset = 0;
        boolean_t is_fast;
        int list_locked = 0;
 
@@ -1217,8 +1433,8 @@ tcp_sched_timers(struct tcpcb *tp)
        }
 
        is_fast = !(IS_TIMER_SLOW(index));
-       offset = te->runtime - tcp_now;
-       if (offset == 0) {
+       offset = timer_diff(te->runtime, 0, tcp_now, 0);
+       if (offset <= 0) {
                offset = 1;
                tcp_timer_advanced++;
        }
@@ -1239,7 +1455,7 @@ tcp_sched_timers(struct tcpcb *tp)
                        listp->maxentries = listp->entries;
 
                /* if the list is not scheduled, just schedule it */
-               if (listp->runtime == 0)
+               if (!listp->scheduled)
                        goto schedule;
 
        }
@@ -1261,15 +1477,22 @@ tcp_sched_timers(struct tcpcb *tp)
                        if (is_fast) {
                                listp->pref_mode = TCP_TIMERLIST_FASTMODE;
                        } else if (listp->pref_offset == 0 ||
-                               ((int)offset) < listp->pref_offset) {
+                               offset < listp->pref_offset) {
                                listp->pref_offset = offset;
                        }
                } else {
-                       int32_t diff;
-                       diff = timer_diff(listp->runtime, 0, tcp_now, offset);
-                       if (diff <= 0) {
-                               /* The list is going to run before this timer */
-                               goto done;
+                       /*
+                        * The list could have been scheduled while this
+                        * thread was waiting for the lock
+                        */
+                       if (listp->scheduled) {
+                               int32_t diff;
+                               diff = timer_diff(listp->runtime, 0,
+                                   tcp_now, offset);
+                               if (diff <= 0)
+                                       goto done;
+                               else
+                                       goto schedule;
                        } else {
                                goto schedule;
                        }
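
Note: the pref_mode/pref_offset hints recorded in this path are picked up by whichever thread owns the list. A sketch of the consumer side, assuming tcp_run_timerlist() folds the hints in at the end of a pass (the mode field and the next_timer accumulator are assumptions):

    if (listp->pref_mode == TCP_TIMERLIST_FASTMODE)
            listp->mode = TCP_TIMERLIST_FASTMODE;    /* run fast next */
    else if (listp->pref_offset != 0 &&
        (next_timer == 0 || (uint32_t)listp->pref_offset < next_timer))
            next_timer = listp->pref_offset;         /* run sooner */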
@@ -1305,8 +1528,8 @@ tcp_set_lotimer_index(struct tcpcb *tp) {
        tp->tentry.index = lo_index;
        if (lo_index != TCPT_NONE) {
                tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
-       } else {
-               tp->tentry.runtime = 0;
+               if (tp->tentry.runtime == 0)
+                       tp->tentry.runtime++;
        }
 }
 
@@ -1315,6 +1538,9 @@ tcp_check_timer_state(struct tcpcb *tp) {
 
        lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
 
+       if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
+               return;
+
        tcp_set_lotimer_index(tp);
 
        tcp_sched_timers(tp);