1 /*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/kernel.h>
68 #include <sys/mbuf.h>
69 #include <sys/sysctl.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/protosw.h>
73 #include <sys/domain.h>
74 #include <sys/mcache.h>
75 #include <sys/queue.h>
76 #include <kern/locks.h>
77 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
78 #include <mach/boolean.h>
79
80 #include <net/route.h>
81 #include <net/if_var.h>
82
83 #include <netinet/in.h>
84 #include <netinet/in_systm.h>
85 #include <netinet/in_pcb.h>
86 #if INET6
87 #include <netinet6/in6_pcb.h>
88 #endif
89 #include <netinet/ip_var.h>
90 #include <netinet/tcp.h>
91 #include <netinet/tcp_fsm.h>
92 #include <netinet/tcp_seq.h>
93 #include <netinet/tcp_timer.h>
94 #include <netinet/tcp_var.h>
95 #include <netinet/tcp_cc.h>
96 #if INET6
97 #include <netinet6/tcp6_var.h>
98 #endif
99 #include <netinet/tcpip.h>
100 #if TCPDEBUG
101 #include <netinet/tcp_debug.h>
102 #endif
103 #include <sys/kdebug.h>
104 #include <mach/sdt.h>
105 #include <netinet/mptcp_var.h>
106
107 extern void postevent(struct socket *, struct sockbuf *,
108 int);
109 #define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8))
110 #define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1)
111
112 #define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))
113
114 #define VERIFY_NEXT_LINK(elm,field) do { \
115 if (LIST_NEXT((elm),field) != NULL && \
116 LIST_NEXT((elm),field)->field.le_prev != \
117 &((elm)->field.le_next)) \
118 panic("Bad link elm %p next->prev != elm", (elm)); \
119 } while(0)
120
121 #define VERIFY_PREV_LINK(elm,field) do { \
122 if (*(elm)->field.le_prev != (elm)) \
123 panic("Bad link elm %p prev->next != elm", (elm)); \
124 } while(0)
125
126 /* tcp timer list */
127 struct tcptimerlist tcp_timer_list;
128
129 /* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
130 struct tcptailq tcp_tw_tailq;
131
132 static int background_io_trigger = 5;
133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED,
134 &background_io_trigger, 0, "Background IO Trigger Setting");
135
136 static int
137 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
138 {
139 #pragma unused(arg1, arg2)
140 int error, s, tt;
141
142 tt = *(int *)oidp->oid_arg1;
143 s = tt * 1000 / TCP_RETRANSHZ;
144
145 error = sysctl_handle_int(oidp, &s, 0, req);
146 if (error || !req->newptr)
147 return (error);
148
149 tt = s * TCP_RETRANSHZ / 1000;
150 if (tt < 1)
151 return (EINVAL);
152
153 *(int *)oidp->oid_arg1 = tt;
154 return (0);
155 }
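/*
 * Worked example for the handler above (illustrative only): the kernel keeps
 * these values in TCP_RETRANSHZ ticks but exports them in milliseconds.
 * Assuming, hypothetically, TCP_RETRANSHZ were 100 ticks per second, a sysctl
 * write of 75000 (ms) would store 75000 * 100 / 1000 = 7500 ticks, and a
 * later read would convert back to 7500 * 1000 / 100 = 75000 ms.  A write
 * that rounds down to 0 ticks is rejected with EINVAL.
 */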
156
157 int tcp_keepinit;
158 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
159 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
160
161 int tcp_keepidle;
162 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
163 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
164
165 int tcp_keepintvl;
166 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
167 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
168
169 int tcp_keepcnt;
170 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
171 &tcp_keepcnt, 0, "number of times to repeat keepalive");
172
173 int tcp_msl;
174 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
175 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
176
177 /*
178 * Avoid DoS via TCP Robustness in Persist Condition (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
179 * by allowing a system wide maximum persistence timeout value when in Zero Window Probe mode.
180 * Expressed in milliseconds to be consistent with other timeout-related values; the TCP socket option is in seconds.
181 */
182 u_int32_t tcp_max_persist_timeout = 0;
183 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
184 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP");
185
186 static int always_keepalive = 0;
187 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
188 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
189
190 /* This parameter determines how long the timer list will stay in fast mode even
191 * when all connections are idle. In fast mode, the timer will fire more frequently,
192 * anticipating new data.
193 */
194 int timer_fastmode_idlemax = TCP_FASTMODE_IDLEGEN_MAX;
195 SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, CTLFLAG_RW | CTLFLAG_LOCKED,
196 &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");
197
198 /*
199 * See tcp_syn_backoff[] for interval values between SYN retransmits;
200 * the value set below defines the number of retransmits before we
201 * disable the timestamp and window scaling options during subsequent
202 * SYN retransmits. Setting it to 0 disables dropping those
203 * two options.
204 */
205 static int tcp_broken_peer_syn_rxmit_thres = 7;
206 SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CTLFLAG_LOCKED,
207 &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before "
208 "TCP disables rfc1323 and rfc1644 during the rest of attempts");
209
210 /* A higher threshold on local connections for disabling RFC 1323 options */
211 static int tcp_broken_peer_syn_rxmit_thres_local = 10;
212 SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local,
213 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0,
214 "Number of retransmitted SYNs before disabling RFC 1323 options on local connections");
215
216 static int tcp_timer_advanced = 0;
217 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED,
218 &tcp_timer_advanced, 0, "Number of times one of the timers was advanced");
219
220 static int tcp_resched_timerlist = 0;
221 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, CTLFLAG_RD | CTLFLAG_LOCKED,
222 &tcp_resched_timerlist, 0,
223 "Number of times timer list was rescheduled as part of processing a packet");
224
225 int tcp_pmtud_black_hole_detect = 1 ;
226 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW | CTLFLAG_LOCKED,
227 &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection");
228
229 int tcp_pmtud_black_hole_mss = 1200 ;
230 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED,
231 &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS");
232
233 /* Set once a garbage collection pass has run; used to skip a pass when the lock is contended */
234 static boolean_t tcp_gc_done = FALSE;
235
236 /* max idle probes */
237 int tcp_maxpersistidle;
238
239 /* TCP delack timer is set to 100 ms. Since the processing of the timer list in fast
240 * mode will happen no more often than every 100 ms, the delayed ack timer will fire somewhere
241 * between 100 and 200 ms.
242 */
243 int tcp_delack = TCP_RETRANSHZ / 10;
244
245 #if MPTCP
246 /*
247 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
248 */
249 int tcp_jack_rxmt = TCP_RETRANSHZ / 2;
250 #endif /* MPTCP */
251
252 /* The frequency of running through the TCP timer list in
253 * fast and slow mode can be configured.
254 */
255 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_fastquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
256 &tcp_timer_list.fast_quantum, TCP_FASTTIMER_QUANTUM,
257 "Frequency of running timer list in fast mode");
258
259 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_slowquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
260 &tcp_timer_list.slow_quantum, TCP_SLOWTIMER_QUANTUM,
261 "Frequency of running timer list in slow mode");
262
263 static void tcp_remove_timer(struct tcpcb *tp);
264 static void tcp_sched_timerlist(uint32_t offset);
265 static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index);
266 static void tcp_sched_timers(struct tcpcb *tp);
267 static inline void tcp_set_lotimer_index(struct tcpcb *);
268 static void tcp_rexmt_save_state(struct tcpcb *tp);
269 void tcp_remove_from_time_wait(struct inpcb *inp);
270
271 /* Function to compare two timers. If a timer value has wrapped around, its sign bit
272 * will have been reset. By doing a signed comparison on the difference,
273 * we handle wraparound such that the value with the sign bit reset is
274 * actually ahead of the other.
275 */
276
277 static inline int32_t
278 timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
279 return (int32_t)((t1 + toff1) - (t2 + toff2));
280 };
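/*
 * Example (illustrative only): timer_diff() stays correct across tcp_now
 * wraparound.  With t1 = 0x00000005 and t2 = 0xfffffff0 the unsigned
 * subtraction yields 0x15 and the signed cast gives +21, so t1 is treated
 * as 21 ticks ahead of t2 even though it is numerically smaller.
 */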
281
282 /* Returns true if the timer is on the timer list */
283 #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
284
285
286 static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
287 void add_to_time_wait(struct tcpcb *tp, uint32_t delay) ;
288
289 static boolean_t tcp_garbage_collect(struct inpcb *, int);
290
291 /*
292 * Add to tcp timewait list, delay is given in milliseconds.
293 */
294 static void
295 add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
296 {
297 struct inpcbinfo *pcbinfo = &tcbinfo;
298 struct inpcb *inp = tp->t_inpcb;
299 uint32_t timer;
300
301 /* pcb list should be locked when we get here */
302 lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
303
304 /* We may get here multiple times, so check */
305 if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
306 pcbinfo->ipi_twcount++;
307 inp->inp_flags2 |= INP2_TIMEWAIT;
308
309 /* Remove from global inp list */
310 LIST_REMOVE(inp, inp_list);
311 } else {
312 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
313 }
314
315 /* Compute the time at which this socket can be closed */
316 timer = tcp_now + delay;
317
318 /* We will use the TCPT_2MSL timer for tracking this delay */
319
320 if (TIMER_IS_ON_LIST(tp))
321 tcp_remove_timer(tp);
322 tp->t_timer[TCPT_2MSL] = timer;
323
324 TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
325 }
326
327 void
328 add_to_time_wait(struct tcpcb *tp, uint32_t delay)
329 {
330 struct inpcbinfo *pcbinfo = &tcbinfo;
331
332 if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
333 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
334 lck_rw_lock_exclusive(pcbinfo->ipi_lock);
335 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
336 }
337 add_to_time_wait_locked(tp, delay);
338 lck_rw_done(pcbinfo->ipi_lock);
339
340 inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
341 }
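/*
 * Locking note for add_to_time_wait(): the pcbinfo list lock (ipi_lock) is
 * not safe to block on while the per-socket lock is held.  Callers already
 * hold the socket lock, so the routine first tries the list lock without
 * blocking; only if that fails does it drop the socket lock, block on the
 * list lock and then re-take the socket lock.  Because the socket lock is
 * released for a moment, the pcb's state may change before
 * add_to_time_wait_locked() runs, which is one reason that function
 * re-checks INP2_TIMEWAIT.
 */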
342
343 /* If this is on time wait queue, remove it. */
344 void
345 tcp_remove_from_time_wait(struct inpcb *inp)
346 {
347 struct tcpcb *tp = intotcpcb(inp);
348 if (inp->inp_flags2 & INP2_TIMEWAIT)
349 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
350 }
351
352 static boolean_t
353 tcp_garbage_collect(struct inpcb *inp, int istimewait)
354 {
355 boolean_t active = FALSE;
356 struct socket *so;
357 struct tcpcb *tp;
358
359 so = inp->inp_socket;
360 tp = intotcpcb(inp);
361
362 /*
363 * Skip if still in use or busy; it would have been more efficient
364 * if we were to test so_usecount against 0, but this isn't possible
365 * due to the current implementation of tcp_dropdropablreq() where
366 * overflow sockets that are eligible for garbage collection have
367 * their usecounts set to 1.
368 */
369 if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
370 return (TRUE);
371
372 /* Check again under the lock */
373 if (so->so_usecount > 1) {
374 if (inp->inp_wantcnt == WNT_STOPUSING)
375 active = TRUE;
376 lck_mtx_unlock(&inp->inpcb_mtx);
377 return (active);
378 }
379
380 if (istimewait &&
381 TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
382 tp->t_state != TCPS_CLOSED) {
383 /* Become a regular mutex */
384 lck_mtx_convert_spin(&inp->inpcb_mtx);
385 tcp_close(tp);
386 }
387
388 /*
389 * Overflowed socket dropped from the listening queue? Do this
390 * only if we are called to clean up the time wait slots, since
391 * tcp_dropdropablreq() considers a socket to have been fully
392 * dropped after add_to_time_wait() is finished.
393 * Also handle the case of connections getting closed by the peer
394 * while in the queue as seen with rdar://6422317
395 *
396 */
397 if (so->so_usecount == 1 &&
398 ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
399 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
400 (so->so_head != NULL) &&
401 ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
402 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
403
404 if (inp->inp_state != INPCB_STATE_DEAD) {
405 /* Become a regular mutex */
406 lck_mtx_convert_spin(&inp->inpcb_mtx);
407 #if INET6
408 if (SOCK_CHECK_DOM(so, PF_INET6))
409 in6_pcbdetach(inp);
410 else
411 #endif /* INET6 */
412 in_pcbdetach(inp);
413 }
414 so->so_usecount--;
415 if (inp->inp_wantcnt == WNT_STOPUSING)
416 active = TRUE;
417 lck_mtx_unlock(&inp->inpcb_mtx);
418 return (active);
419 } else if (inp->inp_wantcnt != WNT_STOPUSING) {
420 lck_mtx_unlock(&inp->inpcb_mtx);
421 return (FALSE);
422 }
423
424 /*
425 * We get here because the PCB is no longer searchable
426 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
427 * (usecount is 0). This covers all cases, including overflow
428 * sockets and those that are considered as "embryonic",
429 * i.e. created by sonewconn() in TCP input path, and have
430 * not yet been committed. For the former, we reduce the usecount
431 * to 0 as done by the code above. For the latter, the usecount
432 * would have been reduced to 0 as part of calling soabort() when the
433 * socket is dropped at the end of tcp_input().
434 */
435 if (so->so_usecount == 0) {
436 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
437 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
438 /* Become a regular mutex */
439 lck_mtx_convert_spin(&inp->inpcb_mtx);
440
441 /*
442 * If this tp still happens to be on the timer list,
443 * take it out
444 */
445 if (TIMER_IS_ON_LIST(tp)) {
446 tcp_remove_timer(tp);
447 }
448
449 if (inp->inp_state != INPCB_STATE_DEAD) {
450 #if INET6
451 if (SOCK_CHECK_DOM(so, PF_INET6))
452 in6_pcbdetach(inp);
453 else
454 #endif /* INET6 */
455 in_pcbdetach(inp);
456 }
457 in_pcbdispose(inp);
458 return (FALSE);
459 }
460
461 lck_mtx_unlock(&inp->inpcb_mtx);
462 return (TRUE);
463 }
464
465 /*
466 * TCP garbage collector callback (inpcb_timer_func_t).
467 *
468 * Returns the number of pcbs that will need to be gc-ed soon,
469 * returnining > 0 will keep timer active.
470 */
471 void
472 tcp_gc(struct inpcbinfo *ipi)
473 {
474 struct inpcb *inp, *nxt;
475 struct tcpcb *tw_tp, *tw_ntp;
476 #if TCPDEBUG
477 int ostate;
478 #endif
479 #if KDEBUG
480 static int tws_checked = 0;
481 #endif
482
483 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
484
485 /*
486 * Update tcp_now here as it may get used while
487 * processing the slow timer.
488 */
489 calculate_tcp_clock();
490
491 /*
492 * Garbage collect socket/tcpcb: We need to acquire the list lock
493 * exclusively to do this
494 */
495
496 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
497 /* don't sweat it this time; cleanup was done last time */
498 if (tcp_gc_done == TRUE) {
499 tcp_gc_done = FALSE;
500 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
501 tws_checked, cur_tw_slot, 0, 0, 0);
502 /* Lock upgrade failed, give up this round */
503 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
504 return;
505 }
506 /* Couldn't get the lock immediately; block until we acquire it exclusively */
507 lck_rw_lock_exclusive(ipi->ipi_lock);
508 }
509 tcp_gc_done = TRUE;
510
511 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
512 if (tcp_garbage_collect(inp, 0))
513 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
514 }
515
516 /* Now cleanup the time wait ones */
517 TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
518 /*
519 * We check the timestamp here without holding the
520 * socket lock for better performance. If there are
521 * any pcbs in time-wait, the timer will get rescheduled.
522 * Hence some error in this check can be tolerated.
523 */
524 if (TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
525 if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
526 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
527 } else {
528 break;
529 }
530 }
531
532 /* take into account pcbs that are still in time_wait_slots */
533 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);
534
535 lck_rw_done(ipi->ipi_lock);
536
537 /* Clean up the socache while we are here */
538 if (so_cache_timer())
539 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
540
541 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
542 cur_tw_slot, 0, 0, 0);
543
544 return;
545 }
546
547 /*
548 * Cancel all timers for TCP tp.
549 */
550 void
551 tcp_canceltimers(tp)
552 struct tcpcb *tp;
553 {
554 register int i;
555
556 tcp_remove_timer(tp);
557 for (i = 0; i < TCPT_NTIMERS; i++)
558 tp->t_timer[i] = 0;
559 tp->tentry.timer_start = tcp_now;
560 tp->tentry.index = TCPT_NONE;
561 }
562
563 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
564 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
565
566 int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
567 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
568
569 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
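/*
 * A minimal sketch (kept out of the build) of where the 511 above comes
 * from: summing tcp_backoff[] gives 1+2+4+8+16+32 + 7*64 = 63 + 448 = 511.
 */
#if 0
	int i, sum = 0;

	for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
		sum += tcp_backoff[i];		/* sum ends up as 511 */
#endif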
570
571 static void tcp_rexmt_save_state(struct tcpcb *tp)
572 {
573 u_int32_t fsize;
574 if (TSTMP_SUPPORTED(tp)) {
575 /*
576 * Since timestamps are supported on the connection,
577 * we can do recovery as described in rfc 4015.
578 */
579 fsize = tp->snd_max - tp->snd_una;
580 tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
581 tp->snd_recover_prev = tp->snd_recover;
582 } else {
583 /*
584 * Timestamp option is not supported on this connection.
585 * Record ssthresh and cwnd so they can
586 * be recovered if this turns out to be a "bad" retransmit.
587 * A retransmit is considered "bad" if an ACK for this
588 * segment is received within RTT/2 interval; the assumption
589 * here is that the ACK was already in flight. See
590 * "On Estimating End-to-End Network Path Properties" by
591 * Allman and Paxson for more details.
592 */
593 tp->snd_cwnd_prev = tp->snd_cwnd;
594 tp->snd_ssthresh_prev = tp->snd_ssthresh;
595 tp->snd_recover_prev = tp->snd_recover;
596 if (IN_FASTRECOVERY(tp))
597 tp->t_flags |= TF_WASFRECOVERY;
598 else
599 tp->t_flags &= ~TF_WASFRECOVERY;
600 }
601 tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
602 tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
603 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
604 }
605
606 /*
607 * TCP timer processing.
608 */
609 struct tcpcb *
610 tcp_timers(tp, timer)
611 register struct tcpcb *tp;
612 int timer;
613 {
614 register int rexmt;
615 struct socket *so;
616 struct tcptemp *t_template;
617 int optlen = 0;
618 int idle_time = 0;
619
620 #if TCPDEBUG
621 int ostate;
622 #endif
623
624 #if INET6
625 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
626 #endif /* INET6 */
627
628 so = tp->t_inpcb->inp_socket;
629 idle_time = tcp_now - tp->t_rcvtime;
630
631 switch (timer) {
632
633 /*
634 * 2 MSL timeout in shutdown went off. If we're closed but
635 * still waiting for peer to close and connection has been idle
636 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
637 * delete connection control block.
638 * Otherwise (this case shouldn't happen) check again in a bit;
639 * we keep the socket in the main list in that case.
640 */
641 case TCPT_2MSL:
642 tcp_free_sackholes(tp);
643 if (tp->t_state != TCPS_TIME_WAIT &&
644 tp->t_state != TCPS_FIN_WAIT_2 &&
645 ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
646 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
647 (u_int32_t)TCP_CONN_KEEPINTVL(tp));
648 } else {
649 tp = tcp_close(tp);
650 return(tp);
651 }
652 break;
653
654 /*
655 * Retransmission timer went off. Message has not
656 * been acked within retransmit interval. Back off
657 * to a longer retransmit interval and retransmit one segment.
658 */
659 case TCPT_REXMT:
660 /* Drop a connection in the retransmit timer
661 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times
662 * 2. If the time spent in this retransmission episode is more than
663 * the time limit set with TCP_RXT_CONNDROPTIME socket option
664 * 3. If TCP_RXT_FINDROP socket option was set and we have already
665 * retransmitted the FIN 3 times without receiving an ack
666 */
667 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
668 (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
669 (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
670 ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
671 (tp->t_flags & TF_SENTFIN) != 0 &&
672 tp->t_rxtshift >= 4)) {
673
674 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
675 tcpstat.tcps_rxtfindrop++;
676 } else {
677 tcpstat.tcps_timeoutdrop++;
678 }
679 tp->t_rxtshift = TCP_MAXRXTSHIFT;
680 postevent(so, 0, EV_TIMEOUT);
681 soevent(so,
682 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
683 tp = tcp_drop(tp, tp->t_softerror ?
684 tp->t_softerror : ETIMEDOUT);
685
686 break;
687 }
688
689 tcpstat.tcps_rexmttimeo++;
690
691 if (tp->t_rxtshift == 1 &&
692 tp->t_state == TCPS_ESTABLISHED) {
693 /* Set the time at which retransmission started. */
694 tp->t_rxtstart = tcp_now;
695
696 /*
697 * if this is the first retransmit timeout, save
698 * the state so that we can recover if the timeout
699 * is spurious.
700 */
701 tcp_rexmt_save_state(tp);
702 }
703 #if MPTCP
704 if ((tp->t_rxtshift == mptcp_fail_thresh) &&
705 (tp->t_state == TCPS_ESTABLISHED) &&
706 (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
707 mptcp_act_on_txfail(so);
708
709 }
710 #endif /* MPTCP */
711
712 if (tp->t_adaptive_wtimo > 0 &&
713 tp->t_rxtshift > tp->t_adaptive_wtimo &&
714 TCPS_HAVEESTABLISHED(tp->t_state)) {
715 /* Send an event to the application */
716 soevent(so,
717 (SO_FILT_HINT_LOCKED|
718 SO_FILT_HINT_ADAPTIVE_WTIMO));
719 }
720
721 if (tp->t_state == TCPS_SYN_SENT) {
722 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
723 tp->t_stat.synrxtshift = tp->t_rxtshift;
724 }
725 else
726 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
727 TCPT_RANGESET(tp->t_rxtcur, rexmt,
728 tp->t_rttmin, TCPTV_REXMTMAX,
729 TCP_ADD_REXMTSLOP(tp));
730 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
731
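/*
 * Example (illustrative only): with a base TCP_REXMTVAL of, say, 500 ms,
 * the third timeout (t_rxtshift == 3) gives rexmt = 500 * tcp_backoff[3]
 * = 4000 ms, which TCPT_RANGESET above bounds between t_rttmin and
 * TCPTV_REXMTMAX before the retransmit timer is rearmed.
 */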
732 if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
733 goto fc_output;
734
735 tcp_free_sackholes(tp);
736 /*
737 * Check for potential Path MTU Discovery Black Hole
738 */
739
740 if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
741 if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) &&
742 (tp->t_rxtshift == 2)) {
743 /*
744 * Enter Path MTU Black-hole Detection mechanism:
745 * - Disable Path MTU Discovery (IP "DF" bit).
746 * - Reduce the MTU to a lower value than what we negotiated with the peer.
747 */
748 /* Disable Path MTU Discovery for now */
749 tp->t_flags &= ~TF_PMTUD;
750 /* Record that we may have found a black hole */
751 tp->t_flags |= TF_BLACKHOLE;
752 optlen = tp->t_maxopd - tp->t_maxseg;
753 /* Keep track of previous MSS */
754 tp->t_pmtud_saved_maxopd = tp->t_maxopd;
755 /* Reduce the MSS to an intermediate value */
756 if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
757 tp->t_maxopd = tcp_pmtud_black_hole_mss;
758 } else {
759 tp->t_maxopd = /* use the default MSS */
760 #if INET6
761 isipv6 ? tcp_v6mssdflt :
762 #endif /* INET6 */
763 tcp_mssdflt;
764 }
765 tp->t_maxseg = tp->t_maxopd - optlen;
766
767 /*
768 * Reset the slow-start flight size
769 * as it may depend on the new MSS
770 */
771 if (CC_ALGO(tp)->cwnd_init != NULL)
772 CC_ALGO(tp)->cwnd_init(tp);
773 }
774 /*
775 * If further retransmissions are still unsuccessful even with a lowered MTU,
776 * this may not be a black hole, so we restore the previous MSS and
777 * blackhole detection flags.
778 */
779 else {
780
781 if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) {
782 tp->t_flags |= TF_PMTUD;
783 tp->t_flags &= ~TF_BLACKHOLE;
784 optlen = tp->t_maxopd - tp->t_maxseg;
785 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
786 tp->t_maxseg = tp->t_maxopd - optlen;
787 /*
788 * Reset the slow-start flight size as it
789 * may depend on the new MSS
790 */
791 if (CC_ALGO(tp)->cwnd_init != NULL)
792 CC_ALGO(tp)->cwnd_init(tp);
793 }
794 }
795 }
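/*
 * Example of the MSS bookkeeping above (hypothetical numbers): if t_maxopd
 * were 1460 and t_maxseg 1448, i.e. 12 bytes of options in use, entering
 * black-hole detection sets t_maxopd to tcp_pmtud_black_hole_mss (1200 by
 * default), so t_maxseg becomes 1200 - 12 = 1188.  If probes still fail
 * after further retransmits, the saved t_maxopd of 1460 is restored and
 * t_maxseg goes back to 1448.
 */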
796
797
798 /*
799 * Disable rfc1323 and rfc1644 if we haven't got any response to
800 * our SYN (after we reach the threshold) to work around some
801 * broken terminal servers (most of which have hopefully been
802 * retired) that have bad VJ header compression code which
803 * trashes TCP segments containing unknown-to-them TCP options.
804 * A higher threshold is used before doing this on local connections.
805 */
806 if (tp->t_state == TCPS_SYN_SENT &&
807 ((!(tp->t_flags & TF_LOCAL) &&
808 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) ||
809 ((tp->t_flags & TF_LOCAL) &&
810 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local)))
811 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
812
813 /*
814 * If losing, let the lower level know and try for
815 * a better route. Also, if we backed off this far,
816 * our srtt estimate is probably bogus. Clobber it
817 * so we'll take the next rtt measurement as our srtt;
818 * move the current srtt into rttvar to keep the current
819 * retransmit times until then.
820 */
821 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
822 #if INET6
823 if (isipv6)
824 in6_losing(tp->t_inpcb);
825 else
826 #endif /* INET6 */
827 in_losing(tp->t_inpcb);
828 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
829 tp->t_srtt = 0;
830 }
831 tp->snd_nxt = tp->snd_una;
832 /*
833 * Note: We overload snd_recover to function also as the
834 * snd_last variable described in RFC 2582
835 */
836 tp->snd_recover = tp->snd_max;
837 /*
838 * Force a segment to be sent.
839 */
840 tp->t_flags |= TF_ACKNOW;
841 /*
842 * If timing a segment in this window, stop the timer.
843 */
844 tp->t_rtttime = 0;
845
846 EXIT_FASTRECOVERY(tp);
847
848 /* RFC 5681 says: when a TCP sender detects segment loss
849 * using the retransmission timer and the given segment has already
850 * been retransmitted by way of the retransmission timer at
851 * least once, the value of ssthresh is held constant.
852 */
853 if (tp->t_rxtshift == 1 &&
854 CC_ALGO(tp)->after_timeout != NULL)
855 CC_ALGO(tp)->after_timeout(tp);
856
857
858 /* CWR notifications are to be sent on new data right after
859 * RTOs, Fast Retransmits and ECE notification receipts.
860 */
861 if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
862 tp->ecn_flags |= TE_SENDCWR;
863 }
864 fc_output:
865 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
866 struct tcpcb *, tp, struct tcphdr *, NULL,
867 int32_t, TCP_CC_REXMT_TIMEOUT);
868
869 (void) tcp_output(tp);
870 break;
871
872 /*
873 * Persistence timer into zero window.
874 * Force a byte to be output, if possible.
875 */
876 case TCPT_PERSIST:
877 tcpstat.tcps_persisttimeo++;
878 /*
879 * Hack: if the peer is dead/unreachable, we do not
880 * time out if the window is closed. After a full
881 * backoff, drop the connection if the idle time
882 * (no responses to probes) reaches the maximum
883 * backoff that we would use if retransmitting.
884 *
885 * Drop the connection if we reached the maximum allowed time for
886 * Zero Window Probes without a non-zero update from the peer.
887 * See rdar://5805356
888 */
889 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
890 (idle_time >= tcp_maxpersistidle ||
891 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
892 ((tp->t_persist_stop != 0) &&
893 TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
894 tcpstat.tcps_persistdrop++;
895 postevent(so, 0, EV_TIMEOUT);
896 soevent(so,
897 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
898 tp = tcp_drop(tp, ETIMEDOUT);
899 break;
900 }
901 tcp_setpersist(tp);
902 tp->t_force = 1;
903 (void) tcp_output(tp);
904 tp->t_force = 0;
905 break;
906
907 /*
908 * Keep-alive timer went off; send something
909 * or drop connection if idle for too long.
910 */
911 case TCPT_KEEP:
912 tcpstat.tcps_keeptimeo++;
913 #if MPTCP
914 /*
915 * Regular TCP connections do not send keepalives after closing;
916 * MPTCP must not either, after sending Data FINs.
917 */
918 struct mptcb *mp_tp = tp->t_mptcb;
919 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
920 (mp_tp == NULL)) {
921 goto dropit;
922 } else if (mp_tp != NULL) {
923 if ((mptcp_ok_to_keepalive(mp_tp) == 0))
924 goto dropit;
925 }
926 #endif /* MPTCP */
927 if (tp->t_state < TCPS_ESTABLISHED)
928 goto dropit;
929 if ((always_keepalive ||
930 (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
931 (tp->t_flagsext & TF_DETECT_READSTALL)) &&
932 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
933 if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
934 goto dropit;
935 /*
936 * Send a packet designed to force a response
937 * if the peer is up and reachable:
938 * either an ACK if the connection is still alive,
939 * or an RST if the peer has closed the connection
940 * due to timeout or reboot.
941 * Using sequence number tp->snd_una-1
942 * causes the transmitted zero-length segment
943 * to lie outside the receive window;
944 * by the protocol spec, this requires the
945 * correspondent TCP to respond.
946 */
947 tcpstat.tcps_keepprobe++;
948 t_template = tcp_maketemplate(tp);
949 if (t_template) {
950 unsigned int ifscope, nocell = 0;
951
952 if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
953 ifscope = tp->t_inpcb->inp_boundifp->if_index;
954 else
955 ifscope = IFSCOPE_NONE;
956
957 /*
958 * If the socket isn't allowed to use the
959 * cellular interface, indicate it as such.
960 */
961 if (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR)
962 nocell = 1;
963
964 tcp_respond(tp, t_template->tt_ipgen,
965 &t_template->tt_t, (struct mbuf *)NULL,
966 tp->rcv_nxt, tp->snd_una - 1, 0, ifscope,
967 nocell);
968 (void) m_free(dtom(t_template));
969 if (tp->t_flagsext & TF_DETECT_READSTALL)
970 tp->t_rtimo_probes++;
971 }
972 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
973 TCP_CONN_KEEPINTVL(tp));
974 } else {
975 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
976 TCP_CONN_KEEPIDLE(tp));
977 }
978 if (tp->t_flagsext & TF_DETECT_READSTALL) {
979 /*
980 * The keep alive packets sent to detect a read
981 * stall did not get a response from the
982 * peer. Generate more keep-alives to confirm this.
983 * If the number of probes sent reaches the limit,
984 * generate an event.
985 */
986 if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
987 /* Generate an event */
988 soevent(so,
989 (SO_FILT_HINT_LOCKED|
990 SO_FILT_HINT_ADAPTIVE_RTIMO));
991 tcp_keepalive_reset(tp);
992 } else {
993 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
994 tp, TCP_REXMTVAL(tp));
995 }
996 }
997 break;
998 case TCPT_DELACK:
999 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
1000 tp->t_flags &= ~TF_DELACK;
1001 tp->t_timer[TCPT_DELACK] = 0;
1002 tp->t_flags |= TF_ACKNOW;
1003
1004 /* If the delayed ack timer fired while stretching acks,
1005 * go back to acking every other packet.
1006 */
1007 if ((tp->t_flags & TF_STRETCHACK) != 0)
1008 tcp_reset_stretch_ack(tp);
1009
1010 /* If we are measuring inter packet arrival jitter for
1011 * throttling a connection, this delayed ack might be
1012 * the reason for accumulating some jitter. So let's
1013 * restart the measurement.
1014 */
1015 CLEAR_IAJ_STATE(tp);
1016
1017 tcpstat.tcps_delack++;
1018 (void) tcp_output(tp);
1019 }
1020 break;
1021
1022 #if MPTCP
1023 case TCPT_JACK_RXMT:
1024 if ((tp->t_state == TCPS_ESTABLISHED) &&
1025 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
1026 (tp->t_mpflags & TMPF_JOINED_FLOW)) {
1027 if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
1028 tcpstat.tcps_timeoutdrop++;
1029 postevent(so, 0, EV_TIMEOUT);
1030 soevent(so,
1031 (SO_FILT_HINT_LOCKED|
1032 SO_FILT_HINT_TIMEOUT));
1033 tp = tcp_drop(tp, tp->t_softerror ?
1034 tp->t_softerror : ETIMEDOUT);
1035 break;
1036 }
1037 tcpstat.tcps_join_rxmts++;
1038 tp->t_flags |= TF_ACKNOW;
1039
1040 /*
1041 * No backoff is implemented for simplicity for this
1042 * corner case.
1043 */
1044 (void) tcp_output(tp);
1045 }
1046 break;
1047 #endif /* MPTCP */
1048
1049 #if TCPDEBUG
1050 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
1051 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
1052 PRU_SLOWTIMO);
1053 #endif
1054 dropit:
1055 tcpstat.tcps_keepdrops++;
1056 postevent(so, 0, EV_TIMEOUT);
1057 soevent(so,
1058 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
1059 tp = tcp_drop(tp, ETIMEDOUT);
1060 break;
1061 }
1062 return (tp);
1063 }
1064
1065 /* Remove a timer entry from timer list */
1066 void
1067 tcp_remove_timer(struct tcpcb *tp)
1068 {
1069 struct tcptimerlist *listp = &tcp_timer_list;
1070
1071 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1072 if (!(TIMER_IS_ON_LIST(tp))) {
1073 return;
1074 }
1075 lck_mtx_lock(listp->mtx);
1076
1077 /* Check if pcb is on timer list again after acquiring the lock */
1078 if (!(TIMER_IS_ON_LIST(tp))) {
1079 lck_mtx_unlock(listp->mtx);
1080 return;
1081 }
1082
1083 if (listp->next_te != NULL && listp->next_te == &tp->tentry)
1084 listp->next_te = LIST_NEXT(&tp->tentry, le);
1085
1086 LIST_REMOVE(&tp->tentry, le);
1087 tp->t_flags &= ~(TF_TIMER_ONLIST);
1088
1089 listp->entries--;
1090
1091 tp->tentry.le.le_next = NULL;
1092 tp->tentry.le.le_prev = NULL;
1093 lck_mtx_unlock(listp->mtx);
1094 }
1095
1096 /* Function to check if the timerlist needs to be rescheduled to run
1097 * the timer entry correctly. Basically, this is to check if we can avoid
1098 * taking the list lock.
1099 */
1100
1101 static boolean_t
1102 need_to_resched_timerlist(uint32_t runtime, uint16_t index) {
1103 struct tcptimerlist *listp = &tcp_timer_list;
1104 int32_t diff;
1105 boolean_t is_fast;
1106
1107 if (runtime == 0 || index == TCPT_NONE)
1108 return FALSE;
1109 is_fast = !(IS_TIMER_SLOW(index));
1110
1111 /* If the list is being processed then the state of the list is in flux.
1112 * In this case always acquire the lock and set the state correctly.
1113 */
1114 if (listp->running) {
1115 return TRUE;
1116 }
1117
1118 diff = timer_diff(listp->runtime, 0, runtime, 0);
1119 if (diff <= 0) {
1120 /* The list is going to run before this timer */
1121 return FALSE;
1122 } else {
1123 if (is_fast) {
1124 if (diff <= listp->fast_quantum)
1125 return FALSE;
1126 } else {
1127 if (diff <= listp->slow_quantum)
1128 return FALSE;
1129 }
1130 }
1131 return TRUE;
1132 }
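/*
 * Example: if the list is already scheduled to run at tcp_now + 200 ticks and
 * a fast timer now wants to run at tcp_now + 150, diff is 50; as long as that
 * lateness stays within one fast quantum the existing schedule is considered
 * close enough and neither a reschedule nor the list lock is needed.
 */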
1133
1134 void
1135 tcp_sched_timerlist(uint32_t offset)
1136 {
1137
1138 uint64_t deadline = 0;
1139 struct tcptimerlist *listp = &tcp_timer_list;
1140
1141 lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);
1142
1143 listp->runtime = tcp_now + offset;
1144
1145 clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ,
1146 &deadline);
1147
1148 thread_call_enter_delayed(listp->call, deadline);
1149 }
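/*
 * Note on the conversion above: the offset is expressed in TCP_RETRANSHZ
 * ticks, so clock_interval_to_deadline() is given a scale factor of
 * NSEC_PER_SEC / TCP_RETRANSHZ nanoseconds per tick.  If, for example,
 * TCP_RETRANSHZ were 1000, an offset of 100 ticks would become a deadline
 * 100 ms from now; listp->runtime records the same instant in tcp_now units
 * for the checks in need_to_resched_timerlist().
 */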
1150
1151 /* Function to run the timers for a connection.
1152 *
1153 * Returns the offset of next timer to be run for this connection which
1154 * can be used to reschedule the timerlist.
1155 */
1156 uint32_t
1157 tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
1158
1159 struct socket *so;
1160 uint16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
1161 uint32_t timer_val, offset = 0, lo_timer = 0;
1162 int32_t diff;
1163 boolean_t needtorun[TCPT_NTIMERS];
1164 int count = 0;
1165
1166 VERIFY(tp != NULL);
1167 bzero(needtorun, sizeof(needtorun));
1168
1169 tcp_lock(tp->t_inpcb->inp_socket, 1, 0);
1170
1171 so = tp->t_inpcb->inp_socket;
1172 /* Release the want count on inp */
1173 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) {
1174 if (TIMER_IS_ON_LIST(tp)) {
1175 tcp_remove_timer(tp);
1176 }
1177
1178 /* Looks like the TCP connection got closed while we
1179 * were waiting for the lock. Done.
1180 */
1181 goto done;
1182 }
1183
1184 /* Since the timer thread needs to wait for tcp lock, it may race
1185 * with another thread that can cancel or reschedule the timer that is
1186 * about to run. Check if we need to run anything.
1187 */
1188 index = tp->tentry.index;
1189 timer_val = tp->t_timer[index];
1190
1191 if (index == TCPT_NONE || tp->tentry.runtime == 0)
1192 goto done;
1193
1194 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
1195 if (diff > 0) {
1196 if (tp->tentry.index != TCPT_NONE) {
1197 offset = diff;
1198 *(next_index) = tp->tentry.index;
1199 }
1200 goto done;
1201 }
1202
1203 tp->t_timer[index] = 0;
1204 if (timer_val > 0) {
1205 tp = tcp_timers(tp, index);
1206 if (tp == NULL)
1207 goto done;
1208 }
1209
1210 /* Check if there are any other timers that need to be run. While doing it,
1211 * adjust the timer values wrt tcp_now.
1212 */
1213 for (i = 0; i < TCPT_NTIMERS; ++i) {
1214 if (tp->t_timer[i] != 0) {
1215 diff = timer_diff(tp->tentry.timer_start, tp->t_timer[i], tcp_now, 0);
1216 if (diff <= 0) {
1217 tp->t_timer[i] = 0;
1218 needtorun[i] = TRUE;
1219 count++;
1220 } else {
1221 tp->t_timer[i] = diff;
1222 needtorun[i] = FALSE;
1223 if (lo_timer == 0 || diff < lo_timer) {
1224 lo_timer = diff;
1225 lo_index = i;
1226 }
1227 }
1228 }
1229 }
1230
1231 tp->tentry.timer_start = tcp_now;
1232 tp->tentry.index = lo_index;
1233 if (lo_index != TCPT_NONE) {
1234 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
1235 } else {
1236 tp->tentry.runtime = 0;
1237 }
1238
1239 if (count > 0) {
1240 /* run any other timers that are also outstanding at this time. */
1241 for (i = 0; i < TCPT_NTIMERS; ++i) {
1242 if (needtorun[i]) {
1243 tp->t_timer[i] = 0;
1244 tp = tcp_timers(tp, i);
1245 if (tp == NULL)
1246 goto done;
1247 }
1248 }
1249 tcp_set_lotimer_index(tp);
1250 }
1251
1252 if (tp->tentry.index < TCPT_NONE) {
1253 offset = tp->t_timer[tp->tentry.index];
1254 *(next_index) = tp->tentry.index;
1255 }
1256
1257 done:
1258 if (tp != NULL && tp->tentry.index == TCPT_NONE) {
1259 tcp_remove_timer(tp);
1260 }
1261 tcp_unlock(so, 1, 0);
1262 return offset;
1263 }
1264
1265 void
1266 tcp_run_timerlist(void * arg1, void * arg2) {
1267
1268 #pragma unused(arg1, arg2)
1269
1270 struct tcptimerentry *te, *next_te;
1271 struct tcptimerlist *listp = &tcp_timer_list;
1272 struct tcpcb *tp;
1273 uint32_t next_timer = 0;
1274 uint16_t index = TCPT_NONE;
1275 boolean_t need_fast = FALSE;
1276 uint32_t active_count = 0;
1277 uint32_t mode = TCP_TIMERLIST_FASTMODE;
1278
1279 calculate_tcp_clock();
1280
1281 lck_mtx_lock(listp->mtx);
1282
1283 listp->running = TRUE;
1284
1285 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
1286 uint32_t offset = 0;
1287 uint32_t runtime = te->runtime;
1288 if (TSTMP_GT(runtime, tcp_now)) {
1289 offset = timer_diff(runtime, 0, tcp_now, 0);
1290 if (next_timer == 0 || offset < next_timer) {
1291 next_timer = offset;
1292 }
1293 continue;
1294 }
1295 active_count++;
1296
1297 tp = TIMERENTRY_TO_TP(te);
1298
1299 /* Acquire an inp wantcnt on the inpcb so that the socket won't get
1300 * detached even if tcp_close is called
1301 */
1302 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
1303 /* Somehow this pcb went into the dead state while on the timer list;
1304 * just take it off the list. Since the timer list entry pointers
1305 * are protected by the timer list lock, we can do it here
1306 */
1307 if (TIMER_IS_ON_LIST(tp)) {
1308 tp->t_flags &= ~(TF_TIMER_ONLIST);
1309 LIST_REMOVE(&tp->tentry, le);
1310 listp->entries--;
1311
1312 tp->tentry.le.le_next = NULL;
1313 tp->tentry.le.le_prev = NULL;
1314 }
1315 continue;
1316 }
1317
1318 /* Store the next timerentry pointer before releasing the list lock.
1319 * If that entry has to be removed when we release the lock, this
1320 * pointer will be updated to the element after that.
1321 */
1322 listp->next_te = next_te;
1323
1324 VERIFY_NEXT_LINK(&tp->tentry, le);
1325 VERIFY_PREV_LINK(&tp->tentry, le);
1326
1327 lck_mtx_unlock(listp->mtx);
1328
1329 index = TCPT_NONE;
1330 offset = tcp_run_conn_timer(tp, &index);
1331
1332 lck_mtx_lock(listp->mtx);
1333
1334 next_te = listp->next_te;
1335 listp->next_te = NULL;
1336
1337 if (offset > 0) {
1338 if (index < TCPT_NONE) {
1339 /* Check if this is a fast_timer. */
1340 if (!need_fast && !(IS_TIMER_SLOW(index))) {
1341 need_fast = TRUE;
1342 }
1343
1344 if (next_timer == 0 || offset < next_timer) {
1345 next_timer = offset;
1346 }
1347 }
1348 }
1349 }
1350
1351 if (!LIST_EMPTY(&listp->lhead)) {
1352 if (listp->mode == TCP_TIMERLIST_FASTMODE) {
1353 if (need_fast || active_count > 0 ||
1354 listp->pref_mode == TCP_TIMERLIST_FASTMODE) {
1355 listp->idlegen = 0;
1356 } else {
1357 listp->idlegen++;
1358 if (listp->idlegen > timer_fastmode_idlemax) {
1359 mode = TCP_TIMERLIST_SLOWMODE;
1360 listp->idlegen = 0;
1361 }
1362 }
1363 } else {
1364 if (!need_fast) {
1365 mode = TCP_TIMERLIST_SLOWMODE;
1366 }
1367 }
1368
1369 if (mode == TCP_TIMERLIST_FASTMODE ||
1370 listp->pref_mode == TCP_TIMERLIST_FASTMODE) {
1371 next_timer = listp->fast_quantum;
1372 } else {
1373 if (listp->pref_offset != 0 &&
1374 listp->pref_offset < next_timer)
1375 next_timer = listp->pref_offset;
1376 if (next_timer < listp->slow_quantum)
1377 next_timer = listp->slow_quantum;
1378 }
1379
1380 listp->mode = mode;
1381
1382 tcp_sched_timerlist(next_timer);
1383 } else {
1384 /* No need to reschedule this timer */
1385 listp->runtime = 0;
1386 }
1387
1388 listp->running = FALSE;
1389 listp->pref_mode = 0;
1390 listp->pref_offset = 0;
1391
1392 lck_mtx_unlock(listp->mtx);
1393 }
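/*
 * Mode handling summary for tcp_run_timerlist(): the list stays in fast mode
 * as long as a pass processed at least one connection (or a fast timer was
 * requested while it ran); after timer_fastmode_idlemax consecutive idle
 * passes it drops to slow mode, and a single fast timer request is enough to
 * switch it back.
 */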
1394
1395 /* Function to verify if a change in timer state is required for a connection */
1396 void
1397 tcp_sched_timers(struct tcpcb *tp)
1398 {
1399 struct tcptimerentry *te = &tp->tentry;
1400 uint16_t index = te->index;
1401 struct tcptimerlist *listp = &tcp_timer_list;
1402 uint32_t offset = 0;
1403 boolean_t is_fast;
1404 int list_locked = 0;
1405
1406 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
1407 /* Just return without adding the dead pcb to the list */
1408 if (TIMER_IS_ON_LIST(tp)) {
1409 tcp_remove_timer(tp);
1410 }
1411 return;
1412 }
1413
1414 if (index == TCPT_NONE) {
1415 tcp_remove_timer(tp);
1416 return;
1417 }
1418
1419 is_fast = !(IS_TIMER_SLOW(index));
1420 offset = te->runtime - tcp_now;
1421 if (offset == 0) {
1422 offset = 1;
1423 tcp_timer_advanced++;
1424 }
1425 if (is_fast)
1426 offset = listp->fast_quantum;
1427
1428 if (!TIMER_IS_ON_LIST(tp)) {
1429 if (!list_locked) {
1430 lck_mtx_lock(listp->mtx);
1431 list_locked = 1;
1432 }
1433
1434 LIST_INSERT_HEAD(&listp->lhead, te, le);
1435 tp->t_flags |= TF_TIMER_ONLIST;
1436
1437 listp->entries++;
1438 if (listp->entries > listp->maxentries)
1439 listp->maxentries = listp->entries;
1440
1441 /* if the list is not scheduled, just schedule it */
1442 if (listp->runtime == 0)
1443 goto schedule;
1444
1445 }
1446
1447
1448 /* timer entry is currently on the list */
1449 if (need_to_resched_timerlist(te->runtime, index)) {
1450 tcp_resched_timerlist++;
1451
1452 if (!list_locked) {
1453 lck_mtx_lock(listp->mtx);
1454 list_locked = 1;
1455 }
1456
1457 VERIFY_NEXT_LINK(te, le);
1458 VERIFY_PREV_LINK(te, le);
1459
1460 if (listp->running) {
1461 if (is_fast) {
1462 listp->pref_mode = TCP_TIMERLIST_FASTMODE;
1463 } else if (listp->pref_offset == 0 ||
1464 ((int)offset) < listp->pref_offset) {
1465 listp->pref_offset = offset;
1466 }
1467 } else {
1468 int32_t diff;
1469 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
1470 if (diff <= 0) {
1471 /* The list is going to run before this timer */
1472 goto done;
1473 } else {
1474 goto schedule;
1475 }
1476 }
1477 }
1478 goto done;
1479
1480 schedule:
1481 if (is_fast) {
1482 listp->mode = TCP_TIMERLIST_FASTMODE;
1483 listp->idlegen = 0;
1484 }
1485 tcp_sched_timerlist(offset);
1486
1487 done:
1488 if (list_locked)
1489 lck_mtx_unlock(listp->mtx);
1490
1491 return;
1492 }
1493
1494 void
1495 tcp_set_lotimer_index(struct tcpcb *tp) {
1496 uint16_t i, lo_index = TCPT_NONE;
1497 uint32_t lo_timer = 0;
1498 for (i = 0; i < TCPT_NTIMERS; ++i) {
1499 if (tp->t_timer[i] != 0 &&
1500 (lo_timer == 0 || tp->t_timer[i] < lo_timer)) {
1501 lo_timer = tp->t_timer[i];
1502 lo_index = i;
1503 }
1504 }
1505 tp->tentry.index = lo_index;
1506 if (lo_index != TCPT_NONE) {
1507 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
1508 } else {
1509 tp->tentry.runtime = 0;
1510 }
1511 }
1512
1513 void
1514 tcp_check_timer_state(struct tcpcb *tp) {
1515
1516 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1517
1518 tcp_set_lotimer_index(tp);
1519
1520 tcp_sched_timers(tp);
1521 return;
1522 }