1 /*
2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/kernel.h>
68 #include <sys/mbuf.h>
69 #include <sys/sysctl.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/protosw.h>
73 #include <sys/domain.h>
74 #include <sys/mcache.h>
75 #include <sys/queue.h>
76 #include <kern/locks.h>
77
78 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
79
80 #include <net/route.h>
81
82 #include <netinet/in.h>
83 #include <netinet/in_systm.h>
84 #include <netinet/in_pcb.h>
85 #if INET6
86 #include <netinet6/in6_pcb.h>
87 #endif
88 #include <netinet/ip_var.h>
89 #include <netinet/tcp.h>
90 #include <netinet/tcp_fsm.h>
91 #include <netinet/tcp_seq.h>
92 #include <netinet/tcp_timer.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/tcp_cc.h>
95 #if INET6
96 #include <netinet6/tcp6_var.h>
97 #endif
98 #include <netinet/tcpip.h>
99 #if TCPDEBUG
100 #include <netinet/tcp_debug.h>
101 #endif
102 #include <sys/kdebug.h>
103 #include <mach/sdt.h>
104
105 extern void postevent(struct socket *, struct sockbuf *,
106 int);
107 #define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8))
108 #define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1)
109
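/*
 * Map a pointer to a connection's timer-list entry back to the enclosing
 * tcpcb (container_of-style pointer arithmetic). The le.le_next member is
 * used as the reference field, so this assumes it lies at the very start
 * of the embedded tentry.
 */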
110 #define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))
111
112 #define VERIFY_NEXT_LINK(elm,field) do { \
113 if (LIST_NEXT((elm),field) != NULL && \
114 LIST_NEXT((elm),field)->field.le_prev != \
115 &((elm)->field.le_next)) \
116 panic("Bad link elm %p next->prev != elm", (elm)); \
117 } while(0)
118
119 #define VERIFY_PREV_LINK(elm,field) do { \
120 if (*(elm)->field.le_prev != (elm)) \
121 panic("Bad link elm %p prev->next != elm", (elm)); \
122 } while(0)
123
124 static int background_io_trigger = 5;
125 SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED,
126 &background_io_trigger, 0, "Background IO Trigger Setting");
127
128 static int
129 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
130 {
131 #pragma unused(arg1, arg2)
132 int error, s, tt;
133
134 tt = *(int *)oidp->oid_arg1;
135 s = tt * 1000 / TCP_RETRANSHZ;
136
137 error = sysctl_handle_int(oidp, &s, 0, req);
138 if (error || !req->newptr)
139 return (error);
140
141 tt = s * TCP_RETRANSHZ / 1000;
142 if (tt < 1)
143 return (EINVAL);
144
145 *(int *)oidp->oid_arg1 = tt;
146 return (0);
147 }
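/*
 * Illustrative round trip, assuming TCP_RETRANSHZ is 1000 (1 ms ticks):
 * the conversion above is then an identity, since ticks are milliseconds.
 * With a hypothetical coarser rate such as 100 ticks/sec, an internal
 * value of 7500 ticks would read back as 75000 ms and a write of 30000 ms
 * would store 3000 ticks; a write that converts to less than one tick is
 * rejected with EINVAL.
 */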
148
149 int tcp_keepinit;
150 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
151 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
152
153 int tcp_keepidle;
154 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
155 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
156
157 int tcp_keepintvl;
158 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
159 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
160
161 int tcp_msl;
162 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
163 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
164
165 /*
166 * Avoid DoS via TCP Robustness in Persist Condition (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
167 * by allowing a system-wide maximum persistence timeout value when in Zero Window Probe mode.
168 * Expressed in milliseconds to be consistent with other timeout-related values; the TCP socket option is in seconds.
169 */
170 u_int32_t tcp_max_persist_timeout = 0;
171 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
172 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP");
173
174 static int always_keepalive = 0;
175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
176 &always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");
177
178 /* This parameter determines how long the timer list will stay in fast mode even
179 * though all connections are idle. In fast mode, the timer will fire more frequently
180 * anticipating new data.
181 */
182 int timer_fastmode_idlemax = TCP_FASTMODE_IDLEGEN_MAX;
183 SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, CTLFLAG_RW | CTLFLAG_LOCKED,
184 &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");
185
186 /*
187 * See tcp_syn_backoff[] for interval values between SYN retransmits;
188 * the value set below defines the number of retransmits before we
189 * disable the timestamp and window scaling options during subsequent
190 * SYN retransmits. Setting it to 0 disables the dropping of those
191 * two options.
192 */
193 static int tcp_broken_peer_syn_rxmit_thres = 7;
194 SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CTLFLAG_LOCKED,
195 &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before "
196 "TCP disables rfc1323 and rfc1644 during the rest of attempts");
197
198 static int tcp_timer_advanced = 0;
199 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED,
200 &tcp_timer_advanced, 0, "Number of times one of the timers was advanced");
201
202 static int tcp_resched_timerlist = 0;
203 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, CTLFLAG_RD | CTLFLAG_LOCKED,
204 &tcp_resched_timerlist, 0,
205 "Number of times timer list was rescheduled as part of processing a packet");
206
207 int tcp_pmtud_black_hole_detect = 1;
208 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW | CTLFLAG_LOCKED,
209 &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection");
210
211 int tcp_pmtud_black_hole_mss = 1200;
212 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED,
213 &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS");
214
215 static int tcp_keepcnt = TCPTV_KEEPCNT;
216 static int tcp_gc_done = FALSE; /* performed garbage collection of "used" sockets */
217 /* max idle probes */
218 int tcp_maxpersistidle;
219 /* max idle time in persist */
220 int tcp_maxidle;
221
222 /* The TCP delack timer is set to 100 ms. Since the timer list in fast
223 * mode is processed no more often than every 100 ms, the delayed ack timer will fire somewhere
224 * between 100 and 200 ms.
225 */
226 int tcp_delack = TCP_RETRANSHZ / 10;
227
228 struct inpcbhead time_wait_slots[N_TIME_WAIT_SLOTS];
229 int cur_tw_slot = 0;
230
231 /* tcp timer list */
232 struct tcptimerlist tcp_timer_list;
233
234 /* The frequency of running through the TCP timer list in
235 * fast and slow mode can be configured.
236 */
237 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_fastquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
238 &tcp_timer_list.fast_quantum, TCP_FASTTIMER_QUANTUM,
239 "Frequency of running timer list in fast mode");
240
241 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_slowquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
242 &tcp_timer_list.slow_quantum, TCP_SLOWTIMER_QUANTUM,
243 "Frequency of running timer list in slow mode");
244
245 static void tcp_remove_timer(struct tcpcb *tp);
246 static void tcp_sched_timerlist(uint32_t offset);
247 static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index);
248 static void tcp_sched_timers(struct tcpcb *tp);
249 static inline void tcp_set_lotimer_index(struct tcpcb *);
250
251 /* Compare two timers. If there is a reset of the sign bit, it is
252 * safe to assume that the timer has wrapped around. By doing a signed comparison,
253 * we take care of wraparound such that the value with the sign bit reset is
254 * actually ahead of the other.
255 */
256
257 static inline int32_t
258 timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
259 return (int32_t)((t1 + toff1) - (t2 + toff2));
260 };
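/*
 * Illustrative wraparound case: if (t1 + toff1) has wrapped to 0x00000010
 * while (t2 + toff2) is 0xfffffff0, the unsigned subtraction yields
 * 0x00000020 and the cast to int32_t gives +32, so the wrapped timer is
 * correctly treated as 32 ticks ahead of the other.
 */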
261
262 /* Returns true if the timer is on the timer list */
263 #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
264
265
266 void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
267 void add_to_time_wait(struct tcpcb *tp, uint32_t delay) ;
268
269 static void tcp_garbage_collect(struct inpcb *, int);
270
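/*
 * Park a connection on the TIME_WAIT wheel: remove the pcb from the main
 * TCP pcb list and insert it into the slot that will expire after "delay"
 * ticks. The caller must hold the pcbinfo lock exclusively;
 * add_to_time_wait() below acquires it when needed.
 */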
271 void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
272 {
273 int tw_slot;
274 struct inpcbinfo *pcbinfo = &tcbinfo;
275 uint32_t timer;
276
277 /* pcb list should be locked when we get here */
278 lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE);
279
280 LIST_REMOVE(tp->t_inpcb, inp_list);
281
282 /* if (tp->t_timer[TCPT_2MSL] <= 0)
283 tp->t_timer[TCPT_2MSL] = 1; */
284
285 /*
286 * Because we're pulling this pcb out of the main TCP pcb list,
287 * we need to recalculate the TCPT_2MSL timer value for tcp_slowtimo's
288 * coarser timer granularity.
289 */
290
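/*
 * Illustrative numbers, assuming TCP_RETRANSHZ is 1000 (1 ms ticks),
 * PR_SLOWHZ is 2 and N_TIME_WAIT_SLOTS is 128: a delay of 60000 ms becomes
 * 120 slow-timer ticks, so the pcb is inserted 120 slots ahead of
 * cur_tw_slot (modulo 128) and is reaped as the wheel advances past it.
 */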
291 timer = (delay / TCP_RETRANSHZ) * PR_SLOWHZ;
292 tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ;
293
294 tp->t_rcvtime += timer & (N_TIME_WAIT_SLOTS - 1);
295
296 tw_slot = (timer & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot;
297 if (tw_slot >= N_TIME_WAIT_SLOTS)
298 tw_slot -= N_TIME_WAIT_SLOTS;
299
300 LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
301 }
302
303 void add_to_time_wait(struct tcpcb *tp, uint32_t delay)
304 {
305 struct inpcbinfo *pcbinfo = &tcbinfo;
306
307 if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
308 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
309 lck_rw_lock_exclusive(pcbinfo->mtx);
310 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
311 }
312 add_to_time_wait_locked(tp, delay);
313 lck_rw_done(pcbinfo->mtx);
314 }
315
316 static void
317 tcp_garbage_collect(struct inpcb *inp, int istimewait)
318 {
319 struct socket *so;
320 struct tcpcb *tp;
321
322 so = inp->inp_socket;
323 tp = intotcpcb(inp);
324
325 /*
326 * Skip if still in use or busy; it would have been more efficient
327 * if we were to test so_usecount against 0, but this isn't possible
328 * due to the current implementation of tcp_dropdropablreq() where
329 * overflow sockets that are eligible for garbage collection have
330 * their usecounts set to 1.
331 */
332 if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(&inp->inpcb_mtx))
333 return;
334
335 /* Check again under the lock */
336 if (so->so_usecount > 1) {
337 lck_mtx_unlock(&inp->inpcb_mtx);
338 return;
339 }
340
341 /*
342 * Overflowed socket dropped from the listening queue? Do this
343 * only if we are called to clean up the time wait slots, since
344 * tcp_dropdropablreq() considers a socket to have been fully
345 * dropped after add_to_time_wait() is finished.
346 * Also handle the case of connections that get closed by the peer while
347 * in the queue, as seen in rdar://6422317.
348 *
349 */
350 if (so->so_usecount == 1 &&
351 ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
352 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && (so->so_head != NULL)
353 && ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
354 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
355
356 if (inp->inp_state != INPCB_STATE_DEAD) {
357 /* Become a regular mutex */
358 lck_mtx_convert_spin(&inp->inpcb_mtx);
359 #if INET6
360 if (INP_CHECK_SOCKAF(so, AF_INET6))
361 in6_pcbdetach(inp);
362 else
363 #endif /* INET6 */
364 in_pcbdetach(inp);
365 }
366 so->so_usecount--;
367 lck_mtx_unlock(&inp->inpcb_mtx);
368 return;
369 } else if (inp->inp_wantcnt != WNT_STOPUSING) {
370 lck_mtx_unlock(&inp->inpcb_mtx);
371 return;
372 }
373
374 /*
375 * We get here because the PCB is no longer searchable (WNT_STOPUSING);
376 * detach (if needed) and dispose if it is dead (usecount is 0). This
377 * covers all cases, including overflow sockets and those that are
378 * considered "embryonic", i.e. created by sonewconn() in the TCP input
379 * path, and have not yet been committed. For the former, we reduce
380 * the usecount to 0 as done by the code above. For the latter, the
381 * usecount would have been reduced to 0 as part of calling soabort() when the
382 * socket is dropped at the end of tcp_input().
383 */
384 if (so->so_usecount == 0) {
385 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
386 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
387 /* Become a regular mutex */
388 lck_mtx_convert_spin(&inp->inpcb_mtx);
389
390 /* If this tp still happens to be on the timer list,
391 * take it out
392 */
393 if (TIMER_IS_ON_LIST(tp)) {
394 tcp_remove_timer(tp);
395 }
396
397 if (inp->inp_state != INPCB_STATE_DEAD) {
398 #if INET6
399 if (INP_CHECK_SOCKAF(so, AF_INET6))
400 in6_pcbdetach(inp);
401 else
402 #endif /* INET6 */
403 in_pcbdetach(inp);
404 }
405 in_pcbdispose(inp);
406 } else {
407 lck_mtx_unlock(&inp->inpcb_mtx);
408 }
409 }
410
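/*
 * Slow timeout processing: close connections in the current TIME_WAIT slot
 * whose 2MSL timer has expired, garbage-collect dead sockets on both the
 * main pcb list and the time-wait slot, and then advance the wheel.
 */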
411 void
412 tcp_slowtimo(void)
413 {
414 struct inpcb *inp, *nxt;
415 struct tcpcb *tp;
416 #if TCPDEBUG
417 int ostate;
418 #endif
419
420 #if KDEBUG
421 static int tws_checked = 0;
422 #endif
423
424 struct inpcbinfo *pcbinfo = &tcbinfo;
425
426 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);
427
428 tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
429
430 /* Update tcp_now here as it may get used while processing the slow timer */
431 calculate_tcp_clock();
432
433 /* Garbage collect socket/tcpcb: We need to acquire the list lock
434 * exclusively to do this
435 */
436
437 if (lck_rw_try_lock_exclusive(pcbinfo->mtx) == FALSE) {
438 if (tcp_gc_done == TRUE) { /* don't sweat it this time. cleanup was done last time */
439 tcp_gc_done = FALSE;
440 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
441 return; /* couldn't get the lock; cleanup ran last time, so skip it this round */
442 }
443 lck_rw_lock_exclusive(pcbinfo->mtx); /* couldn't get the lock and cleanup was skipped last time, so block for it now */
444 }
445 tcp_gc_done = TRUE;
446
447 /*
448 * Process the items in the current time-wait slot
449 */
450 #if KDEBUG
451 tws_checked = 0;
452 #endif
453 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);
454
455 LIST_FOREACH(inp, &time_wait_slots[cur_tw_slot], inp_list) {
456 #if KDEBUG
457 tws_checked++;
458 #endif
459
460 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
461 continue;
462
463 tcp_lock(inp->inp_socket, 1, 0);
464
465 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
466 goto twunlock;
467
468 tp = intotcpcb(inp);
469 if (tp == NULL) /* tp already closed, remove from list */
470 goto twunlock;
471
472 if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
473 tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
474 tp->t_rcvtime += N_TIME_WAIT_SLOTS;
475 }
476 else
477 tp->t_timer[TCPT_2MSL] = 0;
478
479 if (tp->t_timer[TCPT_2MSL] == 0) {
480
481 /* That pcb is ready for a close */
482 tcp_free_sackholes(tp);
483 tp = tcp_close(tp);
484 }
485 twunlock:
486 tcp_unlock(inp->inp_socket, 1, 0);
487 }
488
489
490 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
491 tcp_garbage_collect(inp, 0);
492 }
493
494 /* Now cleanup the time wait ones */
495 LIST_FOREACH_SAFE(inp, &time_wait_slots[cur_tw_slot], inp_list, nxt) {
496 tcp_garbage_collect(inp, 1);
497 }
498
499 if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
500 cur_tw_slot = 0;
501
502 lck_rw_done(pcbinfo->mtx);
503 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
504 }
505
506 /*
507 * Cancel all timers for TCP tp.
508 */
509 void
510 tcp_canceltimers(tp)
511 struct tcpcb *tp;
512 {
513 register int i;
514
515 tcp_remove_timer(tp);
516 for (i = 0; i < TCPT_NTIMERS; i++)
517 tp->t_timer[i] = 0;
518 tp->tentry.timer_start = tcp_now;
519 tp->tentry.index = TCPT_NONE;
520 }
521
522 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
523 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
524
525 int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
526 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
527
528 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
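/* 1 + 2 + 4 + 8 + 16 + 32 + 64 * 7 == 511 */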
529
530 /*
531 * TCP timer processing.
532 */
533 struct tcpcb *
534 tcp_timers(tp, timer)
535 register struct tcpcb *tp;
536 int timer;
537 {
538 register int rexmt;
539 struct socket *so_tmp;
540 struct tcptemp *t_template;
541 int optlen = 0;
542 int idle_time = 0;
543
544 #if TCPDEBUG
545 int ostate;
546 #endif
547
548 #if INET6
549 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
550 #endif /* INET6 */
551
552 so_tmp = tp->t_inpcb->inp_socket;
553 idle_time = tcp_now - tp->t_rcvtime;
554
555 switch (timer) {
556
557 /*
558 * 2 MSL timeout in shutdown went off. If we're closed but
559 * still waiting for peer to close and connection has been idle
560 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
561 * delete connection control block.
562 * Otherwise (this case shouldn't happen), check again in a bit;
563 * we keep the socket in the main list in that case.
564 */
565 case TCPT_2MSL:
566 tcp_free_sackholes(tp);
567 if (tp->t_state != TCPS_TIME_WAIT &&
568 tp->t_state != TCPS_FIN_WAIT_2 &&
569 ((idle_time > 0) && (idle_time < tcp_maxidle))) {
570 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, (u_int32_t)tcp_keepintvl);
571 }
572 else {
573 tp = tcp_close(tp);
574 return(tp);
575 }
576 break;
577
578 /*
579 * Retransmission timer went off. Message has not
580 * been acked within retransmit interval. Back off
581 * to a longer retransmit interval and retransmit one segment.
582 */
583 case TCPT_REXMT:
584 tcp_free_sackholes(tp);
585 /* Drop the connection in the retransmit timer if any of the following hold:
586 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times
587 * 2. If the time spent in this retransmission episode is more than
588 * the time limit set with TCP_RXT_CONNDROPTIME socket option
589 * 3. If TCP_RXT_FINDROP socket option was set and we have already
590 * retransmitted the FIN 3 times without receiving an ack
591 */
592 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
593 (tp->rxt_conndroptime > 0 && tp->rxt_start > 0 &&
594 (tcp_now - tp->rxt_start) >= tp->rxt_conndroptime) ||
595 ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
596 (tp->t_flags & TF_SENTFIN) != 0 &&
597 tp->t_rxtshift >= 4)) {
598
599 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
600 tcpstat.tcps_rxtfindrop++;
601 } else {
602 tcpstat.tcps_timeoutdrop++;
603 }
604 tp->t_rxtshift = TCP_MAXRXTSHIFT;
605 tp = tcp_drop(tp, tp->t_softerror ?
606 tp->t_softerror : ETIMEDOUT);
607 postevent(so_tmp, 0, EV_TIMEOUT);
608 break;
609 }
610
611 if (tp->t_rxtshift == 1) {
612 /*
613 * first retransmit; record ssthresh and cwnd so they can
614 * be recovered if this turns out to be a "bad" retransmit.
615 * A retransmit is considered "bad" if an ACK for this
616 * segment is received within RTT/2 interval; the assumption
617 * here is that the ACK was already in flight. See
618 * "On Estimating End-to-End Network Path Properties" by
619 * Allman and Paxson for more details.
620 */
621 tp->snd_cwnd_prev = tp->snd_cwnd;
622 tp->snd_ssthresh_prev = tp->snd_ssthresh;
623 tp->snd_recover_prev = tp->snd_recover;
624 if (IN_FASTRECOVERY(tp))
625 tp->t_flags |= TF_WASFRECOVERY;
626 else
627 tp->t_flags &= ~TF_WASFRECOVERY;
628 tp->t_badrxtwin = tcp_now + (tp->t_srtt >> (TCP_RTT_SHIFT));
629
630 /* Set the time at which retransmission on this
631 * connection started
632 */
633 tp->rxt_start = tcp_now;
634 }
635 tcpstat.tcps_rexmttimeo++;
636 if (tp->t_state == TCPS_SYN_SENT)
637 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
638 else
639 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
640 TCPT_RANGESET(tp->t_rxtcur, rexmt,
641 tp->t_rttmin, TCPTV_REXMTMAX,
642 TCP_ADD_REXMTSLOP(tp));
643 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
644
645 /*
646 * Check for potential Path MTU Discovery Black Hole
647 */
648
649 if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
650 if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && (tp->t_rxtshift == 2)) {
651 /*
652 * Enter Path MTU Black-hole Detection mechanism:
653 * - Disable Path MTU Discovery (IP "DF" bit).
654 * - Reduce MTU to a lower value than what we negotiated with the peer.
655 */
656
657 tp->t_flags &= ~TF_PMTUD; /* Disable Path MTU Discovery for now */
658 tp->t_flags |= TF_BLACKHOLE; /* Record that we may have found a black hole */
659 optlen = tp->t_maxopd - tp->t_maxseg;
660 tp->t_pmtud_saved_maxopd = tp->t_maxopd; /* Keep track of previous MSS */
661 if (tp->t_maxopd > tcp_pmtud_black_hole_mss)
662 tp->t_maxopd = tcp_pmtud_black_hole_mss; /* Reduce the MSS to intermediary value */
663 else {
664 tp->t_maxopd = /* use the default MSS */
665 #if INET6
666 isipv6 ? tcp_v6mssdflt :
667 #endif /* INET6 */
668 tcp_mssdflt;
669 }
670 tp->t_maxseg = tp->t_maxopd - optlen;
671
672 /*
673 * Reset the slow-start flight size as it may depend on the new MSS
674 */
675 if (CC_ALGO(tp)->cwnd_init != NULL)
676 CC_ALGO(tp)->cwnd_init(tp);
677 }
678 /*
679 * If further retransmissions are still unsuccessful with a lowered MTU,
680 * maybe this isn't a Black Hole, so restore the previous MSS and
681 * the blackhole detection flags.
682 */
683 else {
684
685 if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) {
686 tp->t_flags |= TF_PMTUD;
687 tp->t_flags &= ~TF_BLACKHOLE;
688 optlen = tp->t_maxopd - tp->t_maxseg;
689 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
690 tp->t_maxseg = tp->t_maxopd - optlen;
691 /*
692 * Reset the slow-start flight size as it may depend on the new MSS
693 */
694 if (CC_ALGO(tp)->cwnd_init != NULL)
695 CC_ALGO(tp)->cwnd_init(tp);
696 }
697 }
698 }
699
700
701 /*
702 * Disable rfc1323 and rfc1644 if we haven't got any response to
703 * our SYN (after we reach the threshold) to work-around some
704 * broken terminal servers (most of which have hopefully been
705 * retired) that have bad VJ header compression code which
706 * trashes TCP segments containing unknown-to-them TCP options.
707 */
708 if ((tp->t_state == TCPS_SYN_SENT) &&
709 (tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres))
710 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
711 /*
712 * If losing, let the lower level know and try for
713 * a better route. Also, if we backed off this far,
714 * our srtt estimate is probably bogus. Clobber it
715 * so we'll take the next rtt measurement as our srtt;
716 * move the current srtt into rttvar to keep the current
717 * retransmit times until then.
718 */
719 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
720 #if INET6
721 if (isipv6)
722 in6_losing(tp->t_inpcb);
723 else
724 #endif /* INET6 */
725 in_losing(tp->t_inpcb);
726 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
727 tp->t_srtt = 0;
728 }
729 tp->snd_nxt = tp->snd_una;
730 /*
731 * Note: We overload snd_recover to function also as the
732 * snd_last variable described in RFC 2582
733 */
734 tp->snd_recover = tp->snd_max;
735 /*
736 * Force a segment to be sent.
737 */
738 tp->t_flags |= TF_ACKNOW;
739 /*
740 * If timing a segment in this window, stop the timer.
741 */
742 tp->t_rtttime = 0;
743
744 if (CC_ALGO(tp)->after_timeout != NULL)
745 CC_ALGO(tp)->after_timeout(tp);
746
747 tp->t_dupacks = 0;
748 EXIT_FASTRECOVERY(tp);
749
750 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
751 struct tcpcb *, tp, struct tcphdr *, NULL,
752 int32_t, TCP_CC_REXMT_TIMEOUT);
753
754 (void) tcp_output(tp);
755 break;
756
757 /*
758 * Persistence timer into zero window.
759 * Force a byte to be output, if possible.
760 */
761 case TCPT_PERSIST:
762 tcpstat.tcps_persisttimeo++;
763 /*
764 * Hack: if the peer is dead/unreachable, we do not
765 * time out if the window is closed. After a full
766 * backoff, drop the connection if the idle time
767 * (no responses to probes) reaches the maximum
768 * backoff that we would use if retransmitting.
769 *
770 * Drop the connection if we reached the maximum allowed time for
771 * Zero Window Probes without a non-zero update from the peer.
772 * See rdar://5805356
773 */
774 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
775 (idle_time >= tcp_maxpersistidle ||
776 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
777 ((tp->t_persist_stop != 0) && (tp->t_persist_stop <= tcp_now))) {
778 tcpstat.tcps_persistdrop++;
779 so_tmp = tp->t_inpcb->inp_socket;
780 tp = tcp_drop(tp, ETIMEDOUT);
781 postevent(so_tmp, 0, EV_TIMEOUT);
782 break;
783 }
784 tcp_setpersist(tp);
785 tp->t_force = 1;
786 (void) tcp_output(tp);
787 tp->t_force = 0;
788 break;
789
790 /*
791 * Keep-alive timer went off; send something
792 * or drop connection if idle for too long.
793 */
794 case TCPT_KEEP:
795 tcpstat.tcps_keeptimeo++;
796 if (tp->t_state < TCPS_ESTABLISHED)
797 goto dropit;
798 if ((always_keepalive ||
799 tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
800 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
801 if (idle_time >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle)
802 goto dropit;
803 /*
804 * Send a packet designed to force a response
805 * if the peer is up and reachable:
806 * either an ACK if the connection is still alive,
807 * or an RST if the peer has closed the connection
808 * due to timeout or reboot.
809 * Using sequence number tp->snd_una-1
810 * causes the transmitted zero-length segment
811 * to lie outside the receive window;
812 * by the protocol spec, this requires the
813 * correspondent TCP to respond.
814 */
815 tcpstat.tcps_keepprobe++;
816 t_template = tcp_maketemplate(tp);
817 if (t_template) {
818 unsigned int ifscope, nocell = 0;
819
820 if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
821 ifscope = tp->t_inpcb->inp_boundif;
822 else
823 ifscope = IFSCOPE_NONE;
824
825 /*
826 * If the socket isn't allowed to use the
827 * cellular interface, indicate it as such.
828 */
829 if (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR)
830 nocell = 1;
831
832 tcp_respond(tp, t_template->tt_ipgen,
833 &t_template->tt_t, (struct mbuf *)NULL,
834 tp->rcv_nxt, tp->snd_una - 1, 0, ifscope,
835 nocell);
836 (void) m_free(dtom(t_template));
837 }
838 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, tcp_keepintvl);
839 } else
840 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
841 break;
842 case TCPT_DELACK:
843 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
844 tp->t_flags &= ~TF_DELACK;
845 tp->t_timer[TCPT_DELACK] = 0;
846 tp->t_flags |= TF_ACKNOW;
847
848 /* If the delayed ack timer fired while we were stretching acks,
849 * go back to acking every other packet
850 */
851 if ((tp->t_flags & TF_STRETCHACK) != 0)
852 tcp_reset_stretch_ack(tp);
853
854 tcpstat.tcps_delack++;
855 (void) tcp_output(tp);
856 }
857 break;
858
859 #if TCPDEBUG
860 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
861 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
862 PRU_SLOWTIMO);
863 #endif
864 dropit:
865 tcpstat.tcps_keepdrops++;
866 tp = tcp_drop(tp, ETIMEDOUT);
867 postevent(so_tmp, 0, EV_TIMEOUT);
868 break;
869 }
870 return (tp);
871 }
872
873 /* Remove a timer entry from timer list */
874 void
875 tcp_remove_timer(struct tcpcb *tp)
876 {
877 struct tcptimerlist *listp = &tcp_timer_list;
878
879 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
880 if (!(TIMER_IS_ON_LIST(tp))) {
881 return;
882 }
883 lck_mtx_lock(listp->mtx);
884
885 /* Check if pcb is on timer list again after acquiring the lock */
886 if (!(TIMER_IS_ON_LIST(tp))) {
887 lck_mtx_unlock(listp->mtx);
888 return;
889 }
890
891 if (listp->next_te != NULL && listp->next_te == &tp->tentry)
892 listp->next_te = LIST_NEXT(&tp->tentry, le);
893
894 LIST_REMOVE(&tp->tentry, le);
895 tp->t_flags &= ~(TF_TIMER_ONLIST);
896
897 listp->entries--;
898
899 tp->tentry.le.le_next = NULL;
900 tp->tentry.le.le_prev = NULL;
901 lck_mtx_unlock(listp->mtx);
902 }
903
904 /* Function to check if the timer list needs to be rescheduled to run
905 * this timer entry on time. Basically, this is to check if we can avoid
906 * taking the list lock.
907 */
908
909 static boolean_t
910 need_to_resched_timerlist(uint32_t runtime, uint16_t index) {
911 struct tcptimerlist *listp = &tcp_timer_list;
912 int32_t diff;
913 boolean_t is_fast;
914
915 if (runtime == 0 || index == TCPT_NONE)
916 return FALSE;
917 is_fast = !(IS_TIMER_SLOW(index));
918
919 /* If the list is being processed then the state of the list is in flux.
920 * In this case always acquire the lock and set the state correctly.
921 */
922 if (listp->running) {
923 return TRUE;
924 }
925
926 diff = timer_diff(listp->runtime, 0, runtime, 0);
927 if (diff <= 0) {
928 /* The list is going to run before this timer */
929 return FALSE;
930 } else {
931 if (is_fast) {
932 if (diff <= listp->fast_quantum)
933 return FALSE;
934 } else {
935 if (diff <= listp->slow_quantum)
936 return FALSE;
937 }
938 }
939 return TRUE;
940 }
941
942 void
943 tcp_sched_timerlist(uint32_t offset)
944 {
945
946 uint64_t deadline = 0;
947 struct tcptimerlist *listp = &tcp_timer_list;
948
949 lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);
950
951 listp->runtime = tcp_now + offset;
952
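/*
 * "offset" is expressed in TCP_RETRANSHZ ticks; NSEC_PER_SEC / TCP_RETRANSHZ
 * is the length of one tick in nanoseconds, so the call below converts the
 * relative offset into an absolute deadline for the delayed thread call.
 */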
953 clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ,
954 &deadline);
955
956 thread_call_enter_delayed(listp->call, deadline);
957 }
958
959 /* Function to run the timers for a connection.
960 *
961 * Returns the offset of the next timer to be run for this connection, which
962 * can be used to reschedule the timer list.
963 */
964 uint32_t
965 tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
966
967 struct socket *so;
968 uint16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
969 uint32_t timer_val, offset = 0, lo_timer = 0;
970 int32_t diff;
971 boolean_t needtorun[TCPT_NTIMERS];
972 int count = 0;
973
974 VERIFY(tp != NULL);
975 bzero(needtorun, sizeof(needtorun));
976
977 tcp_lock(tp->t_inpcb->inp_socket, 1, 0);
978
979 so = tp->t_inpcb->inp_socket;
980 /* Release the want count on inp */
981 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) {
982 if (TIMER_IS_ON_LIST(tp)) {
983 tcp_remove_timer(tp);
984 }
985
986 /* Looks like the TCP connection got closed while we
987 * were waiting for the lock.. Done
988 */
989 goto done;
990 }
991
992 /* Since the timer thread needs to wait for the TCP lock, it may race
993 * with another thread that can cancel or reschedule the timer that is
994 * about to run. Check if we need to run anything.
995 */
996 index = tp->tentry.index;
997 timer_val = tp->t_timer[index];
998
999 if (index == TCPT_NONE || tp->tentry.runtime == 0)
1000 goto done;
1001
1002 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
1003 if (diff > 0) {
1004 if (tp->tentry.index != TCPT_NONE) {
1005 offset = diff;
1006 *(next_index) = tp->tentry.index;
1007 }
1008 goto done;
1009 }
1010
1011 tp->t_timer[index] = 0;
1012 if (timer_val > 0) {
1013 tp = tcp_timers(tp, index);
1014 if (tp == NULL)
1015 goto done;
1016 }
1017
1018 /* Check if there are any other timers that need to be run. While doing it,
1019 * adjust the timer values relative to tcp_now.
1020 */
1021 for (i = 0; i < TCPT_NTIMERS; ++i) {
1022 if (tp->t_timer[i] != 0) {
1023 diff = timer_diff(tp->tentry.timer_start, tp->t_timer[i], tcp_now, 0);
1024 if (diff <= 0) {
1025 tp->t_timer[i] = 0;
1026 needtorun[i] = TRUE;
1027 count++;
1028 } else {
1029 tp->t_timer[i] = diff;
1030 needtorun[i] = FALSE;
1031 if (lo_timer == 0 || diff < lo_timer) {
1032 lo_timer = diff;
1033 lo_index = i;
1034 }
1035 }
1036 }
1037 }
1038
1039 tp->tentry.timer_start = tcp_now;
1040 tp->tentry.index = lo_index;
1041 if (lo_index != TCPT_NONE) {
1042 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
1043 } else {
1044 tp->tentry.runtime = 0;
1045 }
1046
1047 if (count > 0) {
1048 /* run any other timers that are also outstanding at this time. */
1049 for (i = 0; i < TCPT_NTIMERS; ++i) {
1050 if (needtorun[i]) {
1051 tp->t_timer[i] = 0;
1052 tp = tcp_timers(tp, i);
1053 if (tp == NULL)
1054 goto done;
1055 }
1056 }
1057 tcp_set_lotimer_index(tp);
1058 }
1059
1060 if (tp->tentry.index < TCPT_NONE) {
1061 offset = tp->t_timer[tp->tentry.index];
1062 *(next_index) = tp->tentry.index;
1063 }
1064
1065 done:
1066 if (tp != NULL && tp->tentry.index == TCPT_NONE) {
1067 tcp_remove_timer(tp);
1068 }
1069 tcp_unlock(so, 1, 0);
1070 return offset;
1071 }
1072
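/*
 * Timer-list work loop, run as a thread call (scheduled via
 * tcp_sched_timerlist()). Walks every entry on tcp_timer_list, runs the
 * expired timers for each connection through tcp_run_conn_timer(), tracks
 * the nearest future expiry, and reschedules itself in fast or slow mode
 * depending on whether any connection still needs fine-grained service.
 */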
1073 void
1074 tcp_run_timerlist(void * arg1, void * arg2) {
1075
1076 #pragma unused(arg1, arg2)
1077
1078 struct tcptimerentry *te, *next_te;
1079 struct tcptimerlist *listp = &tcp_timer_list;
1080 struct tcpcb *tp;
1081 uint32_t next_timer = 0;
1082 uint16_t index = TCPT_NONE;
1083 boolean_t need_fast = FALSE;
1084 uint32_t active_count = 0;
1085 uint32_t mode = TCP_TIMERLIST_FASTMODE;
1086
1087 calculate_tcp_clock();
1088
1089 lck_mtx_lock(listp->mtx);
1090
1091 listp->running = TRUE;
1092
1093 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
1094 uint32_t offset = 0;
1095 uint32_t runtime = te->runtime;
1096 if (TSTMP_GT(runtime, tcp_now)) {
1097 offset = timer_diff(runtime, 0, tcp_now, 0);
1098 if (next_timer == 0 || offset < next_timer) {
1099 next_timer = offset;
1100 }
1101 continue;
1102 }
1103 active_count++;
1104
1105 tp = TIMERENTRY_TO_TP(te);
1106
1107 /* Acquire an inp wantcnt on the inpcb so that the socket won't get
1108 * detached even if tcp_close is called
1109 */
1110 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
1111 /* Somehow this pcb went into the dead state while on the timer list;
1112 * just take it off the list. Since the timer list entry pointers
1113 * are protected by the timer list lock, we can do it here.
1114 */
1115 if (TIMER_IS_ON_LIST(tp)) {
1116 tp->t_flags &= ~(TF_TIMER_ONLIST);
1117 LIST_REMOVE(&tp->tentry, le);
1118 listp->entries--;
1119
1120 tp->tentry.le.le_next = NULL;
1121 tp->tentry.le.le_prev = NULL;
1122 }
1123 continue;
1124 }
1125
1126 /* Store the next timerentry pointer before releasing the list lock.
1127 * If that entry has to be removed when we release the lock, this
1128 * pointer will be updated to the element after that.
1129 */
1130 listp->next_te = next_te;
1131
1132 VERIFY_NEXT_LINK(&tp->tentry, le);
1133 VERIFY_PREV_LINK(&tp->tentry, le);
1134
1135 lck_mtx_unlock(listp->mtx);
1136
1137 index = TCPT_NONE;
1138 offset = tcp_run_conn_timer(tp, &index);
1139
1140 lck_mtx_lock(listp->mtx);
1141
1142 next_te = listp->next_te;
1143 listp->next_te = NULL;
1144
1145 if (offset > 0) {
1146 if (index < TCPT_NONE) {
1147 /* Check if this is a fast_timer. */
1148 if (!need_fast && !(IS_TIMER_SLOW(index))) {
1149 need_fast = TRUE;
1150 }
1151
1152 if (next_timer == 0 || offset < next_timer) {
1153 next_timer = offset;
1154 }
1155 }
1156 }
1157 }
1158
1159 if (!LIST_EMPTY(&listp->lhead)) {
1160 if (listp->mode == TCP_TIMERLIST_FASTMODE) {
1161 if (need_fast || active_count > 0 ||
1162 listp->pref_mode == TCP_TIMERLIST_FASTMODE) {
1163 listp->idlegen = 0;
1164 } else {
1165 listp->idlegen++;
1166 if (listp->idlegen > timer_fastmode_idlemax) {
1167 mode = TCP_TIMERLIST_SLOWMODE;
1168 listp->idlegen = 0;
1169 }
1170 }
1171 } else {
1172 if (!need_fast) {
1173 mode = TCP_TIMERLIST_SLOWMODE;
1174 }
1175 }
1176
1177 if (mode == TCP_TIMERLIST_FASTMODE ||
1178 listp->pref_mode == TCP_TIMERLIST_FASTMODE) {
1179 next_timer = listp->fast_quantum;
1180 } else {
1181 if (listp->pref_offset != 0 &&
1182 listp->pref_offset < next_timer)
1183 next_timer = listp->pref_offset;
1184 if (next_timer < listp->slow_quantum)
1185 next_timer = listp->slow_quantum;
1186 }
1187
1188 listp->mode = mode;
1189
1190 tcp_sched_timerlist(next_timer);
1191 } else {
1192 /* No need to reschedule this timer */
1193 listp->runtime = 0;
1194 }
1195
1196 listp->running = FALSE;
1197 listp->pref_mode = 0;
1198 listp->pref_offset = 0;
1199
1200 lck_mtx_unlock(listp->mtx);
1201 }
1202
1203 /* Function to verify if a change in timer state is required for a connection */
1204 void
1205 tcp_sched_timers(struct tcpcb *tp)
1206 {
1207 struct tcptimerentry *te = &tp->tentry;
1208 uint16_t index = te->index;
1209 struct tcptimerlist *listp = &tcp_timer_list;
1210 uint32_t offset = 0;
1211 boolean_t is_fast;
1212 int list_locked = 0;
1213
1214 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
1215 /* Just return without adding the dead pcb to the list */
1216 if (TIMER_IS_ON_LIST(tp)) {
1217 tcp_remove_timer(tp);
1218 }
1219 return;
1220 }
1221
1222 if (index == TCPT_NONE) {
1223 tcp_remove_timer(tp);
1224 return;
1225 }
1226
1227 is_fast = !(IS_TIMER_SLOW(index));
1228 offset = te->runtime - tcp_now;
1229 if (offset == 0) {
1230 offset = 1;
1231 tcp_timer_advanced++;
1232 }
1233 if (is_fast)
1234 offset = listp->fast_quantum;
1235
1236 if (!TIMER_IS_ON_LIST(tp)) {
1237 if (!list_locked) {
1238 lck_mtx_lock(listp->mtx);
1239 list_locked = 1;
1240 }
1241
1242 LIST_INSERT_HEAD(&listp->lhead, te, le);
1243 tp->t_flags |= TF_TIMER_ONLIST;
1244
1245 listp->entries++;
1246 if (listp->entries > listp->maxentries)
1247 listp->maxentries = listp->entries;
1248
1249 /* if the list is not scheduled, just schedule it */
1250 if (listp->runtime == 0)
1251 goto schedule;
1252
1253 }
1254
1255
1256 /* timer entry is currently on the list */
1257 if (need_to_resched_timerlist(te->runtime, index)) {
1258 tcp_resched_timerlist++;
1259
1260 if (!list_locked) {
1261 lck_mtx_lock(listp->mtx);
1262 list_locked = 1;
1263 }
1264
1265 VERIFY_NEXT_LINK(te, le);
1266 VERIFY_PREV_LINK(te, le);
1267
1268 if (listp->running) {
1269 if (is_fast) {
1270 listp->pref_mode = TCP_TIMERLIST_FASTMODE;
1271 } else if (listp->pref_offset == 0 ||
1272 ((int)offset) < listp->pref_offset) {
1273 listp->pref_offset = offset;
1274 }
1275 } else {
1276 int32_t diff;
1277 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
1278 if (diff <= 0) {
1279 /* The list is going to run before this timer */
1280 goto done;
1281 } else {
1282 goto schedule;
1283 }
1284 }
1285 }
1286 goto done;
1287
1288 schedule:
1289 if (is_fast) {
1290 listp->mode = TCP_TIMERLIST_FASTMODE;
1291 listp->idlegen = 0;
1292 }
1293 tcp_sched_timerlist(offset);
1294
1295 done:
1296 if (list_locked)
1297 lck_mtx_unlock(listp->mtx);
1298
1299 return;
1300 }
1301
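/* Find the connection's earliest pending timer and record its index and
 * absolute run time in the connection's timer-list entry.
 */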
1302 void
1303 tcp_set_lotimer_index(struct tcpcb *tp) {
1304 uint16_t i, lo_index = TCPT_NONE;
1305 uint32_t lo_timer = 0;
1306 for (i = 0; i < TCPT_NTIMERS; ++i) {
1307 if (tp->t_timer[i] != 0 &&
1308 (lo_timer == 0 || tp->t_timer[i] < lo_timer)) {
1309 lo_timer = tp->t_timer[i];
1310 lo_index = i;
1311 }
1312 }
1313 tp->tentry.index = lo_index;
1314 if (lo_index != TCPT_NONE) {
1315 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
1316 } else {
1317 tp->tentry.runtime = 0;
1318 }
1319 }
1320
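/* Recompute the connection's earliest timer and (re)schedule it on the
 * timer list; called with the inpcb lock held.
 */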
1321 void
1322 tcp_check_timer_state(struct tcpcb *tp) {
1323
1324 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1325
1326 tcp_set_lotimer_index(tp);
1327
1328 tcp_sched_timers(tp);
1329 return;
1330 }