apple/xnu.git: bsd/netinet/tcp_timer.c (xnu-3247.10.11)
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/kernel.h>
68 #include <sys/mbuf.h>
69 #include <sys/sysctl.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/protosw.h>
73 #include <sys/domain.h>
74 #include <sys/mcache.h>
75 #include <sys/queue.h>
76 #include <kern/locks.h>
77 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
78 #include <mach/boolean.h>
79
80 #include <net/route.h>
81 #include <net/if_var.h>
82 #include <net/ntstat.h>
83
84 #include <netinet/in.h>
85 #include <netinet/in_systm.h>
86 #include <netinet/in_pcb.h>
87 #if INET6
88 #include <netinet6/in6_pcb.h>
89 #endif
90 #include <netinet/ip_var.h>
91 #include <netinet/tcp.h>
92 #include <netinet/tcp_cache.h>
93 #include <netinet/tcp_fsm.h>
94 #include <netinet/tcp_seq.h>
95 #include <netinet/tcp_timer.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/tcp_cc.h>
98 #if INET6
99 #include <netinet6/tcp6_var.h>
100 #endif
101 #include <netinet/tcpip.h>
102 #if TCPDEBUG
103 #include <netinet/tcp_debug.h>
104 #endif
105 #include <sys/kdebug.h>
106 #include <mach/sdt.h>
107 #include <netinet/mptcp_var.h>
108
109 #define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))
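/*
 * Descriptive note: TIMERENTRY_TO_TP() is a container-of computation. The
 * timer list links the tcptimerentry embedded in each tcpcb; subtracting the
 * offset of tentry.le.le_next within struct tcpcb from the entry pointer
 * recovers the owning tcpcb. This relies on 'le.le_next' being the first
 * field of the embedded tcptimerentry (as declared in tcp_var.h), so that
 * offset equals offsetof(struct tcpcb, tentry).
 */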
110
111 #define VERIFY_NEXT_LINK(elm,field) do { \
112 if (LIST_NEXT((elm),field) != NULL && \
113 LIST_NEXT((elm),field)->field.le_prev != \
114 &((elm)->field.le_next)) \
115 panic("Bad link elm %p next->prev != elm", (elm)); \
116 } while(0)
117
118 #define VERIFY_PREV_LINK(elm,field) do { \
119 if (*(elm)->field.le_prev != (elm)) \
120 panic("Bad link elm %p prev->next != elm", (elm)); \
121 } while(0)
122
123 #define TCP_SET_TIMER_MODE(mode, i) do { \
124 if (IS_TIMER_HZ_10MS(i)) \
125 (mode) |= TCP_TIMERLIST_10MS_MODE; \
126 else if (IS_TIMER_HZ_100MS(i)) \
127 (mode) |= TCP_TIMERLIST_100MS_MODE; \
128 else \
129 (mode) |= TCP_TIMERLIST_500MS_MODE; \
130 } while(0)
131
132 /* Max number of times a stretch ack can be delayed on a connection */
133 #define TCP_STRETCHACK_DELAY_THRESHOLD 5
134
135 /*
136 * If the host processor has been sleeping for too long, this is the threshold
137 * used to avoid sending stale retransmissions.
138 */
139 #define TCP_SLEEP_TOO_LONG (10 * 60 * 1000) /* 10 minutes in ms */
140
141 /* tcp timer list */
142 struct tcptimerlist tcp_timer_list;
143
144 /* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
145 struct tcptailq tcp_tw_tailq;
146
147 static int
148 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
149 {
150 #pragma unused(arg1, arg2)
151 int error, s, tt;
152
153 tt = *(int *)oidp->oid_arg1;
154 s = tt * 1000 / TCP_RETRANSHZ;
155
156 error = sysctl_handle_int(oidp, &s, 0, req);
157 if (error || !req->newptr)
158 return (error);
159
160 tt = s * TCP_RETRANSHZ / 1000;
161 if (tt < 1)
162 return (EINVAL);
163
164 *(int *)oidp->oid_arg1 = tt;
165 return (0);
166 }
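/*
 * Illustrative example for the handler above: values are stored internally in
 * TCP timer ticks but exported in milliseconds. Assuming TCP_RETRANSHZ is
 * 1000 (one tick per millisecond), writing 750 through sysctl stores
 * 750 * 1000 / 1000 = 750 ticks; if the tick rate were 100 instead, the same
 * write would store 75 ticks and still read back as 750 ms.
 */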
167
168 int tcp_keepinit;
169 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
170 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
171 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
172
173 int tcp_keepidle;
174 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
175 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
176 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
177
178 int tcp_keepintvl;
179 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
180 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
181 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
182
183 int tcp_keepcnt;
184 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
185 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
186 &tcp_keepcnt, 0, "number of times to repeat keepalive");
187
188 int tcp_msl;
189 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
190 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
191 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
192
193 /*
194 * Avoid DoS via TCP Robustness in Persist Condition
195 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
196 * by allowing a system wide maximum persistence timeout value when in
197 * Zero Window Probe mode.
198 *
199 * Expressed in milliseconds to be consistent with other timeout-related
200 * values; the TCP socket option is in seconds.
201 */
202 u_int32_t tcp_max_persist_timeout = 0;
203 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
204 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
205 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
206 "Maximum persistence timeout for ZWP");
207
208 static int always_keepalive = 0;
209 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive,
210 CTLFLAG_RW | CTLFLAG_LOCKED,
211 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
212
213 /*
214 * This parameter determines how long the timer list will stay in fast or
215 * quick mode even when all connections are idle. In this state, the
216 * timer will run more frequently, anticipating new data.
217 */
218 int timer_fastmode_idlemax = TCP_FASTMODE_IDLERUN_MAX;
219 SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax,
220 CTLFLAG_RW | CTLFLAG_LOCKED,
221 &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");
222
223 /*
224 * See tcp_syn_backoff[] for interval values between SYN retransmits;
225 * the value set below defines the number of retransmits before we
226 * disable the timestamp and window scaling options during subsequent
227 * SYN retransmits. Setting it to 0 disables dropping those
228 * two options.
229 */
230 static int tcp_broken_peer_syn_rxmit_thres = 10;
231 SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres,
232 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres, 0,
233 "Number of retransmitted SYNs before disabling RFC 1323 "
234 "options on local connections");
235
236 static int tcp_timer_advanced = 0;
237 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
238 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
239 "Number of times one of the timers was advanced");
240
241 static int tcp_resched_timerlist = 0;
242 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
243 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
244 "Number of times timer list was rescheduled as part of processing a packet");
245
246 int tcp_pmtud_black_hole_detect = 1;
247 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
248 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0,
249 "Path MTU Discovery Black Hole Detection");
250
251 int tcp_pmtud_black_hole_mss = 1200;
252 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
253 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0,
254 "Path MTU Discovery Black Hole Detection lowered MSS");
255
256 #define TCP_REPORT_STATS_INTERVAL 43200 /* 12 hours, in seconds */
257 int tcp_report_stats_interval = TCP_REPORT_STATS_INTERVAL;
258 #if (DEVELOPMENT || DEBUG)
259 SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0,
261 "Report stats interval");
262 #endif /* (DEVELOPMENT || DEBUG) */
263
264 /* performed garbage collection of "used" sockets */
265 static boolean_t tcp_gc_done = FALSE;
266
267 /* max idle probes */
268 int tcp_maxpersistidle;
269
270 /*
271 * The TCP delack timer is set to 100 ms. Since processing of the timer list
272 * in fast mode happens no faster than every 100 ms, the delayed ack timer
273 * will fire somewhere between 100 and 200 ms.
274 */
275 int tcp_delack = TCP_RETRANSHZ / 10;
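/*
 * Worked arithmetic, assuming TCP_RETRANSHZ is 1000 ticks per second:
 * TCP_RETRANSHZ / 10 is 100 ticks, i.e. the 100 ms delayed-ack interval
 * described in the comment above.
 */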
276
277 #if MPTCP
278 /*
279 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
280 */
281 int tcp_jack_rxmt = TCP_RETRANSHZ / 2;
282 #endif /* MPTCP */
283
284 static boolean_t tcp_itimer_done = FALSE;
285
286 static void tcp_remove_timer(struct tcpcb *tp);
287 static void tcp_sched_timerlist(uint32_t offset);
288 static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode,
289 u_int16_t probe_if_index);
290 static void tcp_sched_timers(struct tcpcb *tp);
291 static inline void tcp_set_lotimer_index(struct tcpcb *);
292 __private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
293 __private_extern__ void tcp_report_stats(void);
294
295 /*
296 * Inline function to compare two timers. If a timer value has wrapped
297 * around (its sign bit reset), a plain unsigned comparison would order it
298 * incorrectly. By doing a signed comparison of the difference, we handle
299 * wraparound so that the value with the sign bit reset is treated as ahead.
300 */
301 inline int32_t
302 timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
303 return (int32_t)((t1 + toff1) - (t2 + toff2));
304 };
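/*
 * Worked example of the wraparound handling: with t1 = 0x00000010 and
 * t2 = 0xFFFFFFF0 (both offsets 0), the unsigned subtraction yields 0x20,
 * which is positive as an int32_t, so t1 is treated as 0x20 ticks ahead of
 * t2 even though it is numerically smaller. The comparison stays valid as
 * long as the two timestamps are within about 2^31 ticks of each other.
 */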
305
306 static u_int64_t tcp_last_report_time;
307
308 /*
309 * Structure to store previously reported stats so that we can send
310 * incremental changes in each report interval.
311 */
312 struct tcp_last_report_stats {
313 u_int32_t tcps_connattempt;
314 u_int32_t tcps_accepts;
315 u_int32_t tcps_ecn_client_setup;
316 u_int32_t tcps_ecn_server_setup;
317 u_int32_t tcps_ecn_client_success;
318 u_int32_t tcps_ecn_server_success;
319 u_int32_t tcps_ecn_not_supported;
320 u_int32_t tcps_ecn_lost_syn;
321 u_int32_t tcps_ecn_lost_synack;
322 u_int32_t tcps_ecn_recv_ce;
323 u_int32_t tcps_ecn_recv_ece;
324 u_int32_t tcps_ecn_sent_ece;
325 u_int32_t tcps_ecn_conn_recv_ce;
326 u_int32_t tcps_ecn_conn_recv_ece;
327 u_int32_t tcps_ecn_conn_plnoce;
328 u_int32_t tcps_ecn_conn_pl_ce;
329 u_int32_t tcps_ecn_conn_nopl_ce;
330
331 /* TFO-related statistics */
332 u_int32_t tcps_tfo_syn_data_rcv;
333 u_int32_t tcps_tfo_cookie_req_rcv;
334 u_int32_t tcps_tfo_cookie_sent;
335 u_int32_t tcps_tfo_cookie_invalid;
336 u_int32_t tcps_tfo_cookie_req;
337 u_int32_t tcps_tfo_cookie_rcv;
338 u_int32_t tcps_tfo_syn_data_sent;
339 u_int32_t tcps_tfo_syn_data_acked;
340 u_int32_t tcps_tfo_syn_loss;
341 u_int32_t tcps_tfo_blackhole;
342 };
343
344
345 /* Returns true if the timer is on the timer list */
346 #define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
347
348 /* Run the TCP timerlist at least once every hour */
349 #define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)
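/*
 * 60 * 60 * TCP_RETRANSHZ is one hour expressed in timer ticks; with a
 * 1000 Hz tick that is 3,600,000 ticks. This is the upper bound applied to
 * the offset in tcp_sched_timerlist().
 */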
350
351
352 static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
353 static boolean_t tcp_garbage_collect(struct inpcb *, int);
354
355 /*
356 * Add to tcp timewait list, delay is given in milliseconds.
357 */
358 static void
359 add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
360 {
361 struct inpcbinfo *pcbinfo = &tcbinfo;
362 struct inpcb *inp = tp->t_inpcb;
363 uint32_t timer;
364
365 /* pcb list should be locked when we get here */
366 lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
367
368 /* We may get here multiple times, so check */
369 if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
370 pcbinfo->ipi_twcount++;
371 inp->inp_flags2 |= INP2_TIMEWAIT;
372
373 /* Remove from global inp list */
374 LIST_REMOVE(inp, inp_list);
375 } else {
376 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
377 }
378
379 /* Compute the time at which this socket can be closed */
380 timer = tcp_now + delay;
381
382 /* We will use the TCPT_2MSL timer for tracking this delay */
383
384 if (TIMER_IS_ON_LIST(tp))
385 tcp_remove_timer(tp);
386 tp->t_timer[TCPT_2MSL] = timer;
387
388 TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
389 }
390
391 void
392 add_to_time_wait(struct tcpcb *tp, uint32_t delay)
393 {
394 struct inpcbinfo *pcbinfo = &tcbinfo;
395 if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP)
396 socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);
397
398 /* 19182803: Notify nstat that connection is closing before waiting. */
399 nstat_pcb_detach(tp->t_inpcb);
400
401 if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
402 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
403 lck_rw_lock_exclusive(pcbinfo->ipi_lock);
404 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
405 }
406 add_to_time_wait_locked(tp, delay);
407 lck_rw_done(pcbinfo->ipi_lock);
408
409 inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
410 }
411
412 /* If this is on time wait queue, remove it. */
413 void
414 tcp_remove_from_time_wait(struct inpcb *inp)
415 {
416 struct tcpcb *tp = intotcpcb(inp);
417 if (inp->inp_flags2 & INP2_TIMEWAIT)
418 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
419 }
420
421 static boolean_t
422 tcp_garbage_collect(struct inpcb *inp, int istimewait)
423 {
424 boolean_t active = FALSE;
425 struct socket *so;
426 struct tcpcb *tp;
427
428 so = inp->inp_socket;
429 tp = intotcpcb(inp);
430
431 /*
432 * Skip if still in use or busy; it would have been more efficient
433 * if we were to test so_usecount against 0, but this isn't possible
434 * due to the current implementation of tcp_dropdropablreq() where
435 * overflow sockets that are eligible for garbage collection have
436 * their usecounts set to 1.
437 */
438 if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
439 return (TRUE);
440
441 /* Check again under the lock */
442 if (so->so_usecount > 1) {
443 if (inp->inp_wantcnt == WNT_STOPUSING)
444 active = TRUE;
445 lck_mtx_unlock(&inp->inpcb_mtx);
446 return (active);
447 }
448
449 if (istimewait &&
450 TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
451 tp->t_state != TCPS_CLOSED) {
452 /* Become a regular mutex */
453 lck_mtx_convert_spin(&inp->inpcb_mtx);
454 tcp_close(tp);
455 }
456
457 /*
458 * Overflowed socket dropped from the listening queue? Do this
459 * only if we are called to clean up the time wait slots, since
460 * tcp_dropdropablreq() considers a socket to have been fully
461 * dropped after add_to_time_wait() is finished.
462 * Also handle the case of connections getting closed by the peer
463 * while in the queue as seen with rdar://6422317
464 *
465 */
466 if (so->so_usecount == 1 &&
467 ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
468 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
469 (so->so_head != NULL) &&
470 ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
471 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
472
473 if (inp->inp_state != INPCB_STATE_DEAD) {
474 /* Become a regular mutex */
475 lck_mtx_convert_spin(&inp->inpcb_mtx);
476 #if INET6
477 if (SOCK_CHECK_DOM(so, PF_INET6))
478 in6_pcbdetach(inp);
479 else
480 #endif /* INET6 */
481 in_pcbdetach(inp);
482 }
483 so->so_usecount--;
484 if (inp->inp_wantcnt == WNT_STOPUSING)
485 active = TRUE;
486 lck_mtx_unlock(&inp->inpcb_mtx);
487 return (active);
488 } else if (inp->inp_wantcnt != WNT_STOPUSING) {
489 lck_mtx_unlock(&inp->inpcb_mtx);
490 return (FALSE);
491 }
492
493 /*
494 * We get here because the PCB is no longer searchable
495 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
496 * (usecount is 0). This covers all cases, including overflow
497 * sockets and those that are considered as "embryonic",
498 * i.e. created by sonewconn() in TCP input path, and have
499 * not yet been committed. For the former, we reduce the usecount
500 * to 0 as done by the code above. For the latter, the usecount
501 * would have been reduced to 0 as part of calling soabort() when the
502 * socket is dropped at the end of tcp_input().
503 */
504 if (so->so_usecount == 0) {
505 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
506 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
507 /* Become a regular mutex */
508 lck_mtx_convert_spin(&inp->inpcb_mtx);
509
510 /*
511 * If this tp still happens to be on the timer list,
512 * take it out
513 */
514 if (TIMER_IS_ON_LIST(tp)) {
515 tcp_remove_timer(tp);
516 }
517
518 if (inp->inp_state != INPCB_STATE_DEAD) {
519 #if INET6
520 if (SOCK_CHECK_DOM(so, PF_INET6))
521 in6_pcbdetach(inp);
522 else
523 #endif /* INET6 */
524 in_pcbdetach(inp);
525 }
526 in_pcbdispose(inp);
527 return (FALSE);
528 }
529
530 lck_mtx_unlock(&inp->inpcb_mtx);
531 return (TRUE);
532 }
533
534 /*
535 * TCP garbage collector callback (inpcb_timer_func_t).
536 *
537 * Reports (via the ipi_gc_req counters) the number of pcbs that will
538 * need to be gc-ed soon; a non-zero count keeps the timer active.
539 */
540 void
541 tcp_gc(struct inpcbinfo *ipi)
542 {
543 struct inpcb *inp, *nxt;
544 struct tcpcb *tw_tp, *tw_ntp;
545 #if TCPDEBUG
546 int ostate;
547 #endif
548 #if KDEBUG
549 static int tws_checked = 0;
550 #endif
551
552 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
553
554 /*
555 * Update tcp_now here as it may get used while
556 * processing the slow timer.
557 */
558 calculate_tcp_clock();
559
560 /*
561 * Garbage collect socket/tcpcb: We need to acquire the list lock
562 * exclusively to do this
563 */
564
565 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
566 /* don't sweat it this time; cleanup was done last time */
567 if (tcp_gc_done == TRUE) {
568 tcp_gc_done = FALSE;
569 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
570 tws_checked, cur_tw_slot, 0, 0, 0);
571 /* Lock upgrade failed, give up this round */
572 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
573 return;
574 }
575 /* Upgrade failed, lost lock now take it again exclusive */
576 lck_rw_lock_exclusive(ipi->ipi_lock);
577 }
578 tcp_gc_done = TRUE;
579
580 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
581 if (tcp_garbage_collect(inp, 0))
582 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
583 }
584
585 /* Now cleanup the time wait ones */
586 TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
587 /*
588 * We check the timestamp here without holding the
589 * socket lock for better performance. If there are
590 * any pcbs in time-wait, the timer will get rescheduled.
591 * Hence some error in this check can be tolerated.
592 *
593 * Sometimes a socket on time-wait queue can be closed if
594 * 2MSL timer expired but the application still has a
595 * usecount on it.
596 */
597 if (tw_tp->t_state == TCPS_CLOSED ||
598 TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
599 if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
600 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
601 }
602 }
603
604 /* take into account pcbs that are still in time_wait_slots */
605 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);
606
607 lck_rw_done(ipi->ipi_lock);
608
609 /* Clean up the socache while we are here */
610 if (so_cache_timer())
611 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
612
613 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
614 cur_tw_slot, 0, 0, 0);
615
616 return;
617 }
618
619 /*
620 * Cancel all timers for TCP tp.
621 */
622 void
623 tcp_canceltimers(tp)
624 struct tcpcb *tp;
625 {
626 register int i;
627
628 tcp_remove_timer(tp);
629 for (i = 0; i < TCPT_NTIMERS; i++)
630 tp->t_timer[i] = 0;
631 tp->tentry.timer_start = tcp_now;
632 tp->tentry.index = TCPT_NONE;
633 }
634
635 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
636 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
637
638 int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
639 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
640
641 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
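/*
 * Check of the constant above: the tcp_backoff[] table has 13 entries
 * (TCP_MAXRXTSHIFT == 12), summing to 1 + 2 + 4 + 8 + 16 + 32 + 7 * 64
 * = 63 + 448 = 511.
 */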
642
643 void tcp_rexmt_save_state(struct tcpcb *tp)
644 {
645 u_int32_t fsize;
646 if (TSTMP_SUPPORTED(tp)) {
647 /*
648 * Since timestamps are supported on the connection,
649 * we can do recovery as described in rfc 4015.
650 */
651 fsize = tp->snd_max - tp->snd_una;
652 tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
653 tp->snd_recover_prev = tp->snd_recover;
654 } else {
655 /*
656 * Timestamp option is not supported on this connection.
657 * Record ssthresh and cwnd so they can
658 * be recovered if this turns out to be a "bad" retransmit.
659 * A retransmit is considered "bad" if an ACK for this
660 * segment is received within RTT/2 interval; the assumption
661 * here is that the ACK was already in flight. See
662 * "On Estimating End-to-End Network Path Properties" by
663 * Allman and Paxson for more details.
664 */
665 tp->snd_cwnd_prev = tp->snd_cwnd;
666 tp->snd_ssthresh_prev = tp->snd_ssthresh;
667 tp->snd_recover_prev = tp->snd_recover;
668 if (IN_FASTRECOVERY(tp))
669 tp->t_flags |= TF_WASFRECOVERY;
670 else
671 tp->t_flags &= ~TF_WASFRECOVERY;
672 }
673 tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
674 tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
675 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
676 }
677
678 /*
679 * Revert to the older segment size if there is an indication that PMTU
680 * blackhole detection was not needed.
681 */
682 void tcp_pmtud_revert_segment_size(struct tcpcb *tp)
683 {
684 int32_t optlen;
685
686 VERIFY(tp->t_pmtud_saved_maxopd > 0);
687 tp->t_flags |= TF_PMTUD;
688 tp->t_flags &= ~TF_BLACKHOLE;
689 optlen = tp->t_maxopd - tp->t_maxseg;
690 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
691 tp->t_maxseg = tp->t_maxopd - optlen;
692 /*
693 * Reset the slow-start flight size as it
694 * may depend on the new MSS
695 */
696 if (CC_ALGO(tp)->cwnd_init != NULL)
697 CC_ALGO(tp)->cwnd_init(tp);
698 tp->t_pmtud_start_ts = 0;
699 tcpstat.tcps_pmtudbh_reverted++;
700 }
701
702 /*
703 * TCP timer processing.
704 */
705 struct tcpcb *
706 tcp_timers(tp, timer)
707 register struct tcpcb *tp;
708 int timer;
709 {
710 int32_t rexmt, optlen = 0, idle_time = 0;
711 struct socket *so;
712 struct tcptemp *t_template;
713 #if TCPDEBUG
714 int ostate;
715 #endif
716
717 #if INET6
718 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
719 #endif /* INET6 */
720 u_int64_t accsleep_ms;
721 u_int32_t last_sleep_ms = 0;
722
723 so = tp->t_inpcb->inp_socket;
724 idle_time = tcp_now - tp->t_rcvtime;
725
726 switch (timer) {
727
728 /*
729 * 2 MSL timeout in shutdown went off. If we're closed but
730 * still waiting for peer to close and connection has been idle
731 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
732 * delete connection control block.
733 * Otherwise (this case shouldn't happen) check again in a bit;
734 * we keep the socket in the main list in that case.
735 */
736 case TCPT_2MSL:
737 tcp_free_sackholes(tp);
738 if (tp->t_state != TCPS_TIME_WAIT &&
739 tp->t_state != TCPS_FIN_WAIT_2 &&
740 ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
741 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
742 (u_int32_t)TCP_CONN_KEEPINTVL(tp));
743 } else {
744 tp = tcp_close(tp);
745 return(tp);
746 }
747 break;
748
749 /*
750 * Retransmission timer went off. Message has not
751 * been acked within retransmit interval. Back off
752 * to a longer retransmit interval and retransmit one segment.
753 */
754 case TCPT_REXMT:
755 accsleep_ms = mach_absolutetime_asleep / 1000000UL;
756 if (accsleep_ms > tp->t_accsleep_ms)
757 last_sleep_ms = accsleep_ms - tp->t_accsleep_ms;
758 /*
759 * Drop a connection in the retransmit timer
760 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
761 * times
762 * 2. If the time spent in this retransmission episode is
763 * more than the time limit set with TCP_RXT_CONNDROPTIME
764 * socket option
765 * 3. If TCP_RXT_FINDROP socket option was set and
766 * we have already retransmitted the FIN 3 times without
767 * receiving an ack
768 */
769 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
770 (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
771 (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
772 ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
773 (tp->t_flags & TF_SENTFIN) != 0 && tp->t_rxtshift >= 4) ||
774 (tp->t_rxtshift > 4 && last_sleep_ms >= TCP_SLEEP_TOO_LONG)) {
775 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
776 tcpstat.tcps_rxtfindrop++;
777 } else if (last_sleep_ms >= TCP_SLEEP_TOO_LONG) {
778 tcpstat.tcps_drop_after_sleep++;
779 } else {
780 tcpstat.tcps_timeoutdrop++;
781 }
782 tp->t_rxtshift = TCP_MAXRXTSHIFT;
783 postevent(so, 0, EV_TIMEOUT);
784 soevent(so,
785 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
786 tp = tcp_drop(tp, tp->t_softerror ?
787 tp->t_softerror : ETIMEDOUT);
788
789 break;
790 }
791
792 tcpstat.tcps_rexmttimeo++;
793 tp->t_accsleep_ms = accsleep_ms;
794
795 if (tp->t_rxtshift == 1 &&
796 tp->t_state == TCPS_ESTABLISHED) {
797 /* Set the time at which retransmission started. */
798 tp->t_rxtstart = tcp_now;
799
800 /*
801 * if this is the first retransmit timeout, save
802 * the state so that we can recover if the timeout
803 * is spurious.
804 */
805 tcp_rexmt_save_state(tp);
806 }
807 #if MPTCP
808 if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
809 (tp->t_state == TCPS_ESTABLISHED) &&
810 (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
811 mptcp_act_on_txfail(so);
812
813 }
814 #endif /* MPTCP */
815
816 if (tp->t_adaptive_wtimo > 0 &&
817 tp->t_rxtshift > tp->t_adaptive_wtimo &&
818 TCPS_HAVEESTABLISHED(tp->t_state)) {
819 /* Send an event to the application */
820 soevent(so,
821 (SO_FILT_HINT_LOCKED|
822 SO_FILT_HINT_ADAPTIVE_WTIMO));
823 }
824
825 /*
826 * If this is a retransmit timeout after PTO, the PTO
827 * was not effective
828 */
829 if (tp->t_flagsext & TF_SENT_TLPROBE) {
830 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
831 tcpstat.tcps_rto_after_pto++;
832 }
833
834 if (tp->t_flagsext & TF_DELAY_RECOVERY) {
835 /*
836 * Retransmit timer fired before entering recovery
837 * on a connection with packet re-ordering. This
838 * suggests that the reordering metrics computed
839 * are not accurate.
840 */
841 tp->t_reorderwin = 0;
842 tp->t_timer[TCPT_DELAYFR] = 0;
843 tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
844 }
845
846 if (tp->t_state == TCPS_SYN_RECEIVED)
847 tcp_disable_tfo(tp);
848
849 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
850 !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
851 ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) ||
852 tp->t_rxtshift > 2)) {
853 /*
854 * For regular retransmissions, the first one is sent as a
855 * tail-loss probe.
856 * Thus, if rxtshift > 1, we have sent the segment
857 * a total of 3 times.
858 *
859 * If we are in SYN-SENT state, there is no tail-loss
860 * probe, so we have to let rxtshift go up to 3.
861 */
862 tcp_heuristic_tfo_middlebox(tp);
863
864 so->so_error = ENODATA;
865 sorwakeup(so);
866 sowwakeup(so);
867 }
868
869 if (tp->t_state == TCPS_SYN_SENT) {
870 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
871 tp->t_stat.synrxtshift = tp->t_rxtshift;
872
873 /* When retransmitting, disable TFO */
874 if (tfo_enabled(tp)) {
875 tp->t_flagsext &= ~TF_FASTOPEN;
876 tp->t_tfo_flags |= TFO_F_SYN_LOSS;
877
878 tp->t_tfo_stats |= TFO_S_SYN_LOSS;
879 tcpstat.tcps_tfo_syn_loss++;
880 }
881 } else {
882 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
883 }
884
885 TCPT_RANGESET(tp->t_rxtcur, rexmt,
886 tp->t_rttmin, TCPTV_REXMTMAX,
887 TCP_ADD_REXMTSLOP(tp));
888 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
889
890 if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
891 goto fc_output;
892
893 tcp_free_sackholes(tp);
894 /*
895 * Check for potential Path MTU Discovery Black Hole
896 */
897 if (tcp_pmtud_black_hole_detect &&
898 !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
899 (tp->t_state == TCPS_ESTABLISHED)) {
900 if ((tp->t_flags & TF_PMTUD) &&
901 ((tp->t_flags & TF_MAXSEGSNT)
902 || tp->t_pmtud_lastseg_size > tcp_pmtud_black_hole_mss) &&
903 tp->t_rxtshift == 2) {
904 /*
905 * Enter Path MTU Black-hole Detection mechanism:
906 * - Disable Path MTU Discovery (IP "DF" bit).
907 * - Reduce the MTU to a lower value than what we
908 * negotiated with the peer.
909 */
910 /* Disable Path MTU Discovery for now */
911 tp->t_flags &= ~TF_PMTUD;
912 /* Record that we may have found a black hole */
913 tp->t_flags |= TF_BLACKHOLE;
914 optlen = tp->t_maxopd - tp->t_maxseg;
915 /* Keep track of previous MSS */
916 tp->t_pmtud_saved_maxopd = tp->t_maxopd;
917 tp->t_pmtud_start_ts = tcp_now;
918 if (tp->t_pmtud_start_ts == 0)
919 tp->t_pmtud_start_ts++;
920 /* Reduce the MSS to intermediary value */
921 if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
922 tp->t_maxopd = tcp_pmtud_black_hole_mss;
923 } else {
924 tp->t_maxopd = /* use the default MSS */
925 #if INET6
926 isipv6 ? tcp_v6mssdflt :
927 #endif /* INET6 */
928 tcp_mssdflt;
929 }
930 tp->t_maxseg = tp->t_maxopd - optlen;
931
932 /*
933 * Reset the slow-start flight size
934 * as it may depend on the new MSS
935 */
936 if (CC_ALGO(tp)->cwnd_init != NULL)
937 CC_ALGO(tp)->cwnd_init(tp);
938 }
939 /*
940 * If further retransmissions are still
941 * unsuccessful with a lowered MTU, maybe this
942 * isn't a Black Hole and we restore the previous
943 * MSS and blackhole detection flags.
944 */
945 else {
946
947 if ((tp->t_flags & TF_BLACKHOLE) &&
948 (tp->t_rxtshift > 4)) {
949 tcp_pmtud_revert_segment_size(tp);
950 }
951 }
952 }
953
954
955 /*
956 * Disable rfc1323 and rfc1644 if we haven't got any
957 * response to our SYN (after we reach the threshold)
958 * to work around some broken terminal servers (most of
959 * which have hopefully been retired) that have bad VJ
960 * header compression code which trashes TCP segments
961 * containing unknown-to-them TCP options.
962 * Do this only on non-local connections.
963 */
964 if (tp->t_state == TCPS_SYN_SENT &&
965 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres)
966 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
967
968 /*
969 * If losing, let the lower level know and try for
970 * a better route. Also, if we backed off this far,
971 * our srtt estimate is probably bogus. Clobber it
972 * so we'll take the next rtt measurement as our srtt;
973 * move the current srtt into rttvar to keep the current
974 * retransmit times until then.
975 */
976 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
977 #if INET6
978 if (isipv6)
979 in6_losing(tp->t_inpcb);
980 else
981 #endif /* INET6 */
982 in_losing(tp->t_inpcb);
983 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
984 tp->t_srtt = 0;
985 }
986 tp->snd_nxt = tp->snd_una;
987 /*
988 * Note: We overload snd_recover to function also as the
989 * snd_last variable described in RFC 2582
990 */
991 tp->snd_recover = tp->snd_max;
992 /*
993 * Force a segment to be sent.
994 */
995 tp->t_flags |= TF_ACKNOW;
996
997 /* If timing a segment in this window, stop the timer */
998 tp->t_rtttime = 0;
999
1000 if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1)
1001 tcpstat.tcps_tailloss_rto++;
1002
1003
1004 /*
1005 * RFC 5681 says: when a TCP sender detects segment loss
1006 * using retransmit timer and the given segment has already
1007 * been retransmitted by way of the retransmission timer at
1008 * least once, the value of ssthresh is held constant
1009 */
1010 if (tp->t_rxtshift == 1 &&
1011 CC_ALGO(tp)->after_timeout != NULL) {
1012 CC_ALGO(tp)->after_timeout(tp);
1013 /*
1014 * CWR notifications are to be sent on new data
1015 * right after Fast Retransmits and ECE
1016 * notification receipts.
1017 */
1018 if (TCP_ECN_ENABLED(tp))
1019 tp->ecn_flags |= TE_SENDCWR;
1020 }
1021
1022 EXIT_FASTRECOVERY(tp);
1023
1024 /* Exit cwnd non validated phase */
1025 tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
1026
1027
1028 fc_output:
1029 tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);
1030
1031 (void) tcp_output(tp);
1032 break;
1033
1034 /*
1035 * Persistence timer into zero window.
1036 * Force a byte to be output, if possible.
1037 */
1038 case TCPT_PERSIST:
1039 tcpstat.tcps_persisttimeo++;
1040 /*
1041 * Hack: if the peer is dead/unreachable, we do not
1042 * time out if the window is closed. After a full
1043 * backoff, drop the connection if the idle time
1044 * (no responses to probes) reaches the maximum
1045 * backoff that we would use if retransmitting.
1046 *
1047 * Drop the connection if we reached the maximum allowed time for
1048 * Zero Window Probes without a non-zero update from the peer.
1049 * See rdar://5805356
1050 */
1051 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
1052 (idle_time >= tcp_maxpersistidle ||
1053 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
1054 ((tp->t_persist_stop != 0) &&
1055 TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
1056 tcpstat.tcps_persistdrop++;
1057 postevent(so, 0, EV_TIMEOUT);
1058 soevent(so,
1059 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
1060 tp = tcp_drop(tp, ETIMEDOUT);
1061 break;
1062 }
1063 tcp_setpersist(tp);
1064 tp->t_flagsext |= TF_FORCE;
1065 (void) tcp_output(tp);
1066 tp->t_flagsext &= ~TF_FORCE;
1067 break;
1068
1069 /*
1070 * Keep-alive timer went off; send something
1071 * or drop connection if idle for too long.
1072 */
1073 case TCPT_KEEP:
1074 tcpstat.tcps_keeptimeo++;
1075 #if MPTCP
1076 /*
1077 * Regular TCP connections do not send keepalives after closing.
1078 * MPTCP must not send them either, once Data FINs have been sent.
1079 */
1080 struct mptcb *mp_tp = tp->t_mptcb;
1081 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
1082 (tp->t_state > TCPS_ESTABLISHED)) {
1083 goto dropit;
1084 } else if (mp_tp != NULL) {
1085 if ((mptcp_ok_to_keepalive(mp_tp) == 0))
1086 goto dropit;
1087 }
1088 #endif /* MPTCP */
1089 if (tp->t_state < TCPS_ESTABLISHED)
1090 goto dropit;
1091 if ((always_keepalive ||
1092 (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
1093 (tp->t_flagsext & TF_DETECT_READSTALL) ||
1094 (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) &&
1095 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
1096 if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
1097 goto dropit;
1098 /*
1099 * Send a packet designed to force a response
1100 * if the peer is up and reachable:
1101 * either an ACK if the connection is still alive,
1102 * or an RST if the peer has closed the connection
1103 * due to timeout or reboot.
1104 * Using sequence number tp->snd_una-1
1105 * causes the transmitted zero-length segment
1106 * to lie outside the receive window;
1107 * by the protocol spec, this requires the
1108 * correspondent TCP to respond.
1109 */
1110 tcpstat.tcps_keepprobe++;
1111 t_template = tcp_maketemplate(tp);
1112 if (t_template) {
1113 struct inpcb *inp = tp->t_inpcb;
1114 struct tcp_respond_args tra;
1115
1116 bzero(&tra, sizeof(tra));
1117 tra.nocell = INP_NO_CELLULAR(inp);
1118 tra.noexpensive = INP_NO_EXPENSIVE(inp);
1119 tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
1120 if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
1121 tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
1122 else
1123 tra.ifscope = IFSCOPE_NONE;
1124 tcp_respond(tp, t_template->tt_ipgen,
1125 &t_template->tt_t, (struct mbuf *)NULL,
1126 tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
1127 (void) m_free(dtom(t_template));
1128 if (tp->t_flagsext & TF_DETECT_READSTALL)
1129 tp->t_rtimo_probes++;
1130 }
1131 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1132 TCP_CONN_KEEPINTVL(tp));
1133 } else {
1134 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1135 TCP_CONN_KEEPIDLE(tp));
1136 }
1137 if (tp->t_flagsext & TF_DETECT_READSTALL) {
1138 struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
1139 bool reenable_probe = false;
1140 /*
1141 * The keep alive packets sent to detect a read
1142 * stall did not get a response from the
1143 * peer. Generate more keep-alives to confirm this.
1144 * If the number of probes sent reaches the limit,
1145 * generate an event.
1146 */
1147 if (tp->t_adaptive_rtimo > 0) {
1148 if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
1149 /* Generate an event */
1150 soevent(so,
1151 (SO_FILT_HINT_LOCKED |
1152 SO_FILT_HINT_ADAPTIVE_RTIMO));
1153 tcp_keepalive_reset(tp);
1154 } else {
1155 reenable_probe = true;
1156 }
1157 } else if (outifp != NULL &&
1158 (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY) &&
1159 tp->t_rtimo_probes <= TCP_CONNECTIVITY_PROBES_MAX) {
1160 reenable_probe = true;
1161 } else {
1162 tp->t_flagsext &= ~TF_DETECT_READSTALL;
1163 }
1164 if (reenable_probe) {
1165 int ind = min(tp->t_rtimo_probes,
1166 TCP_MAXRXTSHIFT);
1167 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
1168 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp));
1169 }
1170 }
1171 if (tp->t_tfo_probe_state == TFO_PROBE_PROBING) {
1172 int ind;
1173
1174 tp->t_tfo_probes++;
1175 ind = min(tp->t_tfo_probes, TCP_MAXRXTSHIFT);
1176
1177 /*
1178 * We take the minimum of the time set by true
1179 * keepalive (see above) and the backed-off RTO. That
1180 * way we back off in case of packet loss but will never
1181 * time out more slowly than regular keepalive due to the
1182 * backing off.
1183 */
1184 tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START(
1185 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)),
1186 tp->t_timer[TCPT_KEEP]);
1187 } else if (tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) {
1188 /* Still no data! Let's assume a TFO-error and err out... */
1189 tcp_heuristic_tfo_middlebox(tp);
1190
1191 so->so_error = ENODATA;
1192 sorwakeup(so);
1193 tcpstat.tcps_tfo_blackhole++;
1194 }
1195 break;
1196 case TCPT_DELACK:
1197 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
1198 tp->t_flags &= ~TF_DELACK;
1199 tp->t_timer[TCPT_DELACK] = 0;
1200 tp->t_flags |= TF_ACKNOW;
1201
1202 /*
1203 * If delayed ack timer fired while stretching
1204 * acks, count the number of times the streaming
1205 * detection was not correct. If this exceeds a
1206 * threshold, disable stretch ack on this
1207 * connection.
1208 *
1209 * Also, go back to acking every other packet.
1210 */
1211 if ((tp->t_flags & TF_STRETCHACK)) {
1212 if (tp->t_unacksegs > 1 &&
1213 tp->t_unacksegs < maxseg_unacked)
1214 tp->t_stretchack_delayed++;
1215
1216 if (tp->t_stretchack_delayed >
1217 TCP_STRETCHACK_DELAY_THRESHOLD) {
1218 tp->t_flagsext |= TF_DISABLE_STRETCHACK;
1219 /*
1220 * Note the time at which stretch
1221 * ack was disabled automatically
1222 */
1223 tp->rcv_nostrack_ts = tcp_now;
1224 tcpstat.tcps_nostretchack++;
1225 tp->t_stretchack_delayed = 0;
1226 }
1227 tcp_reset_stretch_ack(tp);
1228 }
1229
1230 /*
1231 * If we are measuring inter packet arrival jitter
1232 * for throttling a connection, this delayed ack
1233 * might be the reason for accumulating some
1234 * jitter. So let's restart the measurement.
1235 */
1236 CLEAR_IAJ_STATE(tp);
1237
1238 tcpstat.tcps_delack++;
1239 (void) tcp_output(tp);
1240 }
1241 break;
1242
1243 #if MPTCP
1244 case TCPT_JACK_RXMT:
1245 if ((tp->t_state == TCPS_ESTABLISHED) &&
1246 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
1247 (tp->t_mpflags & TMPF_JOINED_FLOW)) {
1248 if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
1249 tcpstat.tcps_timeoutdrop++;
1250 postevent(so, 0, EV_TIMEOUT);
1251 soevent(so,
1252 (SO_FILT_HINT_LOCKED|
1253 SO_FILT_HINT_TIMEOUT));
1254 tp = tcp_drop(tp, tp->t_softerror ?
1255 tp->t_softerror : ETIMEDOUT);
1256 break;
1257 }
1258 tcpstat.tcps_join_rxmts++;
1259 tp->t_flags |= TF_ACKNOW;
1260
1261 /*
1262 * No backoff is implemented for simplicity for this
1263 * corner case.
1264 */
1265 (void) tcp_output(tp);
1266 }
1267 break;
1268 #endif /* MPTCP */
1269
1270 case TCPT_PTO:
1271 {
1272 int32_t snd_len;
1273 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1274
1275 /*
1276 * Check if the connection is in the right state to
1277 * send a probe
1278 */
1279 if (tp->t_state != TCPS_ESTABLISHED ||
1280 (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING))
1281 || tp->snd_max == tp->snd_una ||
1282 !SACK_ENABLED(tp) || !TAILQ_EMPTY(&tp->snd_holes) ||
1283 IN_FASTRECOVERY(tp))
1284 break;
1285
1286 /*
1287 * If there is no new data to send or if the
1288 * connection is limited by receive window then
1289 * retransmit the last segment, otherwise send
1290 * new data.
1291 */
1292 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
1293 - (tp->snd_max - tp->snd_una);
1294 if (snd_len > 0) {
1295 tp->snd_nxt = tp->snd_max;
1296 } else {
1297 snd_len = min((tp->snd_max - tp->snd_una),
1298 tp->t_maxseg);
1299 tp->snd_nxt = tp->snd_max - snd_len;
1300 }
1301
1302 tcpstat.tcps_pto++;
1303 if (tp->t_flagsext & TF_PROBING)
1304 tcpstat.tcps_probe_if++;
1305
1306 /* If timing a segment in this window, stop the timer */
1307 tp->t_rtttime = 0;
1308 /* Note that tail loss probe is being sent */
1309 tp->t_flagsext |= TF_SENT_TLPROBE;
1310 tp->t_tlpstart = tcp_now;
1311
1312 tp->snd_cwnd += tp->t_maxseg;
1313 (void )tcp_output(tp);
1314 tp->snd_cwnd -= tp->t_maxseg;
1315
1316 tp->t_tlphighrxt = tp->snd_nxt;
1317 break;
1318 }
1319 case TCPT_DELAYFR:
1320 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
1321
1322 /*
1323 * Don't do anything if one of the following is true:
1324 * - the connection is already in recovery
1325 * - everything up to snd_recover has been acknowledged.
1326 * - retransmit timeout has fired
1327 */
1328 if (IN_FASTRECOVERY(tp) ||
1329 SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
1330 tp->t_rxtshift > 0)
1331 break;
1332
1333 VERIFY(SACK_ENABLED(tp));
1334 tcp_rexmt_save_state(tp);
1335 if (CC_ALGO(tp)->pre_fr != NULL) {
1336 CC_ALGO(tp)->pre_fr(tp);
1337 if (TCP_ECN_ENABLED(tp))
1338 tp->ecn_flags |= TE_SENDCWR;
1339 }
1340 ENTER_FASTRECOVERY(tp);
1341
1342 tp->t_timer[TCPT_REXMT] = 0;
1343 tcpstat.tcps_sack_recovery_episode++;
1344 tp->sack_newdata = tp->snd_nxt;
1345 tp->snd_cwnd = tp->t_maxseg;
1346 tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
1347 (void) tcp_output(tp);
1348 break;
1349 dropit:
1350 tcpstat.tcps_keepdrops++;
1351 postevent(so, 0, EV_TIMEOUT);
1352 soevent(so,
1353 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
1354 tp = tcp_drop(tp, ETIMEDOUT);
1355 break;
1356 }
1357 #if TCPDEBUG
1358 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
1359 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
1360 PRU_SLOWTIMO);
1361 #endif
1362 return (tp);
1363 }
1364
1365 /* Remove a timer entry from timer list */
1366 void
1367 tcp_remove_timer(struct tcpcb *tp)
1368 {
1369 struct tcptimerlist *listp = &tcp_timer_list;
1370
1371 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1372 if (!(TIMER_IS_ON_LIST(tp))) {
1373 return;
1374 }
1375 lck_mtx_lock(listp->mtx);
1376
1377 /* Check if pcb is on timer list again after acquiring the lock */
1378 if (!(TIMER_IS_ON_LIST(tp))) {
1379 lck_mtx_unlock(listp->mtx);
1380 return;
1381 }
1382
1383 if (listp->next_te != NULL && listp->next_te == &tp->tentry)
1384 listp->next_te = LIST_NEXT(&tp->tentry, le);
1385
1386 LIST_REMOVE(&tp->tentry, le);
1387 tp->t_flags &= ~(TF_TIMER_ONLIST);
1388
1389 listp->entries--;
1390
1391 tp->tentry.le.le_next = NULL;
1392 tp->tentry.le.le_prev = NULL;
1393 lck_mtx_unlock(listp->mtx);
1394 }
1395
1396 /*
1397 * Function to check if the timerlist needs to be rescheduled to run
1398 * the timer entry correctly. Basically, this is to check if we can avoid
1399 * taking the list lock.
1400 */
1401
1402 static boolean_t
1403 need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
1404 {
1405 struct tcptimerlist *listp = &tcp_timer_list;
1406 int32_t diff;
1407
1408 /*
1409 * If the list is being processed then the state of the list is
1410 * in flux. In this case always acquire the lock and set the state
1411 * correctly.
1412 */
1413 if (listp->running)
1414 return (TRUE);
1415
1416 if (!listp->scheduled)
1417 return (TRUE);
1418
1419 diff = timer_diff(listp->runtime, 0, runtime, 0);
1420 if (diff <= 0) {
1421 /* The list is going to run before this timer */
1422 return (FALSE);
1423 } else {
1424 if (mode & TCP_TIMERLIST_10MS_MODE) {
1425 if (diff <= TCP_TIMER_10MS_QUANTUM)
1426 return (FALSE);
1427 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
1428 if (diff <= TCP_TIMER_100MS_QUANTUM)
1429 return (FALSE);
1430 } else {
1431 if (diff <= TCP_TIMER_500MS_QUANTUM)
1432 return (FALSE);
1433 }
1434 }
1435 return (TRUE);
1436 }
1437
1438 void
1439 tcp_sched_timerlist(uint32_t offset)
1440 {
1441 uint64_t deadline = 0;
1442 struct tcptimerlist *listp = &tcp_timer_list;
1443
1444 lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);
1445
1446 offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
1447 listp->runtime = tcp_now + offset;
1448 if (listp->runtime == 0) {
1449 listp->runtime++;
1450 offset++;
1451 }
1452
1453 clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);
1454
1455 thread_call_enter_delayed(listp->call, deadline);
1456 listp->scheduled = TRUE;
1457 }
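/*
 * Note on the conversion above, assuming the usual clock_interval_to_deadline()
 * semantics where the second argument is the number of nanoseconds per
 * interval unit: a scale factor of USEC_PER_SEC (1,000,000 ns) makes each
 * unit of 'offset' one millisecond, which matches a TCP_RETRANSHZ of 1000.
 */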
1458
1459 /*
1460 * Function to run the timers for a connection.
1461 *
1462 * Returns the offset of next timer to be run for this connection which
1463 * can be used to reschedule the timerlist.
1464 *
1465 * te_mode is an out parameter that indicates the modes of active
1466 * timers for this connection.
1467 */
1468 u_int32_t
1469 tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode,
1470 u_int16_t probe_if_index)
1471 {
1472 struct socket *so;
1473 u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
1474 u_int32_t timer_val, offset = 0, lo_timer = 0;
1475 int32_t diff;
1476 boolean_t needtorun[TCPT_NTIMERS];
1477 int count = 0;
1478
1479 VERIFY(tp != NULL);
1480 bzero(needtorun, sizeof(needtorun));
1481 *te_mode = 0;
1482
1483 tcp_lock(tp->t_inpcb->inp_socket, 1, 0);
1484
1485 so = tp->t_inpcb->inp_socket;
1486 /* Release the want count on inp */
1487 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
1488 == WNT_STOPUSING) {
1489 if (TIMER_IS_ON_LIST(tp)) {
1490 tcp_remove_timer(tp);
1491 }
1492
1493 /* Looks like the TCP connection got closed while we
1494 * were waiting for the lock. Done.
1495 */
1496 goto done;
1497 }
1498
1499 /*
1500 * If this connection is over an interface that needs to
1501 * be probed, send probe packets to reinitiate communication.
1502 */
1503 if (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL &&
1504 tp->t_inpcb->inp_last_outifp->if_index == probe_if_index) {
1505 tp->t_flagsext |= TF_PROBING;
1506 tcp_timers(tp, TCPT_PTO);
1507 tp->t_timer[TCPT_PTO] = 0;
1508 tp->t_flagsext &= ~TF_PROBING; /* clear the probe flag set above */
1509 }
1510
1511 /*
1512 * Since the timer thread needs to wait for tcp lock, it may race
1513 * with another thread that can cancel or reschedule the timer
1514 * that is about to run. Check if we need to run anything.
1515 */
1516 if ((index = tp->tentry.index) == TCPT_NONE)
1517 goto done;
1518
1519 timer_val = tp->t_timer[index];
1520
1521 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
1522 if (diff > 0) {
1523 if (tp->tentry.index != TCPT_NONE) {
1524 offset = diff;
1525 *(te_mode) = tp->tentry.mode;
1526 }
1527 goto done;
1528 }
1529
1530 tp->t_timer[index] = 0;
1531 if (timer_val > 0) {
1532 tp = tcp_timers(tp, index);
1533 if (tp == NULL)
1534 goto done;
1535 }
1536
1537 /*
1538 * Check if there are any other timers that need to be run.
1539 * While doing it, adjust the timer values wrt tcp_now.
1540 */
1541 tp->tentry.mode = 0;
1542 for (i = 0; i < TCPT_NTIMERS; ++i) {
1543 if (tp->t_timer[i] != 0) {
1544 diff = timer_diff(tp->tentry.timer_start,
1545 tp->t_timer[i], tcp_now, 0);
1546 if (diff <= 0) {
1547 needtorun[i] = TRUE;
1548 count++;
1549 } else {
1550 tp->t_timer[i] = diff;
1551 needtorun[i] = FALSE;
1552 if (lo_timer == 0 || diff < lo_timer) {
1553 lo_timer = diff;
1554 lo_index = i;
1555 }
1556 TCP_SET_TIMER_MODE(tp->tentry.mode, i);
1557 }
1558 }
1559 }
1560
1561 tp->tentry.timer_start = tcp_now;
1562 tp->tentry.index = lo_index;
1563 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
1564
1565 if (tp->tentry.index != TCPT_NONE) {
1566 tp->tentry.runtime = tp->tentry.timer_start +
1567 tp->t_timer[tp->tentry.index];
1568 if (tp->tentry.runtime == 0)
1569 tp->tentry.runtime++;
1570 }
1571
1572 if (count > 0) {
1573 /* run any other timers outstanding at this time. */
1574 for (i = 0; i < TCPT_NTIMERS; ++i) {
1575 if (needtorun[i]) {
1576 tp->t_timer[i] = 0;
1577 tp = tcp_timers(tp, i);
1578 if (tp == NULL) {
1579 offset = 0;
1580 *(te_mode) = 0;
1581 goto done;
1582 }
1583 }
1584 }
1585 tcp_set_lotimer_index(tp);
1586 }
1587
1588 if (tp->tentry.index < TCPT_NONE) {
1589 offset = tp->t_timer[tp->tentry.index];
1590 *(te_mode) = tp->tentry.mode;
1591 }
1592
1593 done:
1594 if (tp != NULL && tp->tentry.index == TCPT_NONE) {
1595 tcp_remove_timer(tp);
1596 offset = 0;
1597 }
1598
1599 tcp_unlock(so, 1, 0);
1600 return(offset);
1601 }
1602
1603 void
1604 tcp_run_timerlist(void * arg1, void * arg2) {
1605 #pragma unused(arg1, arg2)
1606 struct tcptimerentry *te, *next_te;
1607 struct tcptimerlist *listp = &tcp_timer_list;
1608 struct tcpcb *tp;
1609 uint32_t next_timer = 0; /* offset of the next timer on the list */
1610 u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */
1611 u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */
1612 uint32_t active_count = 0;
1613
1614 calculate_tcp_clock();
1615
1616 lck_mtx_lock(listp->mtx);
1617
1618 listp->running = TRUE;
1619
1620 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
1621 uint32_t offset = 0;
1622 uint32_t runtime = te->runtime;
1623 if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
1624 offset = timer_diff(runtime, 0, tcp_now, 0);
1625 if (next_timer == 0 || offset < next_timer) {
1626 next_timer = offset;
1627 }
1628 list_mode |= te->mode;
1629 continue;
1630 }
1631
1632 tp = TIMERENTRY_TO_TP(te);
1633
1634 /*
1635 * Acquire an inp wantcnt on the inpcb so that the socket
1636 * won't get detached even if tcp_close is called
1637 */
1638 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
1639 == WNT_STOPUSING) {
1640 /*
1641 * Somehow this pcb went into dead state while
1642 * on the timer list, just take it off the list.
1643 * Since the timer list entry pointers are
1644 * protected by the timer list lock, we can
1645 * do it here without the socket lock.
1646 */
1647 if (TIMER_IS_ON_LIST(tp)) {
1648 tp->t_flags &= ~(TF_TIMER_ONLIST);
1649 LIST_REMOVE(&tp->tentry, le);
1650 listp->entries--;
1651
1652 tp->tentry.le.le_next = NULL;
1653 tp->tentry.le.le_prev = NULL;
1654 }
1655 continue;
1656 }
1657 active_count++;
1658
1659 /*
1660 * Store the next timerentry pointer before releasing the
1661 * list lock. If that entry has to be removed when we
1662 * release the lock, this pointer will be updated to the
1663 * element after that.
1664 */
1665 listp->next_te = next_te;
1666
1667 VERIFY_NEXT_LINK(&tp->tentry, le);
1668 VERIFY_PREV_LINK(&tp->tentry, le);
1669
1670 lck_mtx_unlock(listp->mtx);
1671
1672 offset = tcp_run_conn_timer(tp, &te_mode,
1673 listp->probe_if_index);
1674
1675 lck_mtx_lock(listp->mtx);
1676
1677 next_te = listp->next_te;
1678 listp->next_te = NULL;
1679
1680 if (offset > 0 && te_mode != 0) {
1681 list_mode |= te_mode;
1682
1683 if (next_timer == 0 || offset < next_timer)
1684 next_timer = offset;
1685 }
1686 }
1687
1688 if (!LIST_EMPTY(&listp->lhead)) {
1689 u_int16_t next_mode = 0;
1690 if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
1691 (listp->pref_mode & TCP_TIMERLIST_10MS_MODE))
1692 next_mode = TCP_TIMERLIST_10MS_MODE;
1693 else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
1694 (listp->pref_mode & TCP_TIMERLIST_100MS_MODE))
1695 next_mode = TCP_TIMERLIST_100MS_MODE;
1696 else
1697 next_mode = TCP_TIMERLIST_500MS_MODE;
1698
1699 if (next_mode != TCP_TIMERLIST_500MS_MODE) {
1700 listp->idleruns = 0;
1701 } else {
1702 /*
1703 * the next required mode is slow mode, but if
1704 * the last one was a faster mode and we did not
1705 * have enough idle runs, repeat the last mode.
1706 *
1707 * We try to keep the timer list in fast mode for
1708 * some idle time in expectation of new data.
1709 */
1710 if (listp->mode != next_mode &&
1711 listp->idleruns < timer_fastmode_idlemax) {
1712 listp->idleruns++;
1713 next_mode = listp->mode;
1714 next_timer = TCP_TIMER_100MS_QUANTUM;
1715 } else {
1716 listp->idleruns = 0;
1717 }
1718 }
1719 listp->mode = next_mode;
1720 if (listp->pref_offset != 0)
1721 next_timer = min(listp->pref_offset, next_timer);
1722
1723 if (listp->mode == TCP_TIMERLIST_500MS_MODE)
1724 next_timer = max(next_timer,
1725 TCP_TIMER_500MS_QUANTUM);
1726
1727 tcp_sched_timerlist(next_timer);
1728 } else {
1729 /*
1730 * No need to reschedule this timer now, but always run
1731 * periodically, at a much coarser granularity.
1732 */
1733 tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
1734 }
1735
1736 listp->running = FALSE;
1737 listp->pref_mode = 0;
1738 listp->pref_offset = 0;
1739 listp->probe_if_index = 0;
1740
1741 lck_mtx_unlock(listp->mtx);
1742 }
1743
1744 /*
1745 * Function to check if the timerlist needs to be rescheduled to run this
1746 * connection's timers correctly.
1747 */
1748 void
1749 tcp_sched_timers(struct tcpcb *tp)
1750 {
1751 struct tcptimerentry *te = &tp->tentry;
1752 u_int16_t index = te->index;
1753 u_int16_t mode = te->mode;
1754 struct tcptimerlist *listp = &tcp_timer_list;
1755 int32_t offset = 0;
1756 boolean_t list_locked = FALSE;
1757
1758 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
1759 /* Just return without adding the dead pcb to the list */
1760 if (TIMER_IS_ON_LIST(tp)) {
1761 tcp_remove_timer(tp);
1762 }
1763 return;
1764 }
1765
1766 if (index == TCPT_NONE) {
1767 /* Nothing to run */
1768 tcp_remove_timer(tp);
1769 return;
1770 }
1771
1772 /*
1773 * compute the offset at which the next timer for this connection
1774 * has to run.
1775 */
1776 offset = timer_diff(te->runtime, 0, tcp_now, 0);
1777 if (offset <= 0) {
1778 offset = 1;
1779 tcp_timer_advanced++;
1780 }
1781
1782 if (!TIMER_IS_ON_LIST(tp)) {
1783 if (!list_locked) {
1784 lck_mtx_lock(listp->mtx);
1785 list_locked = TRUE;
1786 }
1787
1788 LIST_INSERT_HEAD(&listp->lhead, te, le);
1789 tp->t_flags |= TF_TIMER_ONLIST;
1790
1791 listp->entries++;
1792 if (listp->entries > listp->maxentries)
1793 listp->maxentries = listp->entries;
1794
1795 /* if the list is not scheduled, just schedule it */
1796 if (!listp->scheduled)
1797 goto schedule;
1798 }
1799
1800
1801 /*
1802 * Timer entry is currently on the list, check if the list needs
1803 * to be rescheduled.
1804 */
1805 if (need_to_resched_timerlist(te->runtime, mode)) {
1806 tcp_resched_timerlist++;
1807
1808 if (!list_locked) {
1809 lck_mtx_lock(listp->mtx);
1810 list_locked = TRUE;
1811 }
1812
1813 VERIFY_NEXT_LINK(te, le);
1814 VERIFY_PREV_LINK(te, le);
1815
1816 if (listp->running) {
1817 listp->pref_mode |= mode;
1818 if (listp->pref_offset == 0 ||
1819 offset < listp->pref_offset) {
1820 listp->pref_offset = offset;
1821 }
1822 } else {
1823 /*
1824 * The list could have been rescheduled while
1825 * this thread was waiting for the lock.
1826 */
1827 if (listp->scheduled) {
1828 int32_t diff;
1829 diff = timer_diff(listp->runtime, 0,
1830 tcp_now, offset);
1831 if (diff <= 0)
1832 goto done;
1833 else
1834 goto schedule;
1835 } else {
1836 goto schedule;
1837 }
1838 }
1839 }
1840 goto done;
1841
1842 schedule:
1843 /*
1844 * Since a connection with timers is getting scheduled, the timer
1845 * list moves from idle to active state, which is why idleruns is
1846 * reset.
1847 */
1848 if (mode & TCP_TIMERLIST_10MS_MODE) {
1849 listp->mode = TCP_TIMERLIST_10MS_MODE;
1850 listp->idleruns = 0;
1851 offset = min(offset, TCP_TIMER_10MS_QUANTUM);
1852 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
1853 if (listp->mode > TCP_TIMERLIST_100MS_MODE)
1854 listp->mode = TCP_TIMERLIST_100MS_MODE;
1855 listp->idleruns = 0;
1856 offset = min(offset, TCP_TIMER_100MS_QUANTUM);
1857 }
1858 tcp_sched_timerlist(offset);
1859
1860 done:
1861 if (list_locked)
1862 lck_mtx_unlock(listp->mtx);
1863
1864 return;
1865 }
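
/*
 * A note on the timer arithmetic used above (a hedged sketch; timer_diff()
 * is defined earlier in this file as a signed 32-bit difference of the two
 * timestamp/offset pairs): tcp_now is an unsigned 32-bit tick counter, so
 * ordering is decided on the signed difference to stay correct across
 * wrap-around.  For example, with tcp_now = 0xFFFFFFF0 and a runtime of
 * 0x00000010, timer_diff(runtime, 0, tcp_now, 0) is (int32_t)0x20 = 32 ticks
 * in the future, even though runtime < tcp_now as an unsigned compare.
 */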
1866
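/*
 * Pick the connection's earliest pending timer: scan t_timer[], record the
 * index of the smallest non-zero value in tentry.index, accumulate the
 * granularity modes of all armed timers in tentry.mode, and compute the
 * absolute runtime for the chosen timer (bumped by one tick if it would
 * land exactly on zero, presumably because zero reads as "not set").
 */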
1867 static inline void
1868 tcp_set_lotimer_index(struct tcpcb *tp)
1869 {
1870 uint16_t i, lo_index = TCPT_NONE, mode = 0;
1871 uint32_t lo_timer = 0;
1872 for (i = 0; i < TCPT_NTIMERS; ++i) {
1873 if (tp->t_timer[i] != 0) {
1874 TCP_SET_TIMER_MODE(mode, i);
1875 if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
1876 lo_timer = tp->t_timer[i];
1877 lo_index = i;
1878 }
1879 }
1880 }
1881 tp->tentry.index = lo_index;
1882 tp->tentry.mode = mode;
1883 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
1884
1885 if (tp->tentry.index != TCPT_NONE) {
1886 tp->tentry.runtime = tp->tentry.timer_start
1887 + tp->t_timer[tp->tentry.index];
1888 if (tp->tentry.runtime == 0)
1889 tp->tentry.runtime++;
1890 }
1891 }
1892
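/*
 * Recompute the lowest pending timer for this connection and make sure the
 * global timer list is scheduled to honor it.  The inpcb lock must be held;
 * sockets in TIME_WAIT return early and are not scheduled from here.
 */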
1893 void
1894 tcp_check_timer_state(struct tcpcb *tp)
1895 {
1896 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1897
1898 if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
1899 return;
1900
1901 tcp_set_lotimer_index(tp);
1902
1903 tcp_sched_timers(tp);
1904 return;
1905 }
1906
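/*
 * Store in *dest how much a monotonically increasing counter has grown since
 * the previous report, and remember the current value in *prev for the next
 * report.  The subtraction is done in unsigned arithmetic and then inspected
 * as a signed value, so a counter that wrapped does not produce a bogus
 * delta; a negative result is clamped to zero.
 */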
1907 static inline void
1908 tcp_cumulative_stat(u_int32_t cur, u_int32_t *prev, u_int32_t *dest)
1909 {
1910 /* handle wrap around */
1911 int32_t diff = (int32_t) (cur - *prev);
1912 if (diff > 0)
1913 *dest = diff;
1914 else
1915 *dest = 0;
1916 *prev = cur;
1917 return;
1918 }
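
/*
 * Example: with *prev == 0xFFFFFFF0 and cur == 0x00000010 the counter has
 * wrapped, but (cur - *prev) is 0x20 in unsigned arithmetic, so *dest becomes
 * 32 rather than a negative or enormous value.
 */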
1919
1920 __private_extern__ void
1921 tcp_report_stats(void)
1922 {
1923 struct nstat_sysinfo_data data;
1924 struct sockaddr_in dst;
1925 struct sockaddr_in6 dst6;
1926 struct rtentry *rt = NULL;
1927 static struct tcp_last_report_stats prev;
1928 u_int64_t var, uptime;
1929
1930 #define stat data.u.tcp_stats
1931 if (((uptime = net_uptime()) - tcp_last_report_time) <
1932 tcp_report_stats_interval)
1933 return;
1934
1935 tcp_last_report_time = uptime;
1936
1937 bzero(&data, sizeof(data));
1938 data.flags = NSTAT_SYSINFO_TCP_STATS;
1939
1940 bzero(&dst, sizeof(dst));
1941 dst.sin_len = sizeof(dst);
1942 dst.sin_family = AF_INET;
1943
1944 /* ipv4 avg rtt */
1945 lck_mtx_lock(rnh_lock);
1946 rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
1947 rt_tables[AF_INET], IFSCOPE_NONE);
1948 lck_mtx_unlock(rnh_lock);
1949 if (rt != NULL) {
1950 RT_LOCK(rt);
1951 if (rt_primary_default(rt, rt_key(rt)) &&
1952 rt->rt_stats != NULL) {
1953 stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
1954 }
1955 RT_UNLOCK(rt);
1956 rtfree(rt);
1957 rt = NULL;
1958 }
1959
1960 /* ipv6 avg rtt */
1961 bzero(&dst6, sizeof(dst6));
1962 dst6.sin6_len = sizeof(dst6);
1963 dst6.sin6_family = AF_INET6;
1964
1965 lck_mtx_lock(rnh_lock);
1966 rt = rt_lookup(TRUE, (struct sockaddr *)&dst6, NULL,
1967 rt_tables[AF_INET6], IFSCOPE_NONE);
1968 lck_mtx_unlock(rnh_lock);
1969 if (rt != NULL) {
1970 RT_LOCK(rt);
1971 if (rt_primary_default(rt, rt_key(rt)) &&
1972 rt->rt_stats != NULL) {
1973 stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
1974 }
1975 RT_UNLOCK(rt);
1976 rtfree(rt);
1977 rt = NULL;
1978 }
1979
1980 /* send packet loss rate, shift by 10 for precision */
1981 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
1982 var = tcpstat.tcps_sndrexmitpack << 10;
1983 stat.send_plr = (var * 100) / tcpstat.tcps_sndpack;
1984 }
1985
1986 /* recv packet loss rate, shift by 10 for precision */
1987 if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
1988 var = tcpstat.tcps_recovered_pkts << 10;
1989 stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack;
1990 }
1991
1992 /* RTO after tail loss, shift by 10 for precision */
1993 if (tcpstat.tcps_sndrexmitpack > 0
1994 && tcpstat.tcps_tailloss_rto > 0) {
1995 var = tcpstat.tcps_tailloss_rto << 10;
1996 stat.send_tlrto_rate =
1997 (var * 100) / tcpstat.tcps_sndrexmitpack;
1998 }
1999
2000 /* packet reordering */
2001 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
2002 var = tcpstat.tcps_reordered_pkts << 10;
2003 stat.send_reorder_rate =
2004 (var * 100) / tcpstat.tcps_sndpack;
2005 }
2006
2007 if (tcp_ecn_outbound == 1)
2008 stat.ecn_client_enabled = 1;
2009 if (tcp_ecn_inbound == 1)
2010 stat.ecn_server_enabled = 1;
2011 tcp_cumulative_stat(tcpstat.tcps_connattempt,
2012 &prev.tcps_connattempt, &stat.connection_attempts);
2013 tcp_cumulative_stat(tcpstat.tcps_accepts,
2014 &prev.tcps_accepts, &stat.connection_accepts);
2015 tcp_cumulative_stat(tcpstat.tcps_ecn_client_setup,
2016 &prev.tcps_ecn_client_setup, &stat.ecn_client_setup);
2017 tcp_cumulative_stat(tcpstat.tcps_ecn_server_setup,
2018 &prev.tcps_ecn_server_setup, &stat.ecn_server_setup);
2019 tcp_cumulative_stat(tcpstat.tcps_ecn_client_success,
2020 &prev.tcps_ecn_client_success, &stat.ecn_client_success);
2021 tcp_cumulative_stat(tcpstat.tcps_ecn_server_success,
2022 &prev.tcps_ecn_server_success, &stat.ecn_server_success);
2023 tcp_cumulative_stat(tcpstat.tcps_ecn_not_supported,
2024 &prev.tcps_ecn_not_supported, &stat.ecn_not_supported);
2025 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_syn,
2026 &prev.tcps_ecn_lost_syn, &stat.ecn_lost_syn);
2027 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_synack,
2028 &prev.tcps_ecn_lost_synack, &stat.ecn_lost_synack);
2029 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ce,
2030 &prev.tcps_ecn_recv_ce, &stat.ecn_recv_ce);
2031 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ece,
2032 &prev.tcps_ecn_recv_ece, &stat.ecn_recv_ece);
2035 tcp_cumulative_stat(tcpstat.tcps_ecn_sent_ece,
2036 &prev.tcps_ecn_sent_ece, &stat.ecn_sent_ece);
2039 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ce,
2040 &prev.tcps_ecn_conn_recv_ce, &stat.ecn_conn_recv_ce);
2041 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ece,
2042 &prev.tcps_ecn_conn_recv_ece, &stat.ecn_conn_recv_ece);
2043 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_plnoce,
2044 &prev.tcps_ecn_conn_plnoce, &stat.ecn_conn_plnoce);
2045 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_pl_ce,
2046 &prev.tcps_ecn_conn_pl_ce, &stat.ecn_conn_pl_ce);
2047 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_nopl_ce,
2048 &prev.tcps_ecn_conn_nopl_ce, &stat.ecn_conn_nopl_ce);
2049 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_rcv,
2050 &prev.tcps_tfo_syn_data_rcv, &stat.tfo_syn_data_rcv);
2051 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req_rcv,
2052 &prev.tcps_tfo_cookie_req_rcv, &stat.tfo_cookie_req_rcv);
2053 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_sent,
2054 &prev.tcps_tfo_cookie_sent, &stat.tfo_cookie_sent);
2055 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_invalid,
2056 &prev.tcps_tfo_cookie_invalid, &stat.tfo_cookie_invalid);
2057 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req,
2058 &prev.tcps_tfo_cookie_req, &stat.tfo_cookie_req);
2059 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_rcv,
2060 &prev.tcps_tfo_cookie_rcv, &stat.tfo_cookie_rcv);
2061 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_sent,
2062 &prev.tcps_tfo_syn_data_sent, &stat.tfo_syn_data_sent);
2063 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_acked,
2064 &prev.tcps_tfo_syn_data_acked, &stat.tfo_syn_data_acked);
2065 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_loss,
2066 &prev.tcps_tfo_syn_loss, &stat.tfo_syn_loss);
2067 tcp_cumulative_stat(tcpstat.tcps_tfo_blackhole,
2068 &prev.tcps_tfo_blackhole, &stat.tfo_blackhole);
2069
2070 nstat_sysinfo_send_data(&data);
2071
2072 #undef stat
2073 }
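
/*
 * The loss/reorder rates above are fixed point: the raw count is shifted
 * left by 10 (scaled by 1024) before taking the percentage, so the stored
 * value is the percentage scaled by 1024.  For example, 1,000 retransmits
 * out of 100,000 sent packets gives send_plr = (1000 << 10) * 100 / 100000
 * = 1024, i.e. 1.0% after dividing by 1024.
 */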
2074
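/*
 * Ask the timer list to run within the next 10ms so that read probes for
 * connections on the given interface go out promptly.  Only one interface
 * probe can be pending at a time; a second request while probe_if_index is
 * still set is counted in tcps_probe_if_conflict and dropped.
 */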
2075 void
2076 tcp_interface_send_probe(u_int16_t probe_if_index)
2077 {
2078 int32_t offset = 0;
2079 struct tcptimerlist *listp = &tcp_timer_list;
2080
2081 /* Make sure TCP clock is up to date */
2082 calculate_tcp_clock();
2083
2084 lck_mtx_lock(listp->mtx);
2085 if (listp->probe_if_index > 0) {
2086 tcpstat.tcps_probe_if_conflict++;
2087 goto done;
2088 }
2089
2090 listp->probe_if_index = probe_if_index;
2091 if (listp->running)
2092 goto done;
2093
2094 /*
2095 * Reschedule the timerlist to run within the next 10ms, which is
2096 * the fastest that we can do.
2097 */
2098 offset = TCP_TIMER_10MS_QUANTUM;
2099 if (listp->scheduled) {
2100 int32_t diff;
2101 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2102 if (diff <= 0) {
2103 /* The timer will fire sooner than needed */
2104 goto done;
2105 }
2106 }
2107 listp->mode = TCP_TIMERLIST_10MS_MODE;
2108 listp->idleruns = 0;
2109
2110 tcp_sched_timerlist(offset);
2111
2112 done:
2113 lck_mtx_unlock(listp->mtx);
2114 return;
2115 }
2116
2117 /*
2118 * Enable read probes on this connection, if:
2119 * - it is in the established state
2120 * - it has no outstanding data
2121 * - its outgoing interface matches the given ifp
2122 * - read probes have not already been started
2123 */
2124 static void
2125 tcp_enable_read_probe(struct tcpcb *tp, struct ifnet *ifp)
2126 {
2127 if (tp->t_state == TCPS_ESTABLISHED &&
2128 tp->snd_max == tp->snd_una &&
2129 tp->t_inpcb->inp_last_outifp == ifp &&
2130 !(tp->t_flagsext & TF_DETECT_READSTALL) &&
2131 tp->t_rtimo_probes == 0) {
2132 tp->t_flagsext |= TF_DETECT_READSTALL;
2133 tp->t_rtimo_probes = 0;
2134 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2135 TCP_TIMER_10MS_QUANTUM);
2136 if (tp->tentry.index == TCPT_NONE) {
2137 tp->tentry.index = TCPT_KEEP;
2138 tp->tentry.runtime = tcp_now +
2139 TCP_TIMER_10MS_QUANTUM;
2140 } else {
2141 int32_t diff = 0;
2142
2143 /* Reset runtime to be in next 10ms */
2144 diff = timer_diff(tp->tentry.runtime, 0,
2145 tcp_now, TCP_TIMER_10MS_QUANTUM);
2146 if (diff > 0) {
2147 tp->tentry.index = TCPT_KEEP;
2148 tp->tentry.runtime = tcp_now +
2149 TCP_TIMER_10MS_QUANTUM;
2150 if (tp->tentry.runtime == 0)
2151 tp->tentry.runtime++;
2152 }
2153 }
2154 }
2155 }
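
/*
 * With TF_DETECT_READSTALL set and TCPT_KEEP armed to fire within 10ms, the
 * keepalive path takes over sending the actual read probes (counted in
 * t_rtimo_probes); tcp_disable_read_probe() below resets that state once
 * probing is no longer wanted.
 */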
2156
2157 /*
2158 * Disable read probes and reset the keepalive timer.
2159 */
2160 static void
2161 tcp_disable_read_probe(struct tcpcb *tp)
2162 {
2163 if (tp->t_adaptive_rtimo == 0 &&
2164 ((tp->t_flagsext & TF_DETECT_READSTALL) ||
2165 tp->t_rtimo_probes > 0)) {
2166 tcp_keepalive_reset(tp);
2167 }
2168 }
2169
2170 /*
2171 * Enable or disable read probes on connections going over a particular
2172 * interface, and reschedule the tcp timerlist to run within the next 10ms.
2173 */
2174 void
2175 tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable)
2176 {
2177 int32_t offset;
2178 struct tcptimerlist *listp = &tcp_timer_list;
2179 struct inpcbinfo *pcbinfo = &tcbinfo;
2180 struct inpcb *inp, *nxt;
2181
2182 if (ifp == NULL)
2183 return;
2184
2185 /* update clock */
2186 calculate_tcp_clock();
2187
2188 /*
2189 * Enable or disable the read-probe keepalive timer on all connections
2190 * that are established on this interface.
2191 */
2192 lck_rw_lock_shared(pcbinfo->ipi_lock);
2193
2194 LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, nxt) {
2195 struct tcpcb *tp = NULL;
2196 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) ==
2197 WNT_STOPUSING)
2198 continue;
2199
2200 /* Acquire lock to look at the state of the connection */
2201 tcp_lock(inp->inp_socket, 1, 0);
2202
2203 /* Release the want count */
2204 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2205 tcp_unlock(inp->inp_socket, 1, 0);
2206 continue;
2207 }
2208
2209 tp = intotcpcb(inp);
2210 if (enable)
2211 tcp_enable_read_probe(tp, ifp);
2212 else
2213 tcp_disable_read_probe(tp);
2214
2215 tcp_unlock(inp->inp_socket, 1, 0);
2216 }
2217 lck_rw_done(pcbinfo->ipi_lock);
2218
2219 lck_mtx_lock(listp->mtx);
2220 if (listp->running) {
2221 listp->pref_mode |= TCP_TIMERLIST_10MS_MODE;
2222 goto done;
2223 }
2224
2225 /* Reschedule within the next 10ms */
2226 offset = TCP_TIMER_10MS_QUANTUM;
2227 if (listp->scheduled) {
2228 int32_t diff;
2229 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2230 if (diff <= 0) {
2231 /* The timer will fire sooner than needed */
2232 goto done;
2233 }
2234 }
2235 listp->mode = TCP_TIMERLIST_10MS_MODE;
2236 listp->idleruns = 0;
2237
2238 tcp_sched_timerlist(offset);
2239 done:
2240 lck_mtx_unlock(listp->mtx);
2241 return;
2242 }
2243
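/*
 * Periodic timer for the TCP pcbinfo: walk all TCP sockets and check their
 * extended background idle time.  If the exclusive pcbinfo lock cannot be
 * taken right away, defer once by requesting a fast re-run of the timer; on
 * the next contended attempt, block until the lock is available.
 */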
2244 void
2245 tcp_itimer(struct inpcbinfo *ipi)
2246 {
2247 struct inpcb *inp, *nxt;
2248
2249 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
2250 if (tcp_itimer_done == TRUE) {
2251 tcp_itimer_done = FALSE;
2252 atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
2253 return;
2254 }
2255 /* Could not get the lock right away; block and take it exclusively */
2256 lck_rw_lock_exclusive(ipi->ipi_lock);
2257 }
2258 tcp_itimer_done = TRUE;
2259
2260 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
2261 struct socket *so;
2262
2263 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
2264 continue;
2265 so = inp->inp_socket;
2266 tcp_lock(so, 1, 0);
2267 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2268 tcp_unlock(so, 1, 0);
2269 continue;
2270 }
2271 so_check_extended_bk_idle_time(so);
2272 tcp_unlock(so, 1, 0);
2273 }
2274
2275 lck_rw_done(ipi->ipi_lock);
2276 }
2277