]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/tcp_timer.c
xnu-792.10.96.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_timer.c
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
24 * The Regents of the University of California. All rights reserved.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 * must display the following acknowledgement:
36 * This product includes software developed by the University of
37 * California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 * may be used to endorse or promote products derived from this software
40 * without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
55 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
56 */
57
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/kernel.h>
62 #include <sys/mbuf.h>
63 #include <sys/sysctl.h>
64 #include <sys/socket.h>
65 #include <sys/socketvar.h>
66 #include <sys/protosw.h>
67 #include <kern/locks.h>
68
69 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
70
71 #include <net/route.h>
72
73 #include <netinet/in.h>
74 #include <netinet/in_systm.h>
75 #include <netinet/in_pcb.h>
76 #if INET6
77 #include <netinet6/in6_pcb.h>
78 #endif
79 #include <netinet/ip_var.h>
80 #include <netinet/tcp.h>
81 #include <netinet/tcp_fsm.h>
82 #include <netinet/tcp_seq.h>
83 #include <netinet/tcp_timer.h>
84 #include <netinet/tcp_var.h>
85 #include <netinet/tcpip.h>
86 #if TCPDEBUG
87 #include <netinet/tcp_debug.h>
88 #endif
89 #include <sys/kdebug.h>
90
91 #define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8))
92 #define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1)
93
94 /*
95 * NOTE - WARNING
96 *
97 *
98 *
99 *
100 */
101 static int
102 sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
103 {
104 int error, s, tt;
105
106 tt = *(int *)oidp->oid_arg1;
107 s = tt * 1000 / hz;
108
109 error = sysctl_handle_int(oidp, &s, 0, req);
110 if (error || !req->newptr)
111 return (error);
112
113 tt = s * hz / 1000;
114 if (tt < 1)
115 return (EINVAL);
116
117 *(int *)oidp->oid_arg1 = tt;
118 return (0);
119 }
120
/*
 * TCP timer tunables.
 *
 * tcp_keepinit, tcp_keepidle, tcp_keepintvl, tcp_delacktime and tcp_msl are
 * all registered with sysctl_msec_to_ticks as their handler, so each variable
 * holds a tick count while the sysctl interface reads and writes it in
 * milliseconds.
 */
121 int tcp_keepinit;
122 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
123 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
124
125 int tcp_keepidle;
126 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
127 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
128
/*
 * tcp_keepintvl is the value the keep-alive timer is re-armed with after a
 * probe is sent (see tcp_timers), and it is one factor of tcp_maxidle.
 */
129 int tcp_keepintvl;
130 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
131 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
132
133 int tcp_delacktime;
134 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
135 CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
136 "Time before a delayed ACK is sent");
137
138 int tcp_msl;
139 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
140 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
141
/*
 * Plain integer sysctl (no unit conversion).  When non-zero, the keep-alive
 * timer in tcp_timers() treats every connection as if SO_KEEPALIVE were set.
 */
142 static int always_keepalive = 0;
143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
144 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
145
146 static int tcp_keepcnt = TCPTV_KEEPCNT;
147 /* tcp_keepcnt (above): max idle probes */
148 int tcp_maxpersistidle;
149 /* tcp_maxpersistidle (above): max idle time in persist */
/* tcp_maxidle is recomputed as tcp_keepcnt * tcp_keepintvl on every tcp_slowtimo() run. */
150 int tcp_maxidle;
151
/*
 * Time-wait wheel: an array of pcb lists.  add_to_time_wait_locked() files a
 * pcb into one of the slots, and tcp_slowtimo() services the slot selected by
 * cur_tw_slot and then advances cur_tw_slot (wrapping at N_TIME_WAIT_SLOTS).
 */
152 struct inpcbhead time_wait_slots[N_TIME_WAIT_SLOTS];
153 int cur_tw_slot = 0;
154
/* NOTE(review): delack_bitmask is not referenced anywhere in this file as shown; confirm whether it is still used elsewhere. */
155 u_long *delack_bitmask;
156
157
158 void add_to_time_wait_locked(tp)
159 struct tcpcb *tp;
160 {
161 int tw_slot;
162
163 /* pcb list should be locked when we get here */
164 #if 0
165 lck_mtx_assert(tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
166 #endif
167
168 LIST_REMOVE(tp->t_inpcb, inp_list);
169
170 if (tp->t_timer[TCPT_2MSL] == 0)
171 tp->t_timer[TCPT_2MSL] = 1;
172
173 tp->t_rcvtime += tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1);
174 tw_slot = (tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot;
175 if (tw_slot >= N_TIME_WAIT_SLOTS)
176 tw_slot -= N_TIME_WAIT_SLOTS;
177
178 LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
179 }
180
181 void add_to_time_wait(tp)
182 struct tcpcb *tp;
183 {
184 struct inpcbinfo *pcbinfo = &tcbinfo;
185
186 if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
187 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
188 lck_rw_lock_exclusive(pcbinfo->mtx);
189 tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
190 }
191 add_to_time_wait_locked(tp);
192 lck_rw_done(pcbinfo->mtx);
193 }
194
195
196
197
198 /*
199 * Fast timeout routine for processing delayed acks
200 */
201 void
202 tcp_fasttimo()
203 {
204 struct inpcb *inp, *inpnxt;
205 register struct tcpcb *tp;
206
207
208 struct inpcbinfo *pcbinfo = &tcbinfo;
209
210 int delack_checked = 0, delack_done = 0;
211
212 KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_START, 0,0,0,0,0);
213
214 if (tcp_delack_enabled == 0)
215 return;
216
217 lck_rw_lock_shared(pcbinfo->mtx);
218
219 /* Walk the list of valid tcpcbs and send ACKS on the ones with DELACK bit set */
220
221 for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) {
222 inpnxt = inp->inp_list.le_next;
223 /* NOTE: it's OK to check the tp because the pcb can't be removed while we hold pcbinfo->mtx) */
224 if ((tp = (struct tcpcb *)inp->inp_ppcb) && (tp->t_flags & TF_DELACK)) {
225 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
226 continue;
227 tcp_lock(inp->inp_socket, 1, 0);
228 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
229 tcp_unlock(inp->inp_socket, 1, 0);
230 continue;
231 }
232 if (tp->t_flags & TF_DELACK) {
233 delack_done++;
234 tp->t_flags &= ~TF_DELACK;
235 tp->t_flags |= TF_ACKNOW;
236 tcpstat.tcps_delack++;
237 (void) tcp_output(tp);
238 }
239 tcp_unlock(inp->inp_socket, 1, 0);
240 }
241 }
242 KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_checked, delack_done, tcpstat.tcps_delack,0,0);
243 lck_rw_done(pcbinfo->mtx);
244 }
245
246 /*
247 * Tcp protocol timeout routine called every 500 ms.
248 * Updates the timers in all active tcb's and
249 * causes finite state machine actions if timers expire.
 *
 * The work is done in the following steps:
 *  1. With pcbinfo->mtx held shared, walk the tcb list, count down every
 *     non-zero timer of each connection and call tcp_timers() for those
 *     that reach zero; then advance the per-connection tick counters.
 *  2. Still under the shared lock, service the time-wait slot selected by
 *     cur_tw_slot, running the 2MSL action for entries whose timer is done.
 *  3. Switch to an exclusive hold on pcbinfo->mtx and dispose of pcbs
 *     marked WNT_STOPUSING, first on the tcb list and then in the current
 *     time-wait slot.
 *  4. Increment tcp_now, advance cur_tw_slot (wrapping at
 *     N_TIME_WAIT_SLOTS) and release the lock.
250 */
251 void
252 tcp_slowtimo()
253 {
254 struct inpcb *inp, *inpnxt;
255 struct tcpcb *tp;
256 struct socket *so;
257 int i;
258 #if TCPDEBUG
259 int ostate;
260 #endif
261 #if KDEBUG
/* Only referenced as a KERNEL_DEBUG argument and in the KDEBUG-guarded updates below. */
262 static int tws_checked;
263 #endif
264 struct inpcbinfo *pcbinfo = &tcbinfo;
265
266 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);
267
/* Refresh the derived idle limit from the current tunables on every run. */
268 tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
269
270 lck_rw_lock_shared(pcbinfo->mtx);
271
272 /*
273 * Search through tcb's and update active timers.
274 */
275 for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) {
/* Read the successor first; the rest of the body uses only inp. */
276 inpnxt = inp->inp_list.le_next;
277
/* Take a want-count reference; skip pcbs already marked as going away. */
278 if (in_pcb_checkstate(inp, WNT_ACQUIRE,0) == WNT_STOPUSING)
279 continue;
280
281 so = inp->inp_socket;
282 tcp_lock(so, 1, 0);
283
/* Drop the want-count reference now that the socket lock is held. */
284 if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) {
285 tcp_unlock(so, 1, 0);
286 continue;
287 }
288 tp = intotcpcb(inp);
/* No timers are run for pcbs without a tcpcb or for listening sockets. */
289 if (tp == 0 || tp->t_state == TCPS_LISTEN) {
290 tcp_unlock(so, 1, 0);
291 continue;
292 }
293 /*
294 * Bogus state when port owned by SharedIP with loopback as the
295 * only configured interface: BlueBox does not filter loopback
296 */
297 if (tp->t_state == TCP_NSTATES) {
298 tcp_unlock(so, 1, 0);
299 continue;
300 }
301
302
/* Count down each armed timer; run its action when it reaches zero. */
303 for (i = 0; i < TCPT_NTIMERS; i++) {
304 if (tp->t_timer[i] && --tp->t_timer[i] == 0) {
305 #if TCPDEBUG
306 ostate = tp->t_state;
307 #endif
308 tp = tcp_timers(tp, i);
/* A NULL return means the tcpcb is gone: skip the counter updates below. */
309 if (tp == NULL)
310 goto tpgone;
311 #if TCPDEBUG
312 if (tp->t_inpcb->inp_socket->so_options
313 & SO_DEBUG)
314 tcp_trace(TA_USER, ostate, tp,
315 (void *)0,
316 (struct tcphdr *)0,
317 PRU_SLOWTIMO);
318 #endif
319 }
320 }
/* Advance the per-connection tick counters; t_rtttime only while it is non-zero. */
321 tp->t_rcvtime++;
322 tp->t_starttime++;
323 if (tp->t_rtttime)
324 tp->t_rtttime++;
325 tpgone:
326 tcp_unlock(so, 1, 0);
327 }
328
329 #if KDEBUG
330 tws_checked = 0;
331 #endif
332 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);
333
334 /*
335 * Process the items in the current time-wait slot
336 */
337
338 for (inp = time_wait_slots[cur_tw_slot].lh_first; inp; inp = inpnxt)
339 {
340 inpnxt = inp->inp_list.le_next;
341 #if KDEBUG
342 tws_checked++;
343 #endif
344
345 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
346 continue;
347
348 tcp_lock(inp->inp_socket, 1, 0);
349
350 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
351 goto twunlock;
352
353 tp = intotcpcb(inp);
354 if (tp == NULL) { /* tp already closed, remove from list */
355 #if TEMPDEBUG
356 printf("tcp_slowtimo: tp is null in time-wait slot!\n");
357 #endif
358 goto twunlock;
359 }
/*
 * Consume one full revolution of the wheel (N_TIME_WAIT_SLOTS) from the
 * 2MSL timer, or zero it if less than a revolution remains.
 * t_rcvtime is advanced by the same amount in the first case; presumably
 * this stands in for the per-tick increment of the tcb-list pass above,
 * which does not visit pcbs parked on the wheel -- TODO confirm.
 */
360 if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
361 tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
362 tp->t_rcvtime += N_TIME_WAIT_SLOTS;
363 }
364 else
365 tp->t_timer[TCPT_2MSL] = 0;
366
367 if (tp->t_timer[TCPT_2MSL] == 0)
368 tp = tcp_timers(tp, TCPT_2MSL); /* tp can be returned null if tcp_close is called */
369 twunlock:
370 tcp_unlock(inp->inp_socket, 1, 0);
371 }
372
/* The disposal passes below run with the list lock held exclusively. */
373 if (lck_rw_lock_shared_to_exclusive(pcbinfo->mtx) != 0)
374 lck_rw_lock_exclusive(pcbinfo->mtx); /* Upgrade failed, lost lock; now take it again exclusive */
375
376
/* Disposal pass over the tcb list: reap pcbs marked WNT_STOPUSING. */
377 for (inp = tcb.lh_first; inp != NULL; inp = inpnxt) {
378 inpnxt = inp->inp_list.le_next;
379 /* Ignore nat/SharedIP dummy pcbs */
380 if (inp->inp_socket == &tcbinfo.nat_dummy_socket)
381 continue;
382
383 if (inp->inp_wantcnt != WNT_STOPUSING)
384 continue;
385
386 so = inp->inp_socket;
387 if (!lck_mtx_try_lock(inp->inpcb_mtx)) {/* skip if in use */
388 #if TEMPDEBUG
389 printf("tcp_slowtimo so=%x STOPUSING but locked...\n", so);
390 #endif
391 continue;
392 }
393
/*
 * The mutex taken above is only unlocked on the path that keeps the pcb;
 * presumably in_pcbdispose() consumes the locked pcb -- TODO confirm.
 */
394 if (so->so_usecount == 0)
395 in_pcbdispose(inp);
396 else {
/* NOTE(review): tp is dereferenced below without a NULL check; confirm intotcpcb() cannot yield NULL on this path. */
397 tp = intotcpcb(inp);
398 /* Check for embryonic socket stuck on listener queue (4023660) */
399 if ((so->so_usecount == 1) && (tp->t_state == TCPS_CLOSED) &&
400 (so->so_head != NULL) && (so->so_state & SS_INCOMP)) {
401 so->so_usecount--;
402 in_pcbdispose(inp);
403 } else
404 lck_mtx_unlock(inp->inpcb_mtx);
405 }
406 }
407
408 /* Now cleanup the time wait ones */
409 for (inp = time_wait_slots[cur_tw_slot].lh_first; inp; inp = inpnxt)
410 {
411 inpnxt = inp->inp_list.le_next;
412
413 if (inp->inp_wantcnt != WNT_STOPUSING)
414 continue;
415
416 so = inp->inp_socket;
417 if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if in use */
418 continue;
/* Same disposal rules as the tcb-list pass above. */
419 if (so->so_usecount == 0)
420 in_pcbdispose(inp);
421 else {
/* NOTE(review): as above, tp is used without a NULL check. */
422 tp = intotcpcb(inp);
423 /* Check for embryonic socket stuck on listener queue (4023660) */
424 if ((so->so_usecount == 1) && (tp->t_state == TCPS_CLOSED) &&
425 (so->so_head != NULL) && (so->so_state & SS_INCOMP)) {
426 so->so_usecount--;
427 in_pcbdispose(inp);
428 } else
429 lck_mtx_unlock(inp->inpcb_mtx);
430 }
431 }
432
/* Advance the global tick counter and move on to the next wheel slot, wrapping at the end. */
433 tcp_now++;
434 if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
435 cur_tw_slot = 0;
436
437 lck_rw_done(pcbinfo->mtx);
438 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
439 }
440
441 /*
442 * Cancel all timers for TCP tp.
443 */
444 void
445 tcp_canceltimers(tp)
446 struct tcpcb *tp;
447 {
448 register int i;
449
450 for (i = 0; i < TCPT_NTIMERS; i++)
451 tp->t_timer[i] = 0;
452 }
453
/*
 * Retransmit backoff multipliers, indexed by tp->t_rxtshift in tcp_timers().
 * The retransmit timer multiplies TCP_REXMTVAL(tp) by tcp_syn_backoff[] while
 * the connection is in TCPS_SYN_SENT and by tcp_backoff[] in every other
 * state.  Each table has TCP_MAXRXTSHIFT + 1 entries.
 */
454 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
455 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
456
457 int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
458 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
459
/*
 * Used by the persist timer in tcp_timers() as an idle-time bound
 * (TCP_REXMTVAL(tp) * tcp_totbackoff).  Must be kept in step with
 * tcp_backoff[] by hand: 1+2+4+8+16+32 + 7*64 = 511.
 */
460 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
461
462 /*
463 * TCP timer processing.
464 */
465 struct tcpcb *
466 tcp_timers(tp, timer)
467 register struct tcpcb *tp;
468 int timer;
469 {
470 register int rexmt;
471 struct socket *so_tmp;
472 struct tcptemp *t_template;
473
474 #if TCPDEBUG
475 int ostate;
476 #endif
477
478 #if INET6
479 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
480 #endif /* INET6 */
481
482 so_tmp = tp->t_inpcb->inp_socket;
483
484 switch (timer) {
485
486 /*
487 * 2 MSL timeout in shutdown went off. If we're closed but
488 * still waiting for peer to close and connection has been idle
489 * too long, or if 2MSL time is up from TIME_WAIT, delete connection
490 * control block. Otherwise, check again in a bit.
491 */
492 case TCPT_2MSL:
493 if (tp->t_state != TCPS_TIME_WAIT &&
494 tp->t_rcvtime <= tcp_maxidle) {
495 tp->t_timer[TCPT_2MSL] = (unsigned long)tcp_keepintvl;
496 add_to_time_wait_locked(tp);
497 }
498 else {
499 tp = tcp_close(tp);
500 return(tp);
501 }
502 break;
503
504 /*
505 * Retransmission timer went off. Message has not
506 * been acked within retransmit interval. Back off
507 * to a longer retransmit interval and retransmit one segment.
508 */
509 case TCPT_REXMT:
510 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
511 tp->t_rxtshift = TCP_MAXRXTSHIFT;
512 tcpstat.tcps_timeoutdrop++;
513 tp = tcp_drop(tp, tp->t_softerror ?
514 tp->t_softerror : ETIMEDOUT);
515 postevent(so_tmp, 0, EV_TIMEOUT);
516 break;
517 }
518
519 if (tp->t_rxtshift == 1) {
520 /*
521 * first retransmit; record ssthresh and cwnd so they can
522 * be recovered if this turns out to be a "bad" retransmit.
523 * A retransmit is considered "bad" if an ACK for this
524 * segment is received within RTT/2 interval; the assumption
525 * here is that the ACK was already in flight. See
526 * "On Estimating End-to-End Network Path Properties" by
527 * Allman and Paxson for more details.
528 */
529 tp->snd_cwnd_prev = tp->snd_cwnd;
530 tp->snd_ssthresh_prev = tp->snd_ssthresh;
531 tp->t_badrxtwin = tcp_now + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
532 }
533 tcpstat.tcps_rexmttimeo++;
534 if (tp->t_state == TCPS_SYN_SENT)
535 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
536 else
537 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
538 TCPT_RANGESET(tp->t_rxtcur, rexmt,
539 tp->t_rttmin, TCPTV_REXMTMAX);
540 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
541
542 /*
543 * Disable rfc1323 and rfc1644 if we havn't got any response to
544 * our third SYN to work-around some broken terminal servers
545 * (most of which have hopefully been retired) that have bad VJ
546 * header compression code which trashes TCP segments containing
547 * unknown-to-them TCP options.
548 */
549 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
550 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
551 /*
552 * If losing, let the lower level know and try for
553 * a better route. Also, if we backed off this far,
554 * our srtt estimate is probably bogus. Clobber it
555 * so we'll take the next rtt measurement as our srtt;
556 * move the current srtt into rttvar to keep the current
557 * retransmit times until then.
558 */
559 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
560 #if INET6
561 if (isipv6)
562 in6_losing(tp->t_inpcb);
563 else
564 #endif /* INET6 */
565 in_losing(tp->t_inpcb);
566 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
567 tp->t_srtt = 0;
568 }
569 tp->snd_nxt = tp->snd_una;
570 /*
571 * Note: We overload snd_recover to function also as the
572 * snd_last variable described in RFC 2582
573 */
574 tp->snd_recover = tp->snd_max;
575 /*
576 * Force a segment to be sent.
577 */
578 tp->t_flags |= TF_ACKNOW;
579 /*
580 * If timing a segment in this window, stop the timer.
581 */
582 tp->t_rtttime = 0;
583 /*
584 * Close the congestion window down to one segment
585 * (we'll open it by one segment for each ack we get).
586 * Since we probably have a window's worth of unacked
587 * data accumulated, this "slow start" keeps us from
588 * dumping all that data as back-to-back packets (which
589 * might overwhelm an intermediate gateway).
590 *
591 * There are two phases to the opening: Initially we
592 * open by one mss on each ack. This makes the window
593 * size increase exponentially with time. If the
594 * window is larger than the path can handle, this
595 * exponential growth results in dropped packet(s)
596 * almost immediately. To get more time between
597 * drops but still "push" the network to take advantage
598 * of improving conditions, we switch from exponential
599 * to linear window opening at some threshhold size.
600 * For a threshhold, we use half the current window
601 * size, truncated to a multiple of the mss.
602 *
603 * (the minimum cwnd that will give us exponential
604 * growth is 2 mss. We don't allow the threshhold
605 * to go below this.)
606 */
607 {
608 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
609 if (win < 2)
610 win = 2;
611 tp->snd_cwnd = tp->t_maxseg;
612 tp->snd_ssthresh = win * tp->t_maxseg;
613 tp->t_dupacks = 0;
614 }
615 (void) tcp_output(tp);
616 break;
617
618 /*
619 * Persistance timer into zero window.
620 * Force a byte to be output, if possible.
621 */
622 case TCPT_PERSIST:
623 tcpstat.tcps_persisttimeo++;
624 /*
625 * Hack: if the peer is dead/unreachable, we do not
626 * time out if the window is closed. After a full
627 * backoff, drop the connection if the idle time
628 * (no responses to probes) reaches the maximum
629 * backoff that we would use if retransmitting.
630 */
631 if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
632 (tp->t_rcvtime >= tcp_maxpersistidle ||
633 tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
634 tcpstat.tcps_persistdrop++;
635 so_tmp = tp->t_inpcb->inp_socket;
636 tp = tcp_drop(tp, ETIMEDOUT);
637 postevent(so_tmp, 0, EV_TIMEOUT);
638 break;
639 }
640 tcp_setpersist(tp);
641 tp->t_force = 1;
642 (void) tcp_output(tp);
643 tp->t_force = 0;
644 break;
645
646 /*
647 * Keep-alive timer went off; send something
648 * or drop connection if idle for too long.
649 */
650 case TCPT_KEEP:
651 tcpstat.tcps_keeptimeo++;
652 if (tp->t_state < TCPS_ESTABLISHED)
653 goto dropit;
654 if ((always_keepalive ||
655 tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
656 tp->t_state <= TCPS_CLOSING) {
657 if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (unsigned long)tcp_maxidle)
658 goto dropit;
659 /*
660 * Send a packet designed to force a response
661 * if the peer is up and reachable:
662 * either an ACK if the connection is still alive,
663 * or an RST if the peer has closed the connection
664 * due to timeout or reboot.
665 * Using sequence number tp->snd_una-1
666 * causes the transmitted zero-length segment
667 * to lie outside the receive window;
668 * by the protocol spec, this requires the
669 * correspondent TCP to respond.
670 */
671 tcpstat.tcps_keepprobe++;
672 t_template = tcp_maketemplate(tp);
673 if (t_template) {
674 tcp_respond(tp, t_template->tt_ipgen,
675 &t_template->tt_t, (struct mbuf *)NULL,
676 tp->rcv_nxt, tp->snd_una - 1, 0);
677 (void) m_free(dtom(t_template));
678 }
679 tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
680 } else
681 tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
682 break;
683
684 #if TCPDEBUG
685 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
686 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
687 PRU_SLOWTIMO);
688 #endif
689 dropit:
690 tcpstat.tcps_keepdrops++;
691 tp = tcp_drop(tp, ETIMEDOUT);
692 postevent(so_tmp, 0, EV_TIMEOUT);
693 break;
694 }
695 return (tp);
696 }