]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/tcp_input.c
c2925621ee77b65ea20d62c7bba4027ee8444c4f
[apple/xnu.git] / bsd / netinet / tcp_input.c
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /*
26 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
27 * The Regents of the University of California. All rights reserved.
28 *
29 * Redistribution and use in source and binary forms, with or without
30 * modification, are permitted provided that the following conditions
31 * are met:
32 * 1. Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * 2. Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in the
36 * documentation and/or other materials provided with the distribution.
37 * 3. All advertising materials mentioning features or use of this software
38 * must display the following acknowledgement:
39 * This product includes software developed by the University of
40 * California, Berkeley and its contributors.
41 * 4. Neither the name of the University nor the names of its contributors
42 * may be used to endorse or promote products derived from this software
43 * without specific prior written permission.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
58 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
59 */
60
61
62 #include <sys/param.h>
63 #include <sys/systm.h>
64 #include <sys/kernel.h>
65 #include <sys/sysctl.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/proc.h> /* for proc0 declaration */
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/syslog.h>
73
74 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
75
76 #include <net/if.h>
77 #include <net/if_types.h>
78 #include <net/route.h>
79
80 #include <netinet/in.h>
81 #include <netinet/in_systm.h>
82 #include <netinet/ip.h>
83 #include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
84 #include <netinet/in_var.h>
85 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
86 #include <netinet/in_pcb.h>
87 #include <netinet/ip_var.h>
88 #if INET6
89 #include <netinet/ip6.h>
90 #include <netinet/icmp6.h>
91 #include <netinet6/nd6.h>
92 #include <netinet6/ip6_var.h>
93 #include <netinet6/in6_pcb.h>
94 #endif
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98 #include <netinet/tcp_timer.h>
99 #include <netinet/tcp_var.h>
100 #if INET6
101 #include <netinet6/tcp6_var.h>
102 #endif
103 #include <netinet/tcpip.h>
104 #if TCPDEBUG
105 #include <netinet/tcp_debug.h>
106 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
107 struct tcphdr tcp_savetcp;
108 #endif /* TCPDEBUG */
109
110 #if IPSEC
111 #include <netinet6/ipsec.h>
112 #if INET6
113 #include <netinet6/ipsec6.h>
114 #endif
115 #include <netkey/key.h>
116 #endif /*IPSEC*/
117
118 #include <sys/kdebug.h>
119
120 #ifndef __APPLE__
121 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
122 #endif
123
124 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
125 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
126 #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
127 #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
128
129 static int tcprexmtthresh = 3;
130 tcp_cc tcp_ccgen;
131 extern int apple_hwcksum_rx;
132
133 #if IPSEC
134 extern int ipsec_bypass;
135 #endif
136
137 struct tcpstat tcpstat;
138 SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD,
139 &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
140
141 static int log_in_vain = 0;
142 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
143 &log_in_vain, 0, "Log all incoming TCP connections");
144
145 static int blackhole = 0;
146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
147 &blackhole, 0, "Do not send RST when dropping refused connections");
148
149 int tcp_delack_enabled = 1;
150 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
151 &tcp_delack_enabled, 0,
152 "Delay ACK to try and piggyback it onto a data packet");
153
154 int tcp_lq_overflow = 1;
155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW,
156 &tcp_lq_overflow, 0,
157 "Listen Queue Overflow");
158
159 #if TCP_DROP_SYNFIN
160 static int drop_synfin = 0;
161 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
162 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
163 #endif
164
165 __private_extern__ int slowlink_wsize = 8192;
166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW,
167 &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
168
169
170 u_long tcp_now;
171 struct inpcbhead tcb;
172 #define tcb6 tcb /* for KAME src sync over BSD*'s */
173 struct inpcbinfo tcbinfo;
174
/* Forward declarations for file-local helpers (__P() prototype macro for
 * pre-ANSI compiler compatibility). */
static void	tcp_dooptions __P((struct tcpcb *,
	u_char *, int, struct tcphdr *, struct tcpopt *));
static void	tcp_pulloutofband __P((struct socket *,
	struct tcphdr *, struct mbuf *, int));
static int	tcp_reass __P((struct tcpcb *, struct tcphdr *, int *,
	struct mbuf *));
static void	tcp_xmit_timer __P((struct tcpcb *, int));
static int	tcp_newreno __P((struct tcpcb *, struct tcphdr *));

/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 * Tells ND that the peer is reachable (forward progress on an IPv6
 * connection with a cached route); expands to nothing without INET6. */
#if INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_inpcb && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
	    (tp)->t_inpcb->in6p_route.ro_rt) \
		nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif

/* NOTE(review): defined elsewhere in the TCP subsystem; presumably tracks
 * connections with a pending delayed ACK — confirm against tcp_timer.c. */
extern u_long	*delack_bitmask;

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- delayed acks are enabled and
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 */
#define DELAY_ACK(tp) \
	(tcp_delack_enabled && !callout_pending(tp->tt_delack) && \
	(tp->t_flags & TF_RXWIN0SENT) == 0)
209
210
/*
 * Insert the segment described by <th, m> (data length *tlenp) into the
 * out-of-order reassembly queue of connection tp, trimming any bytes that
 * overlap segments already queued.  When the head of the queue becomes
 * contiguous with rcv_nxt, hand the in-order data to the socket layer and
 * advance rcv_nxt past it.
 *
 * tp	 - connection whose reassembly queue is updated
 * th	 - TCP header of the arriving segment (host byte order); th == 0
 *	   means "present queued data only" (see below)
 * tlenp - in/out: segment data length, reduced here if overlap is trimmed
 * m	 - mbuf chain holding the segment payload
 *
 * Returns the TH_FIN flag of the last segment appended to the socket
 * buffer, or 0 if no data could be presented to the user yet.
 */
static int
tcp_reass(tp, th, tlenp, m)
	register struct tcpcb *tp;
	register struct tcphdr *th;
	int *tlenp;
	struct mbuf *m;
{
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;	/* last queued entry starting at or before th */
	struct tseg_qent *nq;
	struct tseg_qent *te;		/* queue entry for the new segment */
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
	       M_NOWAIT);
	if (te == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		register int i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				/* Entirely duplicated data; drop the new segment. */
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlenp;
				m_freem(m);
				FREE(te, M_TSEGQ);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			/* Partial overlap: trim the leading i bytes. */
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlenp;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
		if (i <= 0)
			break;
		if (i < q->tqe_len) {
			/* Partial overlap: trim the front of q and stop. */
			q->tqe_th->th_seq += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, i);
			break;
		}

		/* q is completely covered by the new segment; drop it. */
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = th;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	q = LIST_FIRST(&tp->t_segq);
	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
		return (0);	/* still a hole at the left window edge */
	do {
		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_th->th_flags & TH_FIN;
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tqe_m);	/* receive side shut down; discard */
		else
			sbappend(&so->so_rcv, q->tqe_m);
		FREE(q, M_TSEGQ);
		q = nq;
	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	ND6_HINT(tp);	/* forward progress: confirm IPv6 neighbor reachability */

#if INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {

		KERNEL_DEBUG(DBG_LAYER_BEG,
		     ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
		     (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
		      (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
		     0,0,0);
	}
	else
#endif
	{
		KERNEL_DEBUG(DBG_LAYER_BEG,
		     ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
		     (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
		      (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
		     0,0,0);
	}
	/* Wake any reader sleeping on the socket receive buffer. */
	sorwakeup(so);
	return (flags);

}
360
361
362 /*
363 * TCP input routine, follows pages 65-76 of the
364 * protocol specification dated September, 1981 very closely.
365 */
366 #if INET6
367 int
368 tcp6_input(mp, offp, proto)
369 struct mbuf **mp;
370 int *offp, proto;
371 {
372 register struct mbuf *m = *mp;
373 struct in6_ifaddr *ia6;
374
375 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
376
377 /*
378 * draft-itojun-ipv6-tcp-to-anycast
379 * better place to put this in?
380 */
381 ia6 = ip6_getdstifaddr(m);
382 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
383 struct ip6_hdr *ip6;
384
385 ip6 = mtod(m, struct ip6_hdr *);
386 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
387 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
388 return IPPROTO_DONE;
389 }
390
391 tcp_input(m, *offp);
392 return IPPROTO_DONE;
393 }
394 #endif
395
396 void
397 tcp_input(m, off0)
398 struct mbuf *m;
399 int off0;
400 {
401 register struct tcphdr *th;
402 register struct ip *ip = NULL;
403 register struct ipovly *ipov;
404 register struct inpcb *inp;
405 u_char *optp = NULL;
406 int optlen = 0;
407 int len, tlen, off;
408 int drop_hdrlen;
409 register struct tcpcb *tp = 0;
410 register int thflags;
411 struct socket *so = 0;
412 int todrop, acked, ourfinisacked, needoutput = 0;
413 struct in_addr laddr;
414 #if INET6
415 struct in6_addr laddr6;
416 #endif
417 int dropsocket = 0;
418 int iss = 0;
419 u_long tiwin;
420 struct tcpopt to; /* options in this segment */
421 struct rmxp_tao *taop; /* pointer to our TAO cache entry */
422 struct rmxp_tao tao_noncached; /* in case there's no cached entry */
423 #if TCPDEBUG
424 short ostate = 0;
425 #endif
426 #if INET6
427 struct ip6_hdr *ip6 = NULL;
428 int isipv6;
429 #endif /* INET6 */
430 int rstreason; /* For badport_bandlim accounting purposes */
431 struct proc *proc0=current_proc();
432
433 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
434
435 #if INET6
436 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
437 #endif
438 bzero((char *)&to, sizeof(to));
439
440 tcpstat.tcps_rcvtotal++;
441
442
443
444 #if INET6
445 if (isipv6) {
446 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
447 ip6 = mtod(m, struct ip6_hdr *);
448 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
449 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
450 tcpstat.tcps_rcvbadsum++;
451 goto drop;
452 }
453 th = (struct tcphdr *)((caddr_t)ip6 + off0);
454
455 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
456 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
457 th->th_seq, th->th_ack, th->th_win);
458 /*
459 * Be proactive about unspecified IPv6 address in source.
460 * As we use all-zero to indicate unbounded/unconnected pcb,
461 * unspecified IPv6 address can be used to confuse us.
462 *
463 * Note that packets with unspecified IPv6 destination is
464 * already dropped in ip6_input.
465 */
466 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
467 /* XXX stat */
468 goto drop;
469 }
470 } else
471 #endif /* INET6 */
472 {
473 /*
474 * Get IP and TCP header together in first mbuf.
475 * Note: IP leaves IP header in first mbuf.
476 */
477 if (off0 > sizeof (struct ip)) {
478 ip_stripoptions(m, (struct mbuf *)0);
479 off0 = sizeof(struct ip);
480 if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)
481 m->m_pkthdr.csum_flags = 0; /* invalidate hwcksuming */
482
483 }
484 if (m->m_len < sizeof (struct tcpiphdr)) {
485 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
486 tcpstat.tcps_rcvshort++;
487 return;
488 }
489 }
490 ip = mtod(m, struct ip *);
491 ipov = (struct ipovly *)ip;
492 th = (struct tcphdr *)((caddr_t)ip + off0);
493 tlen = ip->ip_len;
494
495 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
496 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
497 th->th_seq, th->th_ack, th->th_win);
498
499 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
500 if (apple_hwcksum_rx && (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)) {
501 u_short pseudo;
502 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
503 ipov->ih_len = (u_short)tlen;
504 HTONS(ipov->ih_len);
505 pseudo = in_cksum(m, sizeof (struct ip));
506 th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
507 } else {
508 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
509 th->th_sum = m->m_pkthdr.csum_data;
510 else
511 th->th_sum = in_pseudo(ip->ip_src.s_addr,
512 ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
513 ip->ip_len + IPPROTO_TCP));
514 }
515 th->th_sum ^= 0xffff;
516 } else {
517 /*
518 * Checksum extended TCP header and data.
519 */
520 len = sizeof (struct ip) + tlen;
521 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
522 ipov->ih_len = (u_short)tlen;
523 HTONS(ipov->ih_len);
524 th->th_sum = in_cksum(m, len);
525 }
526 if (th->th_sum) {
527 tcpstat.tcps_rcvbadsum++;
528 goto drop;
529 }
530 #if INET6
531 /* Re-initialization for later version check */
532 ip->ip_v = IPVERSION;
533 #endif
534 }
535
536 /*
537 * Check that TCP offset makes sense,
538 * pull out TCP options and adjust length. XXX
539 */
540 off = th->th_off << 2;
541 if (off < sizeof (struct tcphdr) || off > tlen) {
542 tcpstat.tcps_rcvbadoff++;
543 goto drop;
544 }
545 tlen -= off; /* tlen is used instead of ti->ti_len */
546 if (off > sizeof (struct tcphdr)) {
547 #if INET6
548 if (isipv6) {
549 IP6_EXTHDR_CHECK(m, off0, off, );
550 ip6 = mtod(m, struct ip6_hdr *);
551 th = (struct tcphdr *)((caddr_t)ip6 + off0);
552 } else
553 #endif /* INET6 */
554 {
555 if (m->m_len < sizeof(struct ip) + off) {
556 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
557 tcpstat.tcps_rcvshort++;
558 return;
559 }
560 ip = mtod(m, struct ip *);
561 ipov = (struct ipovly *)ip;
562 th = (struct tcphdr *)((caddr_t)ip + off0);
563 }
564 }
565 optlen = off - sizeof (struct tcphdr);
566 optp = (u_char *)(th + 1);
567 /*
568 * Do quick retrieval of timestamp options ("options
569 * prediction?"). If timestamp is the only option and it's
570 * formatted as recommended in RFC 1323 appendix A, we
571 * quickly get the values now and not bother calling
572 * tcp_dooptions(), etc.
573 */
574 if ((optlen == TCPOLEN_TSTAMP_APPA ||
575 (optlen > TCPOLEN_TSTAMP_APPA &&
576 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
577 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
578 (th->th_flags & TH_SYN) == 0) {
579 to.to_flag |= TOF_TS;
580 to.to_tsval = ntohl(*(u_int32_t *)(optp + 4));
581 to.to_tsecr = ntohl(*(u_int32_t *)(optp + 8));
582 optp = NULL; /* we've parsed the options */
583 }
584 }
585 thflags = th->th_flags;
586
587 #if TCP_DROP_SYNFIN
588 /*
589 * If the drop_synfin option is enabled, drop all packets with
590 * both the SYN and FIN bits set. This prevents e.g. nmap from
591 * identifying the TCP/IP stack.
592 *
593 * This is incompatible with RFC1644 extensions (T/TCP).
594 */
595 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
596 goto drop;
597 #endif
598
599 /*
600 * Convert TCP protocol specific fields to host format.
601 */
602 NTOHL(th->th_seq);
603 NTOHL(th->th_ack);
604 NTOHS(th->th_win);
605 NTOHS(th->th_urp);
606
607 /*
608 * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options,
609 * until after ip6_savecontrol() is called and before other functions
610 * which don't want those proto headers.
611 * Because ip6_savecontrol() is going to parse the mbuf to
612 * search for data to be passed up to user-land, it wants mbuf
613 * parameters to be unchanged.
614 */
615 drop_hdrlen = off0 + off;
616
617 /*
618 * Locate pcb for segment.
619 */
620 findpcb:
621 #if IPFIREWALL_FORWARD
622 if (ip_fw_fwd_addr != NULL
623 #if INET6
624 && isipv6 == NULL /* IPv6 support is not yet */
625 #endif /* INET6 */
626 ) {
627 /*
628 * Diverted. Pretend to be the destination.
629 * already got one like this?
630 */
631 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
632 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
633 if (!inp) {
634 /*
635 * No, then it's new. Try find the ambushing socket
636 */
637 if (!ip_fw_fwd_addr->sin_port) {
638 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
639 th->th_sport, ip_fw_fwd_addr->sin_addr,
640 th->th_dport, 1, m->m_pkthdr.rcvif);
641 } else {
642 inp = in_pcblookup_hash(&tcbinfo,
643 ip->ip_src, th->th_sport,
644 ip_fw_fwd_addr->sin_addr,
645 ntohs(ip_fw_fwd_addr->sin_port), 1,
646 m->m_pkthdr.rcvif);
647 }
648 }
649 ip_fw_fwd_addr = NULL;
650 } else
651 #endif /* IPFIREWALL_FORWARD */
652 {
653 #if INET6
654 if (isipv6)
655 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
656 &ip6->ip6_dst, th->th_dport, 1,
657 m->m_pkthdr.rcvif);
658 else
659 #endif /* INET6 */
660 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
661 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
662 }
663
664 #if IPSEC
665 #if INET6
666 if (isipv6) {
667 if (ipsec_bypass == 0 && inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
668 ipsec6stat.in_polvio++;
669 goto drop;
670 }
671 } else
672 #endif /* INET6 */
673 if (ipsec_bypass == 0 && inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
674 ipsecstat.in_polvio++;
675 goto drop;
676 }
677 #endif /*IPSEC*/
678
679 /*
680 * If the state is CLOSED (i.e., TCB does not exist) then
681 * all data in the incoming segment is discarded.
682 * If the TCB exists but is in CLOSED state, it is embryonic,
683 * but should either do a listen or a connect soon.
684 */
685 if (inp == NULL) {
686 if (log_in_vain) {
687 #if INET6
688 char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN];
689 #else /* INET6 */
690 char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"];
691 #endif /* INET6 */
692
693 #if INET6
694 if (isipv6) {
695 strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst));
696 strcpy(sbuf, ip6_sprintf(&ip6->ip6_src));
697 } else
698 #endif
699 {
700 strcpy(dbuf, inet_ntoa(ip->ip_dst));
701 strcpy(sbuf, inet_ntoa(ip->ip_src));
702 }
703 switch (log_in_vain) {
704 case 1:
705 if(thflags & TH_SYN)
706 log(LOG_INFO,
707 "Connection attempt to TCP %s:%d from %s:%d\n",
708 dbuf, ntohs(th->th_dport),
709 sbuf,
710 ntohs(th->th_sport));
711 break;
712 case 2:
713 log(LOG_INFO,
714 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
715 dbuf, ntohs(th->th_dport), sbuf,
716 ntohs(th->th_sport), thflags);
717 break;
718 default:
719 break;
720 }
721 }
722 if (blackhole) {
723 switch (blackhole) {
724 case 1:
725 if (thflags & TH_SYN)
726 goto drop;
727 break;
728 case 2:
729 goto drop;
730 default:
731 goto drop;
732 }
733 }
734 rstreason = BANDLIM_RST_CLOSEDPORT;
735 goto dropwithreset;
736 }
737 tp = intotcpcb(inp);
738 if (tp == 0) {
739 rstreason = BANDLIM_RST_CLOSEDPORT;
740 goto dropwithreset;
741 }
742 if (tp->t_state == TCPS_CLOSED)
743 goto drop;
744
745 #ifdef __APPLE__
746 /*
747 * Bogus state when listening port owned by SharedIP with loopback as the
748 * only configured interface: BlueBox does not filters loopback
749 */
750 if (tp->t_state == TCP_NSTATES)
751 goto drop;
752 #endif
753
754 /* Unscale the window into a 32-bit value. */
755 if ((thflags & TH_SYN) == 0)
756 tiwin = th->th_win << tp->snd_scale;
757 else
758 tiwin = th->th_win;
759
760 so = inp->inp_socket;
761 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
762 #if TCPDEBUG
763 if (so->so_options & SO_DEBUG) {
764 ostate = tp->t_state;
765 #if INET6
766 if (isipv6)
767 bcopy((char *)ip6, (char *)tcp_saveipgen,
768 sizeof(*ip6));
769 else
770 #endif /* INET6 */
771 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
772 tcp_savetcp = *th;
773 }
774 #endif
775 if (so->so_options & SO_ACCEPTCONN) {
776 register struct tcpcb *tp0 = tp;
777 struct socket *so2;
778 #if IPSEC
779 struct socket *oso;
780 #endif
781 #if INET6
782 struct inpcb *oinp = sotoinpcb(so);
783 #endif /* INET6 */
784
785 #if !IPSEC
786 /*
787 * Current IPsec implementation makes incorrect IPsec
788 * cache if this check is done here.
789 * So delay this until duplicated socket is created.
790 */
791 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
792 /*
793 * Note: dropwithreset makes sure we don't
794 * send a RST in response to a RST.
795 */
796 if (thflags & TH_ACK) {
797 tcpstat.tcps_badsyn++;
798 rstreason = BANDLIM_RST_OPENPORT;
799 goto dropwithreset;
800 }
801 goto drop;
802 }
803 #endif
804 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
805
806 #if INET6
807 /*
808 * If deprecated address is forbidden,
809 * we do not accept SYN to deprecated interface
810 * address to prevent any new inbound connection from
811 * getting established.
812 * When we do not accept SYN, we send a TCP RST,
813 * with deprecated source address (instead of dropping
814 * it). We compromise it as it is much better for peer
815 * to send a RST, and RST will be the final packet
816 * for the exchange.
817 *
818 * If we do not forbid deprecated addresses, we accept
819 * the SYN packet. RFC2462 does not suggest dropping
820 * SYN in this case.
821 * If we decipher RFC2462 5.5.4, it says like this:
822 * 1. use of deprecated addr with existing
823 * communication is okay - "SHOULD continue to be
824 * used"
825 * 2. use of it with new communication:
826 * (2a) "SHOULD NOT be used if alternate address
827 * with sufficient scope is available"
828 * (2b) nothing mentioned otherwise.
829 * Here we fall into (2b) case as we have no choice in
830 * our source address selection - we must obey the peer.
831 *
832 * The wording in RFC2462 is confusing, and there are
833 * multiple description text for deprecated address
834 * handling - worse, they are not exactly the same.
835 * I believe 5.5.4 is the best one, so we follow 5.5.4.
836 */
837 if (isipv6 && !ip6_use_deprecated) {
838 struct in6_ifaddr *ia6;
839
840 if ((ia6 = ip6_getdstifaddr(m)) &&
841 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
842 tp = NULL;
843 rstreason = BANDLIM_RST_OPENPORT;
844 goto dropwithreset;
845 }
846 }
847 #endif
848
849 so2 = sonewconn(so, 0);
850 if (so2 == 0) {
851 tcpstat.tcps_listendrop++;
852 so2 = sodropablereq(so);
853 if (so2) {
854 if (tcp_lq_overflow)
855 sototcpcb(so2)->t_flags |=
856 TF_LQ_OVERFLOW;
857 tcp_drop(sototcpcb(so2), ETIMEDOUT);
858 so2 = sonewconn(so, 0);
859 }
860 if (!so2)
861 goto drop;
862 }
863 #if IPSEC
864 oso = so;
865 #endif
866 so = so2;
867 /*
868 * This is ugly, but ....
869 *
870 * Mark socket as temporary until we're
871 * committed to keeping it. The code at
872 * ``drop'' and ``dropwithreset'' check the
873 * flag dropsocket to see if the temporary
874 * socket created here should be discarded.
875 * We mark the socket as discardable until
876 * we're committed to it below in TCPS_LISTEN.
877 */
878 dropsocket++;
879 inp = (struct inpcb *)so->so_pcb;
880 #if INET6
881 if (isipv6)
882 inp->in6p_laddr = ip6->ip6_dst;
883 else {
884 inp->inp_vflag &= ~INP_IPV6;
885 inp->inp_vflag |= INP_IPV4;
886 #endif /* INET6 */
887 inp->inp_laddr = ip->ip_dst;
888 #if INET6
889 }
890 #endif /* INET6 */
891 inp->inp_lport = th->th_dport;
892 if (in_pcbinshash(inp) != 0) {
893 /*
894 * Undo the assignments above if we failed to
895 * put the PCB on the hash lists.
896 */
897 #if INET6
898 if (isipv6)
899 inp->in6p_laddr = in6addr_any;
900 else
901 #endif /* INET6 */
902 inp->inp_laddr.s_addr = INADDR_ANY;
903 inp->inp_lport = 0;
904 goto drop;
905 }
906 #if IPSEC
907 /*
908 * To avoid creating incorrectly cached IPsec
909 * association, this is need to be done here.
910 *
911 * Subject: (KAME-snap 748)
912 * From: Wayne Knowles <w.knowles@niwa.cri.nz>
913 * ftp://ftp.kame.net/pub/mail-list/snap-users/748
914 */
915 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
916 /*
917 * Note: dropwithreset makes sure we don't
918 * send a RST in response to a RST.
919 */
920 if (thflags & TH_ACK) {
921 tcpstat.tcps_badsyn++;
922 rstreason = BANDLIM_RST_OPENPORT;
923 goto dropwithreset;
924 }
925 goto drop;
926 }
927 #endif
928 #if INET6
929 if (isipv6) {
930 /*
931 * Inherit socket options from the listening
932 * socket.
933 * Note that in6p_inputopts are not (even
934 * should not be) copied, since it stores
935 * previously received options and is used to
936 * detect if each new option is different than
937 * the previous one and hence should be passed
938 * to a user.
939 * If we copied in6p_inputopts, a user would
940 * not be able to receive options just after
941 * calling the accept system call.
942 */
943 inp->inp_flags |=
944 oinp->inp_flags & INP_CONTROLOPTS;
945 if (oinp->in6p_outputopts)
946 inp->in6p_outputopts =
947 ip6_copypktopts(oinp->in6p_outputopts,
948 M_NOWAIT);
949 } else
950 #endif /* INET6 */
951 inp->inp_options = ip_srcroute();
952 #if IPSEC
953 /* copy old policy into new socket's */
954 if (sotoinpcb(oso)->inp_sp)
955 {
956 int error = 0;
957 /* Is it a security hole here to silently fail to copy the policy? */
958 if (inp->inp_sp != NULL)
959 error = ipsec_init_policy(so, &inp->inp_sp);
960 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
961 printf("tcp_input: could not copy policy\n");
962 }
963 #endif
964 tp = intotcpcb(inp);
965 tp->t_state = TCPS_LISTEN;
966 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
967
968 /* Compute proper scaling value from buffer space */
969 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
970 TCP_MAXWIN << tp->request_r_scale <
971 so->so_rcv.sb_hiwat)
972 tp->request_r_scale++;
973
974 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
975 }
976 }
977
978 /*
979 * Segment received on connection.
980 * Reset idle time and keep-alive timer.
981 */
982 tp->t_rcvtime = 0;
983 if (TCPS_HAVEESTABLISHED(tp->t_state))
984 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
985
986 /*
987 * Process options if not in LISTEN state,
988 * else do it below (after getting remote address).
989 */
990 if (tp->t_state != TCPS_LISTEN && optp)
991 tcp_dooptions(tp, optp, optlen, th, &to);
992
993 /*
994 * Header prediction: check for the two common cases
995 * of a uni-directional data xfer. If the packet has
996 * no control flags, is in-sequence, the window didn't
997 * change and we're not retransmitting, it's a
998 * candidate. If the length is zero and the ack moved
999 * forward, we're the sender side of the xfer. Just
1000 * free the data acked & wake any higher level process
1001 * that was blocked waiting for space. If the length
1002 * is non-zero and the ack didn't move, we're the
1003 * receiver side. If we're getting packets in-order
1004 * (the reassembly queue is empty), add the data to
1005 * the socket buffer and note that we need a delayed ack.
1006 * Make sure that the hidden state-flags are also off.
1007 * Since we check for TCPS_ESTABLISHED above, it can only
1008 * be TH_NEEDSYN.
1009 */
1010 if (tp->t_state == TCPS_ESTABLISHED &&
1011 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1012 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1013 ((to.to_flag & TOF_TS) == 0 ||
1014 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1015 /*
1016 * Using the CC option is compulsory if once started:
1017 * the segment is OK if no T/TCP was negotiated or
1018 * if the segment has a CC option equal to CCrecv
1019 */
1020 ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
1021 ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
1022 th->th_seq == tp->rcv_nxt &&
1023 tiwin && tiwin == tp->snd_wnd &&
1024 tp->snd_nxt == tp->snd_max) {
1025
1026 /*
1027 * If last ACK falls within this segment's sequence numbers,
1028 * record the timestamp.
1029 * NOTE that the test is modified according to the latest
1030 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1031 */
1032 if ((to.to_flag & TOF_TS) != 0 &&
1033 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1034 tp->ts_recent_age = tcp_now;
1035 tp->ts_recent = to.to_tsval;
1036 }
1037
1038 if (tlen == 0) {
1039 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1040 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1041 tp->snd_cwnd >= tp->snd_wnd &&
1042 tp->t_dupacks < tcprexmtthresh) {
1043 /*
1044 * this is a pure ack for outstanding data.
1045 */
1046 ++tcpstat.tcps_predack;
1047 /*
1048 * "bad retransmit" recovery
1049 */
1050 if (tp->t_rxtshift == 1 &&
1051 tcp_now < tp->t_badrxtwin) {
1052 tp->snd_cwnd = tp->snd_cwnd_prev;
1053 tp->snd_ssthresh =
1054 tp->snd_ssthresh_prev;
1055 tp->snd_nxt = tp->snd_max;
1056 tp->t_badrxtwin = 0;
1057 }
1058 if (((to.to_flag & TOF_TS) != 0) && (to.to_tsecr != 0)) /* Makes sure we already have a TS */
1059 tcp_xmit_timer(tp,
1060 tcp_now - to.to_tsecr + 1);
1061 else if (tp->t_rtttime &&
1062 SEQ_GT(th->th_ack, tp->t_rtseq))
1063 tcp_xmit_timer(tp, tp->t_rtttime);
1064 acked = th->th_ack - tp->snd_una;
1065 tcpstat.tcps_rcvackpack++;
1066 tcpstat.tcps_rcvackbyte += acked;
1067 sbdrop(&so->so_snd, acked);
1068 tp->snd_una = th->th_ack;
1069 m_freem(m);
1070 ND6_HINT(tp); /* some progress has been done */
1071
1072 /*
1073 * If all outstanding data are acked, stop
1074 * retransmit timer, otherwise restart timer
1075 * using current (possibly backed-off) value.
1076 * If process is waiting for space,
1077 * wakeup/selwakeup/signal. If data
1078 * are ready to send, let tcp_output
1079 * decide between more output or persist.
1080 */
1081 if (tp->snd_una == tp->snd_max)
1082 tp->t_timer[TCPT_REXMT] = 0;
1083 else if (tp->t_timer[TCPT_PERSIST] == 0)
1084 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1085
1086 if (so->so_snd.sb_cc)
1087 (void) tcp_output(tp);
1088 sowwakeup(so);
1089 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1090 return;
1091 }
1092 } else if (th->th_ack == tp->snd_una &&
1093 LIST_EMPTY(&tp->t_segq) &&
1094 tlen <= sbspace(&so->so_rcv)) {
1095 /*
1096 * this is a pure, in-sequence data packet
1097 * with nothing on the reassembly queue and
1098 * we have enough buffer space to take it.
1099 */
1100 ++tcpstat.tcps_preddat;
1101 tp->rcv_nxt += tlen;
1102 tcpstat.tcps_rcvpack++;
1103 tcpstat.tcps_rcvbyte += tlen;
1104 ND6_HINT(tp); /* some progress has been done */
1105 /*
1106 * Add data to socket buffer.
1107 */
1108 m_adj(m, drop_hdrlen); /* delayed header drop */
1109 sbappend(&so->so_rcv, m);
1110 #if INET6
1111 if (isipv6) {
1112 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1113 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1114 th->th_seq, th->th_ack, th->th_win);
1115 }
1116 else
1117 #endif
1118 {
1119 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1120 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1121 th->th_seq, th->th_ack, th->th_win);
1122 }
1123 if (tcp_delack_enabled) {
1124 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
1125 tp->t_flags |= TF_DELACK;
1126 } else {
1127 tp->t_flags |= TF_ACKNOW;
1128 tcp_output(tp);
1129 }
1130 sorwakeup(so);
1131 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1132 return;
1133 }
1134 }
1135
1136 /*
1137 * Calculate amount of space in receive window,
1138 * and then do TCP input processing.
1139 * Receive window is amount of space in rcv queue,
1140 * but not less than advertised window.
1141 */
1142 { int win;
1143
1144 win = sbspace(&so->so_rcv);
1145 if (win < 0)
1146 win = 0;
1147 else { /* clip rcv window to 4K for modems */
1148 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
1149 win = min(win, slowlink_wsize);
1150 }
1151 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1152 }
1153
1154 switch (tp->t_state) {
1155
1156 /*
1157 * If the state is LISTEN then ignore segment if it contains an RST.
1158 * If the segment contains an ACK then it is bad and send a RST.
1159 * If it does not contain a SYN then it is not interesting; drop it.
1160 * If it is from this socket, drop it, it must be forged.
1161 * Don't bother responding if the destination was a broadcast.
1162 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
1163 * tp->iss, and send a segment:
1164 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
1165 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
1166 * Fill in remote peer address fields if not previously specified.
1167 * Enter SYN_RECEIVED state, and process any other fields of this
1168 * segment in this state.
1169 */
1170 case TCPS_LISTEN: {
1171 register struct sockaddr_in *sin;
1172 #if INET6
1173 register struct sockaddr_in6 *sin6;
1174 #endif
1175
1176 if (thflags & TH_RST)
1177 goto drop;
1178 if (thflags & TH_ACK) {
1179 rstreason = BANDLIM_RST_OPENPORT;
1180 goto dropwithreset;
1181 }
1182 if ((thflags & TH_SYN) == 0)
1183 goto drop;
1184 if (th->th_dport == th->th_sport) {
1185 #if INET6
1186 if (isipv6) {
1187 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1188 &ip6->ip6_src))
1189 goto drop;
1190 } else
1191 #endif /* INET6 */
1192 if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1193 goto drop;
1194 }
1195 /*
1196 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1197 * in_broadcast() should never return true on a received
1198 * packet with M_BCAST not set.
1199 *
1200 * Packets with a multicast source address should also
1201 * be discarded.
1202 */
1203 if (m->m_flags & (M_BCAST|M_MCAST))
1204 goto drop;
1205 #if INET6
1206 if (isipv6) {
1207 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1208 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1209 goto drop;
1210 } else
1211 #endif
1212 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1213 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1214 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
1215 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1216 goto drop;
1217 #if INET6
1218 if (isipv6) {
1219 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
1220 M_SONAME, M_NOWAIT);
1221 if (sin6 == NULL)
1222 goto drop;
1223 bzero(sin6, sizeof(*sin6));
1224 sin6->sin6_family = AF_INET6;
1225 sin6->sin6_len = sizeof(*sin6);
1226 sin6->sin6_addr = ip6->ip6_src;
1227 sin6->sin6_port = th->th_sport;
1228 laddr6 = inp->in6p_laddr;
1229 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1230 inp->in6p_laddr = ip6->ip6_dst;
1231 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
1232 proc0)) {
1233 inp->in6p_laddr = laddr6;
1234 FREE(sin6, M_SONAME);
1235 goto drop;
1236 }
1237 FREE(sin6, M_SONAME);
1238 } else
1239 #endif
1240 {
1241 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
1242 M_NOWAIT);
1243 if (sin == NULL)
1244 goto drop;
1245 sin->sin_family = AF_INET;
1246 sin->sin_len = sizeof(*sin);
1247 sin->sin_addr = ip->ip_src;
1248 sin->sin_port = th->th_sport;
1249 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
1250 laddr = inp->inp_laddr;
1251 if (inp->inp_laddr.s_addr == INADDR_ANY)
1252 inp->inp_laddr = ip->ip_dst;
1253 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0)) {
1254 inp->inp_laddr = laddr;
1255 FREE(sin, M_SONAME);
1256 goto drop;
1257 }
1258 FREE(sin, M_SONAME);
1259 }
1260 if ((taop = tcp_gettaocache(inp)) == NULL) {
1261 taop = &tao_noncached;
1262 bzero(taop, sizeof(*taop));
1263 }
1264 tcp_dooptions(tp, optp, optlen, th, &to);
1265 if (iss)
1266 tp->iss = iss;
1267 else {
1268 tp->iss = tcp_new_isn(tp);
1269 }
1270 tp->irs = th->th_seq;
1271 tcp_sendseqinit(tp);
1272 tcp_rcvseqinit(tp);
1273 tp->snd_recover = tp->snd_una;
1274 /*
1275 * Initialization of the tcpcb for transaction;
1276 * set SND.WND = SEG.WND,
1277 * initialize CCsend and CCrecv.
1278 */
1279 tp->snd_wnd = tiwin; /* initial send-window */
1280 tp->cc_send = CC_INC(tcp_ccgen);
1281 tp->cc_recv = to.to_cc;
1282 /*
1283 * Perform TAO test on incoming CC (SEG.CC) option, if any.
1284 * - compare SEG.CC against cached CC from the same host,
1285 * if any.
1286 * - if SEG.CC > cached value, SYN must be new and is accepted
1287 * immediately: save new CC in the cache, mark the socket
1288 * connected, enter ESTABLISHED state, turn on flag to
1289 * send a SYN in the next segment.
1290 * A virtual advertised window is set in rcv_adv to
1291 * initialize SWS prevention. Then enter normal segment
1292 * processing: drop SYN, process data and FIN.
1293 * - otherwise do a normal 3-way handshake.
1294 */
1295 if ((to.to_flag & TOF_CC) != 0) {
1296 if (((tp->t_flags & TF_NOPUSH) != 0) &&
1297 taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) {
1298
1299 taop->tao_cc = to.to_cc;
1300
1301 tp->t_state = TCPS_ESTABLISHED;
1302
1303 /*
1304 * If there is a FIN, or if there is data and the
1305 * connection is local, then delay SYN,ACK(SYN) in
1306 * the hope of piggy-backing it on a response
1307 * segment. Otherwise must send ACK now in case
1308 * the other side is slow starting.
1309 */
1310 if (tcp_delack_enabled && ((thflags & TH_FIN) ||
1311 (tlen != 0 &&
1312 #if INET6
1313 (isipv6 && in6_localaddr(&inp->in6p_faddr))
1314 ||
1315 (!isipv6 &&
1316 #endif /* INET6 */
1317 in_localaddr(inp->inp_faddr)
1318 #if INET6
1319 )
1320 #endif /* INET6 */
1321 ))) {
1322 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
1323 tp->t_flags |= (TF_DELACK | TF_NEEDSYN);
1324 }
1325 else
1326 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1327
1328 /*
1329 * Limit the `virtual advertised window' to TCP_MAXWIN
1330 * here. Even if we requested window scaling, it will
1331 * become effective only later when our SYN is acked.
1332 */
1333 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) /* clip window size for for slow link */
1334 tp->rcv_adv += min(tp->rcv_wnd, slowlink_wsize);
1335 else
1336 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN);
1337 tcpstat.tcps_connects++;
1338 soisconnected(so);
1339 tp->t_timer[TCPT_KEEP] = tcp_keepinit;
1340 dropsocket = 0; /* committed to socket */
1341 tcpstat.tcps_accepts++;
1342 goto trimthenstep6;
1343 }
1344 /* else do standard 3-way handshake */
1345 } else {
1346 /*
1347 * No CC option, but maybe CC.NEW:
1348 * invalidate cached value.
1349 */
1350 taop->tao_cc = 0;
1351 }
1352 /*
1353 * TAO test failed or there was no CC option,
1354 * do a standard 3-way handshake.
1355 */
1356 tp->t_flags |= TF_ACKNOW;
1357 tp->t_state = TCPS_SYN_RECEIVED;
1358 tp->t_timer[TCPT_KEEP] = tcp_keepinit;
1359 dropsocket = 0; /* committed to socket */
1360 tcpstat.tcps_accepts++;
1361 goto trimthenstep6;
1362 }
1363
1364 /*
1365 * If the state is SYN_RECEIVED:
1366 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
1367 */
1368 case TCPS_SYN_RECEIVED:
1369 if ((thflags & TH_ACK) &&
1370 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1371 SEQ_GT(th->th_ack, tp->snd_max))) {
1372 rstreason = BANDLIM_RST_OPENPORT;
1373 goto dropwithreset;
1374 }
1375 break;
1376
1377 /*
1378 * If the state is SYN_SENT:
1379 * if seg contains an ACK, but not for our SYN, drop the input.
1380 * if seg contains a RST, then drop the connection.
1381 * if seg does not contain SYN, then drop it.
1382 * Otherwise this is an acceptable SYN segment
1383 * initialize tp->rcv_nxt and tp->irs
1384 * if seg contains ack then advance tp->snd_una
1385 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1386 * arrange for segment to be acked (eventually)
1387 * continue processing rest of data/controls, beginning with URG
1388 */
1389 case TCPS_SYN_SENT:
1390 if ((taop = tcp_gettaocache(inp)) == NULL) {
1391 taop = &tao_noncached;
1392 bzero(taop, sizeof(*taop));
1393 }
1394
1395 if ((thflags & TH_ACK) &&
1396 (SEQ_LEQ(th->th_ack, tp->iss) ||
1397 SEQ_GT(th->th_ack, tp->snd_max))) {
1398 /*
1399 * If we have a cached CCsent for the remote host,
1400 * hence we haven't just crashed and restarted,
1401 * do not send a RST. This may be a retransmission
1402 * from the other side after our earlier ACK was lost.
1403 * Our new SYN, when it arrives, will serve as the
1404 * needed ACK.
1405 */
1406 if (taop->tao_ccsent != 0)
1407 goto drop;
1408 else {
1409 rstreason = BANDLIM_UNLIMITED;
1410 goto dropwithreset;
1411 }
1412 }
1413 if (thflags & TH_RST) {
1414 if (thflags & TH_ACK) {
1415 tp = tcp_drop(tp, ECONNREFUSED);
1416 postevent(so, 0, EV_RESET);
1417 }
1418 goto drop;
1419 }
1420 if ((thflags & TH_SYN) == 0)
1421 goto drop;
1422 tp->snd_wnd = th->th_win; /* initial send window */
1423 tp->cc_recv = to.to_cc; /* foreign CC */
1424
1425 tp->irs = th->th_seq;
1426 tcp_rcvseqinit(tp);
1427 if (thflags & TH_ACK) {
1428 /*
1429 * Our SYN was acked. If segment contains CC.ECHO
1430 * option, check it to make sure this segment really
1431 * matches our SYN. If not, just drop it as old
1432 * duplicate, but send an RST if we're still playing
1433 * by the old rules. If no CC.ECHO option, make sure
1434 * we don't get fooled into using T/TCP.
1435 */
1436 if (to.to_flag & TOF_CCECHO) {
1437 if (tp->cc_send != to.to_ccecho) {
1438 if (taop->tao_ccsent != 0)
1439 goto drop;
1440 else {
1441 rstreason = BANDLIM_UNLIMITED;
1442 goto dropwithreset;
1443 }
1444 }
1445 } else
1446 tp->t_flags &= ~TF_RCVD_CC;
1447 tcpstat.tcps_connects++;
1448 soisconnected(so);
1449 /* Do window scaling on this connection? */
1450 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1451 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1452 tp->snd_scale = tp->requested_s_scale;
1453 tp->rcv_scale = tp->request_r_scale;
1454 }
1455 /* Segment is acceptable, update cache if undefined. */
1456 if (taop->tao_ccsent == 0)
1457 taop->tao_ccsent = to.to_ccecho;
1458
1459 tp->rcv_adv += tp->rcv_wnd;
1460 tp->snd_una++; /* SYN is acked */
1461 /*
1462 * If there's data, delay ACK; if there's also a FIN
1463 * ACKNOW will be turned on later.
1464 */
1465 if (tcp_delack_enabled && tlen != 0) {
1466 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
1467 tp->t_flags |= TF_DELACK;
1468 }
1469 else
1470 tp->t_flags |= TF_ACKNOW;
1471 /*
1472 * Received <SYN,ACK> in SYN_SENT[*] state.
1473 * Transitions:
1474 * SYN_SENT --> ESTABLISHED
1475 * SYN_SENT* --> FIN_WAIT_1
1476 */
1477 if (tp->t_flags & TF_NEEDFIN) {
1478 tp->t_state = TCPS_FIN_WAIT_1;
1479 tp->t_flags &= ~TF_NEEDFIN;
1480 thflags &= ~TH_SYN;
1481 } else {
1482 tp->t_state = TCPS_ESTABLISHED;
1483 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
1484 }
1485 } else {
1486 /*
1487 * Received initial SYN in SYN-SENT[*] state => simul-
1488 * taneous open. If segment contains CC option and there is
1489 * a cached CC, apply TAO test; if it succeeds, connection is
1490 * half-synchronized. Otherwise, do 3-way handshake:
1491 * SYN-SENT -> SYN-RECEIVED
1492 * SYN-SENT* -> SYN-RECEIVED*
1493 * If there was no CC option, clear cached CC value.
1494 */
1495 tp->t_flags |= TF_ACKNOW;
1496 tp->t_timer[TCPT_REXMT] = 0;
1497 if (to.to_flag & TOF_CC) {
1498 if (taop->tao_cc != 0 &&
1499 CC_GT(to.to_cc, taop->tao_cc)) {
1500 /*
1501 * update cache and make transition:
1502 * SYN-SENT -> ESTABLISHED*
1503 * SYN-SENT* -> FIN-WAIT-1*
1504 */
1505 taop->tao_cc = to.to_cc;
1506 if (tp->t_flags & TF_NEEDFIN) {
1507 tp->t_state = TCPS_FIN_WAIT_1;
1508 tp->t_flags &= ~TF_NEEDFIN;
1509 } else {
1510 tp->t_state = TCPS_ESTABLISHED;
1511 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
1512 }
1513 tp->t_flags |= TF_NEEDSYN;
1514 } else
1515 tp->t_state = TCPS_SYN_RECEIVED;
1516 } else {
1517 /* CC.NEW or no option => invalidate cache */
1518 taop->tao_cc = 0;
1519 tp->t_state = TCPS_SYN_RECEIVED;
1520 }
1521 }
1522
1523 trimthenstep6:
1524 /*
1525 * Advance th->th_seq to correspond to first data byte.
1526 * If data, trim to stay within window,
1527 * dropping FIN if necessary.
1528 */
1529 th->th_seq++;
1530 if (tlen > tp->rcv_wnd) {
1531 todrop = tlen - tp->rcv_wnd;
1532 m_adj(m, -todrop);
1533 tlen = tp->rcv_wnd;
1534 thflags &= ~TH_FIN;
1535 tcpstat.tcps_rcvpackafterwin++;
1536 tcpstat.tcps_rcvbyteafterwin += todrop;
1537 }
1538 tp->snd_wl1 = th->th_seq - 1;
1539 tp->rcv_up = th->th_seq;
1540 /*
1541 * Client side of transaction: already sent SYN and data.
1542 * If the remote host used T/TCP to validate the SYN,
1543 * our data will be ACK'd; if so, enter normal data segment
1544 * processing in the middle of step 5, ack processing.
1545 * Otherwise, goto step 6.
1546 */
1547 if (thflags & TH_ACK)
1548 goto process_ACK;
1549 goto step6;
1550 /*
1551 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1552 * if segment contains a SYN and CC [not CC.NEW] option:
1553 * if state == TIME_WAIT and connection duration > MSL,
1554 * drop packet and send RST;
1555 *
1556 * if SEG.CC > CCrecv then is new SYN, and can implicitly
1557 * ack the FIN (and data) in retransmission queue.
1558 * Complete close and delete TCPCB. Then reprocess
1559 * segment, hoping to find new TCPCB in LISTEN state;
1560 *
1561 * else must be old SYN; drop it.
1562 * else do normal processing.
1563 */
1564 case TCPS_LAST_ACK:
1565 case TCPS_CLOSING:
1566 case TCPS_TIME_WAIT:
1567 if ((thflags & TH_SYN) &&
1568 (to.to_flag & TOF_CC) && tp->cc_recv != 0) {
1569 if (tp->t_state == TCPS_TIME_WAIT &&
1570 tp->t_starttime > tcp_msl) {
1571 rstreason = BANDLIM_UNLIMITED;
1572 goto dropwithreset;
1573 }
1574 if (CC_GT(to.to_cc, tp->cc_recv)) {
1575 tp = tcp_close(tp);
1576 goto findpcb;
1577 }
1578 else
1579 goto drop;
1580 }
1581 break; /* continue normal processing */
1582 }
1583
1584 /*
1585 * States other than LISTEN or SYN_SENT.
1586 * First check the RST flag and sequence number since reset segments
1587 * are exempt from the timestamp and connection count tests. This
1588 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1589 * below which allowed reset segments in half the sequence space
1590 * to fall though and be processed (which gives forged reset
1591 * segments with a random sequence number a 50 percent chance of
1592 * killing a connection).
1593 * Then check timestamp, if present.
1594 * Then check the connection count, if present.
1595 * Then check that at least some bytes of segment are within
1596 * receive window. If segment begins before rcv_nxt,
1597 * drop leading data (and SYN); if nothing left, just ack.
1598 *
1599 *
1600 * If the RST bit is set, check the sequence number to see
1601 * if this is a valid reset segment.
1602 * RFC 793 page 37:
1603 * In all states except SYN-SENT, all reset (RST) segments
1604 * are validated by checking their SEQ-fields. A reset is
1605 * valid if its sequence number is in the window.
1606 * Note: this does not take into account delayed ACKs, so
1607 * we should test against last_ack_sent instead of rcv_nxt.
1608 * The sequence number in the reset segment is normally an
1609 * echo of our outgoing acknowledgement numbers, but some hosts
1610 * send a reset with the sequence number at the rightmost edge
1611 * of our receive window, and we have to handle this case.
1612 * If we have multiple segments in flight, the initial reset
1613 * segment sequence numbers will be to the left of last_ack_sent,
1614 * but they will eventually catch up.
1615 * In any case, it never made sense to trim reset segments to
1616 * fit the receive window since RFC 1122 says:
1617 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
1618 *
1619 * A TCP SHOULD allow a received RST segment to include data.
1620 *
1621 * DISCUSSION
1622 * It has been suggested that a RST segment could contain
1623 * ASCII text that encoded and explained the cause of the
1624 * RST. No standard has yet been established for such
1625 * data.
1626 *
1627 * If the reset segment passes the sequence number test examine
1628 * the state:
1629 * SYN_RECEIVED STATE:
1630 * If passive open, return to LISTEN state.
1631 * If active open, inform user that connection was refused.
1632 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1633 * Inform user that connection was reset, and close tcb.
1634 * CLOSING, LAST_ACK STATES:
1635 * Close the tcb.
1636 * TIME_WAIT STATE:
1637 * Drop the segment - see Stevens, vol. 2, p. 964 and
1638 * RFC 1337.
1639 */
1640 if (thflags & TH_RST) {
1641 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1642 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1643 switch (tp->t_state) {
1644
1645 case TCPS_SYN_RECEIVED:
1646 so->so_error = ECONNREFUSED;
1647 goto close;
1648
1649 case TCPS_ESTABLISHED:
1650 case TCPS_FIN_WAIT_1:
1651 case TCPS_CLOSE_WAIT:
1652 /*
1653 Drop through ...
1654 */
1655 case TCPS_FIN_WAIT_2:
1656 so->so_error = ECONNRESET;
1657 close:
1658 postevent(so, 0, EV_RESET);
1659 tp->t_state = TCPS_CLOSED;
1660 tcpstat.tcps_drops++;
1661 tp = tcp_close(tp);
1662 break;
1663
1664 case TCPS_CLOSING:
1665 case TCPS_LAST_ACK:
1666 tp = tcp_close(tp);
1667 break;
1668
1669 case TCPS_TIME_WAIT:
1670 break;
1671 }
1672 }
1673 goto drop;
1674 }
1675
1676 /*
1677 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1678 * and it's less than ts_recent, drop it.
1679 */
1680 if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent &&
1681 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1682
1683 /* Check to see if ts_recent is over 24 days old. */
1684 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1685 /*
1686 * Invalidate ts_recent. If this segment updates
1687 * ts_recent, the age will be reset later and ts_recent
1688 * will get a valid value. If it does not, setting
1689 * ts_recent to zero will at least satisfy the
1690 * requirement that zero be placed in the timestamp
1691 * echo reply when ts_recent isn't valid. The
1692 * age isn't reset until we get a valid ts_recent
1693 * because we don't want out-of-order segments to be
1694 * dropped when ts_recent is old.
1695 */
1696 tp->ts_recent = 0;
1697 } else {
1698 tcpstat.tcps_rcvduppack++;
1699 tcpstat.tcps_rcvdupbyte += tlen;
1700 tcpstat.tcps_pawsdrop++;
1701 goto dropafterack;
1702 }
1703 }
1704
1705 /*
1706 * T/TCP mechanism
1707 * If T/TCP was negotiated and the segment doesn't have CC,
1708 * or if its CC is wrong then drop the segment.
1709 * RST segments do not have to comply with this.
1710 */
1711 if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
1712 ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
1713 goto dropafterack;
1714
1715 /*
1716 * In the SYN-RECEIVED state, validate that the packet belongs to
1717 * this connection before trimming the data to fit the receive
1718 * window. Check the sequence number versus IRS since we know
1719 * the sequence numbers haven't wrapped. This is a partial fix
1720 * for the "LAND" DoS attack.
1721 */
1722 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1723 rstreason = BANDLIM_RST_OPENPORT;
1724 goto dropwithreset;
1725 }
1726
1727 todrop = tp->rcv_nxt - th->th_seq;
1728 if (todrop > 0) {
1729 if (thflags & TH_SYN) {
1730 thflags &= ~TH_SYN;
1731 th->th_seq++;
1732 if (th->th_urp > 1)
1733 th->th_urp--;
1734 else
1735 thflags &= ~TH_URG;
1736 todrop--;
1737 }
1738 /*
1739 * Following if statement from Stevens, vol. 2, p. 960.
1740 */
1741 if (todrop > tlen
1742 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1743 /*
1744 * Any valid FIN must be to the left of the window.
1745 * At this point the FIN must be a duplicate or out
1746 * of sequence; drop it.
1747 */
1748 thflags &= ~TH_FIN;
1749
1750 /*
1751 * Send an ACK to resynchronize and drop any data.
1752 * But keep on processing for RST or ACK.
1753 */
1754 tp->t_flags |= TF_ACKNOW;
1755 todrop = tlen;
1756 tcpstat.tcps_rcvduppack++;
1757 tcpstat.tcps_rcvdupbyte += todrop;
1758 } else {
1759 tcpstat.tcps_rcvpartduppack++;
1760 tcpstat.tcps_rcvpartdupbyte += todrop;
1761 }
1762 drop_hdrlen += todrop; /* drop from the top afterwards */
1763 th->th_seq += todrop;
1764 tlen -= todrop;
1765 if (th->th_urp > todrop)
1766 th->th_urp -= todrop;
1767 else {
1768 thflags &= ~TH_URG;
1769 th->th_urp = 0;
1770 }
1771 }
1772
1773 /*
1774 * If new data are received on a connection after the
1775 * user processes are gone, then RST the other end.
1776 */
1777 if ((so->so_state & SS_NOFDREF) &&
1778 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1779 tp = tcp_close(tp);
1780 tcpstat.tcps_rcvafterclose++;
1781 rstreason = BANDLIM_UNLIMITED;
1782 goto dropwithreset;
1783 }
1784
1785 /*
1786 * If segment ends after window, drop trailing data
1787 * (and PUSH and FIN); if nothing left, just ACK.
1788 */
1789 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1790 if (todrop > 0) {
1791 tcpstat.tcps_rcvpackafterwin++;
1792 if (todrop >= tlen) {
1793 tcpstat.tcps_rcvbyteafterwin += tlen;
1794 /*
1795 * If a new connection request is received
1796 * while in TIME_WAIT, drop the old connection
1797 * and start over if the sequence numbers
1798 * are above the previous ones.
1799 */
1800 if (thflags & TH_SYN &&
1801 tp->t_state == TCPS_TIME_WAIT &&
1802 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1803 iss = tcp_new_isn(tp);
1804 tp = tcp_close(tp);
1805 goto findpcb;
1806 }
1807 /*
1808 * If window is closed can only take segments at
1809 * window edge, and have to drop data and PUSH from
1810 * incoming segments. Continue processing, but
1811 * remember to ack. Otherwise, drop segment
1812 * and ack.
1813 */
1814 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1815 tp->t_flags |= TF_ACKNOW;
1816 tcpstat.tcps_rcvwinprobe++;
1817 } else
1818 goto dropafterack;
1819 } else
1820 tcpstat.tcps_rcvbyteafterwin += todrop;
1821 m_adj(m, -todrop);
1822 tlen -= todrop;
1823 thflags &= ~(TH_PUSH|TH_FIN);
1824 }
1825
1826 /*
1827 * If last ACK falls within this segment's sequence numbers,
1828 * record its timestamp.
1829 * NOTE that the test is modified according to the latest
1830 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1831 */
1832 if ((to.to_flag & TOF_TS) != 0 &&
1833 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1834 tp->ts_recent_age = tcp_now;
1835 tp->ts_recent = to.to_tsval;
1836 }
1837
1838 /*
1839 * If a SYN is in the window, then this is an
1840 * error and we send an RST and drop the connection.
1841 */
1842 if (thflags & TH_SYN) {
1843 tp = tcp_drop(tp, ECONNRESET);
1844 rstreason = BANDLIM_UNLIMITED;
1845 postevent(so, 0, EV_RESET);
1846 goto dropwithreset;
1847 }
1848
1849 /*
1850 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
1851 * flag is on (half-synchronized state), then queue data for
1852 * later processing; else drop segment and return.
1853 */
1854 if ((thflags & TH_ACK) == 0) {
1855 if (tp->t_state == TCPS_SYN_RECEIVED ||
1856 (tp->t_flags & TF_NEEDSYN))
1857 goto step6;
1858 else
1859 goto drop;
1860 }
1861
1862 /*
1863 * Ack processing.
1864 */
1865 switch (tp->t_state) {
1866
1867 /*
1868 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1869 * ESTABLISHED state and continue processing.
1870 * The ACK was checked above.
1871 */
1872 case TCPS_SYN_RECEIVED:
1873
1874 tcpstat.tcps_connects++;
1875 soisconnected(so);
1876
1877 /* Do window scaling? */
1878 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1879 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1880 tp->snd_scale = tp->requested_s_scale;
1881 tp->rcv_scale = tp->request_r_scale;
1882 }
1883 /*
1884 * Upon successful completion of 3-way handshake,
1885 * update cache.CC if it was undefined, pass any queued
1886 * data to the user, and advance state appropriately.
1887 */
1888 if ((taop = tcp_gettaocache(inp)) != NULL &&
1889 taop->tao_cc == 0)
1890 taop->tao_cc = tp->cc_recv;
1891
1892 /*
1893 * Make transitions:
1894 * SYN-RECEIVED -> ESTABLISHED
1895 * SYN-RECEIVED* -> FIN-WAIT-1
1896 */
1897 if (tp->t_flags & TF_NEEDFIN) {
1898 tp->t_state = TCPS_FIN_WAIT_1;
1899 tp->t_flags &= ~TF_NEEDFIN;
1900 } else {
1901 tp->t_state = TCPS_ESTABLISHED;
1902 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
1903 }
1904 /*
1905 * If segment contains data or ACK, will call tcp_reass()
1906 * later; if not, do so now to pass queued data to user.
1907 */
1908 if (tlen == 0 && (thflags & TH_FIN) == 0)
1909 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
1910 (struct mbuf *)0);
1911 tp->snd_wl1 = th->th_seq - 1;
1912 /* fall into ... */
1913
1914 /*
1915 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1916 * ACKs. If the ack is in the range
1917 * tp->snd_una < th->th_ack <= tp->snd_max
1918 * then advance tp->snd_una to th->th_ack and drop
1919 * data from the retransmission queue. If this ACK reflects
1920 * more up to date window information we update our window information.
1921 */
1922 case TCPS_ESTABLISHED:
1923 case TCPS_FIN_WAIT_1:
1924 case TCPS_FIN_WAIT_2:
1925 case TCPS_CLOSE_WAIT:
1926 case TCPS_CLOSING:
1927 case TCPS_LAST_ACK:
1928 case TCPS_TIME_WAIT:
1929
1930 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1931 if (tlen == 0 && tiwin == tp->snd_wnd) {
1932 tcpstat.tcps_rcvdupack++;
1933 /*
1934 * If we have outstanding data (other than
1935 * a window probe), this is a completely
1936 * duplicate ack (ie, window info didn't
1937 * change), the ack is the biggest we've
1938 * seen and we've seen exactly our rexmt
1939 * threshold of them, assume a packet
1940 * has been dropped and retransmit it.
1941 * Kludge snd_nxt & the congestion
1942 * window so we send only this one
1943 * packet.
1944 *
1945 * We know we're losing at the current
1946 * window size so do congestion avoidance
1947 * (set ssthresh to half the current window
1948 * and pull our congestion window back to
1949 * the new ssthresh).
1950 *
1951 * Dup acks mean that packets have left the
1952 * network (they're now cached at the receiver)
1953 * so bump cwnd by the amount in the receiver
1954 * to keep a constant cwnd packets in the
1955 * network.
1956 */
1957 if (tp->t_timer[TCPT_REXMT] == 0 ||
1958 th->th_ack != tp->snd_una)
1959 tp->t_dupacks = 0;
1960 else if (++tp->t_dupacks == tcprexmtthresh) {
1961 tcp_seq onxt = tp->snd_nxt;
1962 u_int win =
1963 min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1964 tp->t_maxseg;
1965 if (tcp_do_newreno && SEQ_LT(th->th_ack,
1966 tp->snd_recover)) {
1967 /* False retransmit, should not
1968 * cut window
1969 */
1970 tp->snd_cwnd += tp->t_maxseg;
1971 tp->t_dupacks = 0;
1972 (void) tcp_output(tp);
1973 goto drop;
1974 }
1975 if (win < 2)
1976 win = 2;
1977 tp->snd_ssthresh = win * tp->t_maxseg;
1978 tp->snd_recover = tp->snd_max;
1979 tp->t_timer[TCPT_REXMT] = 0;
1980 tp->t_rtttime = 0;
1981 tp->snd_nxt = th->th_ack;
1982 tp->snd_cwnd = tp->t_maxseg;
1983 (void) tcp_output(tp);
1984 tp->snd_cwnd = tp->snd_ssthresh +
1985 tp->t_maxseg * tp->t_dupacks;
1986 if (SEQ_GT(onxt, tp->snd_nxt))
1987 tp->snd_nxt = onxt;
1988 goto drop;
1989 } else if (tp->t_dupacks > tcprexmtthresh) {
1990 tp->snd_cwnd += tp->t_maxseg;
1991 (void) tcp_output(tp);
1992 goto drop;
1993 }
1994 } else
1995 tp->t_dupacks = 0;
1996 break;
1997 }
1998 /*
1999 * If the congestion window was inflated to account
2000 * for the other side's cached packets, retract it.
2001 */
2002 if (tcp_do_newreno == 0) {
2003 if (tp->t_dupacks >= tcprexmtthresh &&
2004 tp->snd_cwnd > tp->snd_ssthresh)
2005 tp->snd_cwnd = tp->snd_ssthresh;
2006 tp->t_dupacks = 0;
2007 } else if (tp->t_dupacks >= tcprexmtthresh &&
2008 !tcp_newreno(tp, th)) {
2009 /*
2010 * Window inflation should have left us with approx.
2011 * snd_ssthresh outstanding data. But in case we
2012 * would be inclined to send a burst, better to do
2013 * it via the slow start mechanism.
2014 */
2015 if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
2016 tp->snd_cwnd =
2017 tp->snd_max - th->th_ack + tp->t_maxseg;
2018 else
2019 tp->snd_cwnd = tp->snd_ssthresh;
2020 tp->t_dupacks = 0;
2021 }
2022
2023 if (tp->t_dupacks < tcprexmtthresh)
2024 tp->t_dupacks = 0;
2025
2026 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2027 tcpstat.tcps_rcvacktoomuch++;
2028 goto dropafterack;
2029 }
2030 /*
2031 * If we reach this point, ACK is not a duplicate,
2032 * i.e., it ACKs something we sent.
2033 */
2034 if (tp->t_flags & TF_NEEDSYN) {
2035 /*
2036 * T/TCP: Connection was half-synchronized, and our
2037 * SYN has been ACK'd (so connection is now fully
2038 * synchronized). Go to non-starred state,
2039 * increment snd_una for ACK of SYN, and check if
2040 * we can do window scaling.
2041 */
2042 tp->t_flags &= ~TF_NEEDSYN;
2043 tp->snd_una++;
2044 /* Do window scaling? */
2045 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2046 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2047 tp->snd_scale = tp->requested_s_scale;
2048 tp->rcv_scale = tp->request_r_scale;
2049 }
2050 }
2051
2052 process_ACK:
2053 acked = th->th_ack - tp->snd_una;
2054 tcpstat.tcps_rcvackpack++;
2055 tcpstat.tcps_rcvackbyte += acked;
2056
2057 /*
2058 * If we just performed our first retransmit, and the ACK
2059 * arrives within our recovery window, then it was a mistake
2060 * to do the retransmit in the first place. Recover our
2061 * original cwnd and ssthresh, and proceed to transmit where
2062 * we left off.
2063 */
2064 if (tp->t_rxtshift == 1 && tcp_now < tp->t_badrxtwin) {
2065 tp->snd_cwnd = tp->snd_cwnd_prev;
2066 tp->snd_ssthresh = tp->snd_ssthresh_prev;
2067 tp->snd_nxt = tp->snd_max;
2068 tp->t_badrxtwin = 0; /* XXX probably not required */
2069 }
2070
2071 /*
2072 * If we have a timestamp reply, update smoothed
2073 * round trip time. If no timestamp is present but
2074 * transmit timer is running and timed sequence
2075 * number was acked, update smoothed round trip time.
2076 * Since we now have an rtt measurement, cancel the
2077 * timer backoff (cf., Phil Karn's retransmit alg.).
2078 * Recompute the initial retransmit timer.
2079 * Also makes sure we have a valid time stamp in hand
2080 */
2081 if (((to.to_flag & TOF_TS) != 0) && (to.to_tsecr != 0))
2082 tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1);
2083 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2084 tcp_xmit_timer(tp, tp->t_rtttime);
2085
2086 /*
2087 * If all outstanding data is acked, stop retransmit
2088 * timer and remember to restart (more output or persist).
2089 * If there is more data to be acked, restart retransmit
2090 * timer, using current (possibly backed-off) value.
2091 */
2092 if (th->th_ack == tp->snd_max) {
2093 tp->t_timer[TCPT_REXMT] = 0;
2094 needoutput = 1;
2095 } else if (tp->t_timer[TCPT_PERSIST] == 0)
2096 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
2097
2098 /*
2099 * If no data (only SYN) was ACK'd,
2100 * skip rest of ACK processing.
2101 */
2102 if (acked == 0)
2103 goto step6;
2104
2105 /*
2106 * When new data is acked, open the congestion window.
2107 * If the window gives us less than ssthresh packets
2108 * in flight, open exponentially (maxseg per packet).
2109 * Otherwise open linearly: maxseg per window
2110 * (maxseg^2 / cwnd per packet).
2111 */
2112 {
2113 register u_int cw = tp->snd_cwnd;
2114 register u_int incr = tp->t_maxseg;
2115
2116 if (cw > tp->snd_ssthresh)
2117 incr = incr * incr / cw;
2118 /*
2119 * If t_dupacks != 0 here, it indicates that we are still
2120 * in NewReno fast recovery mode, so we leave the congestion
2121 * window alone.
2122 */
2123 if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
2124 tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
2125 }
2126 if (acked > so->so_snd.sb_cc) {
2127 tp->snd_wnd -= so->so_snd.sb_cc;
2128 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2129 ourfinisacked = 1;
2130 } else {
2131 sbdrop(&so->so_snd, acked);
2132 tp->snd_wnd -= acked;
2133 ourfinisacked = 0;
2134 }
2135 tp->snd_una = th->th_ack;
2136 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2137 tp->snd_nxt = tp->snd_una;
2138 sowwakeup(so);
2139
2140 switch (tp->t_state) {
2141
2142 /*
2143 * In FIN_WAIT_1 STATE in addition to the processing
2144 * for the ESTABLISHED state if our FIN is now acknowledged
2145 * then enter FIN_WAIT_2.
2146 */
2147 case TCPS_FIN_WAIT_1:
2148 if (ourfinisacked) {
2149 /*
2150 * If we can't receive any more
2151 * data, then closing user can proceed.
2152 * Starting the timer is contrary to the
2153 * specification, but if we don't get a FIN
2154 * we'll hang forever.
2155 */
2156 if (so->so_state & SS_CANTRCVMORE) {
2157 soisdisconnected(so);
2158 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
2159 }
2160 add_to_time_wait(tp);
2161 tp->t_state = TCPS_FIN_WAIT_2;
2162 }
2163 break;
2164
2165 /*
2166 * In CLOSING STATE in addition to the processing for
2167 * the ESTABLISHED state if the ACK acknowledges our FIN
2168 * then enter the TIME-WAIT state, otherwise ignore
2169 * the segment.
2170 */
2171 case TCPS_CLOSING:
2172 if (ourfinisacked) {
2173 tp->t_state = TCPS_TIME_WAIT;
2174 tcp_canceltimers(tp);
2175 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2176 if (tp->cc_recv != 0 &&
2177 tp->t_starttime < tcp_msl)
2178 tp->t_timer[TCPT_2MSL] =
2179 tp->t_rxtcur * TCPTV_TWTRUNC;
2180 else
2181 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2182 add_to_time_wait(tp);
2183 soisdisconnected(so);
2184 }
2185 break;
2186
2187 /*
2188 * In LAST_ACK, we may still be waiting for data to drain
2189 * and/or to be acked, as well as for the ack of our FIN.
2190 * If our FIN is now acknowledged, delete the TCB,
2191 * enter the closed state and return.
2192 */
2193 case TCPS_LAST_ACK:
2194 if (ourfinisacked) {
2195 tp = tcp_close(tp);
2196 goto drop;
2197 }
2198 break;
2199
2200 /*
2201 * In TIME_WAIT state the only thing that should arrive
2202 * is a retransmission of the remote FIN. Acknowledge
2203 * it and restart the finack timer.
2204 */
2205 case TCPS_TIME_WAIT:
2206 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2207 add_to_time_wait(tp);
2208 goto dropafterack;
2209 }
2210 }
2211
2212 step6:
2213 /*
2214 * Update window information.
2215 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2216 */
2217 if ((thflags & TH_ACK) &&
2218 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2219 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2220 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2221 /* keep track of pure window updates */
2222 if (tlen == 0 &&
2223 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2224 tcpstat.tcps_rcvwinupd++;
2225 tp->snd_wnd = tiwin;
2226 tp->snd_wl1 = th->th_seq;
2227 tp->snd_wl2 = th->th_ack;
2228 if (tp->snd_wnd > tp->max_sndwnd)
2229 tp->max_sndwnd = tp->snd_wnd;
2230 needoutput = 1;
2231 }
2232
2233 /*
2234 * Process segments with URG.
2235 */
2236 if ((thflags & TH_URG) && th->th_urp &&
2237 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2238 /*
2239 * This is a kludge, but if we receive and accept
2240 * random urgent pointers, we'll crash in
2241 * soreceive. It's hard to imagine someone
2242 * actually wanting to send this much urgent data.
2243 */
2244 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2245 th->th_urp = 0; /* XXX */
2246 thflags &= ~TH_URG; /* XXX */
2247 goto dodata; /* XXX */
2248 }
2249 /*
2250 * If this segment advances the known urgent pointer,
2251 * then mark the data stream. This should not happen
2252 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2253 * a FIN has been received from the remote side.
2254 * In these states we ignore the URG.
2255 *
2256 * According to RFC961 (Assigned Protocols),
2257 * the urgent pointer points to the last octet
2258 * of urgent data. We continue, however,
2259 * to consider it to indicate the first octet
2260 * of data past the urgent section as the original
2261 * spec states (in one of two places).
2262 */
2263 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2264 tp->rcv_up = th->th_seq + th->th_urp;
2265 so->so_oobmark = so->so_rcv.sb_cc +
2266 (tp->rcv_up - tp->rcv_nxt) - 1;
2267 if (so->so_oobmark == 0) {
2268 so->so_state |= SS_RCVATMARK;
2269 postevent(so, 0, EV_OOB);
2270 }
2271 sohasoutofband(so);
2272 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2273 }
2274 /*
2275 * Remove out of band data so doesn't get presented to user.
2276 * This can happen independent of advancing the URG pointer,
2277 * but if two URG's are pending at once, some out-of-band
2278 * data may creep in... ick.
2279 */
2280 if (th->th_urp <= (u_long)tlen
2281 #if SO_OOBINLINE
2282 && (so->so_options & SO_OOBINLINE) == 0
2283 #endif
2284 )
2285 tcp_pulloutofband(so, th, m,
2286 drop_hdrlen); /* hdr drop is delayed */
2287 } else
2288 /*
2289 * If no out of band data is expected,
2290 * pull receive urgent pointer along
2291 * with the receive window.
2292 */
2293 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2294 tp->rcv_up = tp->rcv_nxt;
2295 dodata: /* XXX */
2296
2297 /*
2298 * Process the segment text, merging it into the TCP sequencing queue,
2299 * and arranging for acknowledgment of receipt if necessary.
2300 * This process logically involves adjusting tp->rcv_wnd as data
2301 * is presented to the user (this happens in tcp_usrreq.c,
2302 * case PRU_RCVD). If a FIN has already been received on this
2303 * connection then we just ignore the text.
2304 */
2305 if ((tlen || (thflags&TH_FIN)) &&
2306 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2307 m_adj(m, drop_hdrlen); /* delayed header drop */
2308 /*
2309 * Insert segment which inludes th into reassembly queue of tcp with
2310 * control block tp. Return TH_FIN if reassembly now includes
2311 * a segment with FIN. This handle the common case inline (segment
2312 * is the next to be received on an established connection, and the
2313 * queue is empty), avoiding linkage into and removal from the queue
2314 * and repetition of various conversions.
2315 * Set DELACK for segments received in order, but ack immediately
2316 * when segments are out of order (so fast retransmit can work).
2317 */
2318 if (th->th_seq == tp->rcv_nxt &&
2319 LIST_EMPTY(&tp->t_segq) &&
2320 TCPS_HAVEESTABLISHED(tp->t_state)) {
2321 #ifdef __APPLE__
2322 if (tcp_delack_enabled) {
2323 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
2324 tp->t_flags |= TF_DELACK;
2325 }
2326 #else
2327 if (DELAY_ACK(tp))
2328 callout_reset(tp->tt_delack, tcp_delacktime,
2329 tcp_timer_delack, tp);
2330 #endif
2331 else
2332 tp->t_flags |= TF_ACKNOW;
2333 tp->rcv_nxt += tlen;
2334 thflags = th->th_flags & TH_FIN;
2335 tcpstat.tcps_rcvpack++;
2336 tcpstat.tcps_rcvbyte += tlen;
2337 ND6_HINT(tp);
2338 sbappend(&so->so_rcv, m);
2339 sorwakeup(so);
2340 } else {
2341 thflags = tcp_reass(tp, th, &tlen, m);
2342 tp->t_flags |= TF_ACKNOW;
2343 }
2344
2345 if (tp->t_flags & TF_DELACK)
2346 {
2347 #if INET6
2348 if (isipv6) {
2349 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2350 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2351 th->th_seq, th->th_ack, th->th_win);
2352 }
2353 else
2354 #endif
2355 {
2356 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2357 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2358 th->th_seq, th->th_ack, th->th_win);
2359 }
2360
2361 }
2362 /*
2363 * Note the amount of data that peer has sent into
2364 * our window, in order to estimate the sender's
2365 * buffer size.
2366 */
2367 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2368 } else {
2369 m_freem(m);
2370 thflags &= ~TH_FIN;
2371 }
2372
2373 /*
2374 * If FIN is received ACK the FIN and let the user know
2375 * that the connection is closing.
2376 */
2377 if (thflags & TH_FIN) {
2378 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2379 socantrcvmore(so);
2380 postevent(so, 0, EV_FIN);
2381 /*
2382 * If connection is half-synchronized
2383 * (ie NEEDSYN flag on) then delay ACK,
2384 * so it may be piggybacked when SYN is sent.
2385 * Otherwise, since we received a FIN then no
2386 * more input can be expected, send ACK now.
2387 */
2388 if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN)) {
2389 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
2390 tp->t_flags |= TF_DELACK;
2391 }
2392 else
2393 tp->t_flags |= TF_ACKNOW;
2394 tp->rcv_nxt++;
2395 }
2396 switch (tp->t_state) {
2397
2398 /*
2399 * In SYN_RECEIVED and ESTABLISHED STATES
2400 * enter the CLOSE_WAIT state.
2401 */
2402 case TCPS_SYN_RECEIVED:
2403 /*FALLTHROUGH*/
2404 case TCPS_ESTABLISHED:
2405 tp->t_state = TCPS_CLOSE_WAIT;
2406 break;
2407
2408 /*
2409 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2410 * enter the CLOSING state.
2411 */
2412 case TCPS_FIN_WAIT_1:
2413 tp->t_state = TCPS_CLOSING;
2414 break;
2415
2416 /*
2417 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2418 * starting the time-wait timer, turning off the other
2419 * standard timers.
2420 */
2421 case TCPS_FIN_WAIT_2:
2422 tp->t_state = TCPS_TIME_WAIT;
2423 tcp_canceltimers(tp);
2424 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2425 if (tp->cc_recv != 0 &&
2426 tp->t_starttime < tcp_msl) {
2427 tp->t_timer[TCPT_2MSL] =
2428 tp->t_rxtcur * TCPTV_TWTRUNC;
2429 /* For transaction client, force ACK now. */
2430 tp->t_flags |= TF_ACKNOW;
2431 }
2432 else
2433 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2434
2435 add_to_time_wait(tp);
2436 soisdisconnected(so);
2437 break;
2438
2439 /*
2440 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2441 */
2442 case TCPS_TIME_WAIT:
2443 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2444 add_to_time_wait(tp);
2445 break;
2446 }
2447 }
2448 #if TCPDEBUG
2449 if (so->so_options & SO_DEBUG)
2450 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2451 &tcp_savetcp, 0);
2452 #endif
2453
2454 /*
2455 * Return any desired output.
2456 */
2457 if (needoutput || (tp->t_flags & TF_ACKNOW))
2458 (void) tcp_output(tp);
2459 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2460 return;
2461
2462 dropafterack:
2463 /*
2464 * Generate an ACK dropping incoming segment if it occupies
2465 * sequence space, where the ACK reflects our state.
2466 *
2467 * We can now skip the test for the RST flag since all
2468 * paths to this code happen after packets containing
2469 * RST have been dropped.
2470 *
2471 * In the SYN-RECEIVED state, don't send an ACK unless the
2472 * segment we received passes the SYN-RECEIVED ACK test.
2473 * If it fails send a RST. This breaks the loop in the
2474 * "LAND" DoS attack, and also prevents an ACK storm
2475 * between two listening ports that have been sent forged
2476 * SYN segments, each with the source address of the other.
2477 */
2478 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2479 (SEQ_GT(tp->snd_una, th->th_ack) ||
2480 SEQ_GT(th->th_ack, tp->snd_max)) ) {
2481 rstreason = BANDLIM_RST_OPENPORT;
2482 goto dropwithreset;
2483 }
2484 #if TCPDEBUG
2485 if (so->so_options & SO_DEBUG)
2486 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2487 &tcp_savetcp, 0);
2488 #endif
2489 m_freem(m);
2490 tp->t_flags |= TF_ACKNOW;
2491 (void) tcp_output(tp);
2492 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2493 return;
2494
2495 dropwithreset:
2496 /*
2497 * Generate a RST, dropping incoming segment.
2498 * Make ACK acceptable to originator of segment.
2499 * Don't bother to respond if destination was broadcast/multicast.
2500 */
2501 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2502 goto drop;
2503 #if INET6
2504 if (isipv6) {
2505 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2506 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2507 goto drop;
2508 } else
2509 #endif /* INET6 */
2510 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2511 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2512 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2513 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2514 goto drop;
2515 /* IPv6 anycast check is done at tcp6_input() */
2516
2517 /*
2518 * Perform bandwidth limiting.
2519 */
2520 #if ICMP_BANDLIM
2521 if (badport_bandlim(rstreason) < 0)
2522 goto drop;
2523 #endif
2524
2525 #if TCPDEBUG
2526 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2527 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2528 &tcp_savetcp, 0);
2529 #endif
2530 if (thflags & TH_ACK)
2531 /* mtod() below is safe as long as hdr dropping is delayed */
2532 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
2533 TH_RST);
2534 else {
2535 if (thflags & TH_SYN)
2536 tlen++;
2537 /* mtod() below is safe as long as hdr dropping is delayed */
2538 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2539 (tcp_seq)0, TH_RST|TH_ACK);
2540 }
2541 /* destroy temporarily created socket */
2542 if (dropsocket)
2543 (void) soabort(so);
2544 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2545 return;
2546
2547 drop:
2548 /*
2549 * Drop space held by incoming segment and return.
2550 */
2551 #if TCPDEBUG
2552 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2553 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2554 &tcp_savetcp, 0);
2555 #endif
2556 m_freem(m);
2557 /* destroy temporarily created socket */
2558 if (dropsocket)
2559 (void) soabort(so);
2560 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2561 return;
2562 }
2563
/*
 * Parse the TCP options found in the header area pointed to by cp
 * (cnt bytes of option data).  Options of interest are recorded in
 * *to; options that modify connection state (window scale, timestamp,
 * CC) update tp directly.  Options that are only legal on a SYN (MSS,
 * window scale, CC.NEW, CC.ECHO) are ignored unless TH_SYN is set in
 * th->th_flags.  For SYN segments tcp_mss() is always called at the
 * end so that t_maxseg is initialized even when no MSS option was
 * present (mss == 0 in that case).
 */
static void
tcp_dooptions(tp, cp, cnt, th, to)
	struct tcpcb *tp;
	u_char *cp;		/* first byte of option data */
	int cnt;		/* number of option bytes */
	struct tcphdr *th;
	struct tcpopt *to;	/* parsed per-segment options returned here */
{
	u_short mss = 0;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			/*
			 * Every option other than EOL/NOP carries a length
			 * byte.  Bail out on truncated or malformed lengths
			 * rather than reading past the end of the options.
			 */
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			/* option data may be unaligned; copy before swap */
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			/* clamp peer's shift count to the protocol maximum */
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			to->to_flag |= TOF_TS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_tsval, sizeof(to->to_tsval));
			NTOHL(to->to_tsval);
			bcopy((char *)cp + 6,
			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
			NTOHL(to->to_tsecr);

			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			if (th->th_flags & TH_SYN) {
				tp->t_flags |= TF_RCVD_TSTMP;
				tp->ts_recent = to->to_tsval;
				tp->ts_recent_age = tcp_now;
			}
			break;
		case TCPOPT_CC:
			if (optlen != TCPOLEN_CC)
				continue;
			to->to_flag |= TOF_CC;
			bcopy((char *)cp + 2,
			    (char *)&to->to_cc, sizeof(to->to_cc));
			NTOHL(to->to_cc);
			/*
			 * A CC or CC.new option received in a SYN makes
			 * it ok to send CC in subsequent segments.
			 */
			if (th->th_flags & TH_SYN)
				tp->t_flags |= TF_RCVD_CC;
			break;
		case TCPOPT_CCNEW:
			if (optlen != TCPOLEN_CC)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			to->to_flag |= TOF_CCNEW;
			bcopy((char *)cp + 2,
			    (char *)&to->to_cc, sizeof(to->to_cc));
			NTOHL(to->to_cc);
			/*
			 * A CC or CC.new option received in a SYN makes
			 * it ok to send CC in subsequent segments.
			 */
			tp->t_flags |= TF_RCVD_CC;
			break;
		case TCPOPT_CCECHO:
			if (optlen != TCPOLEN_CC)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			to->to_flag |= TOF_CCECHO;
			bcopy((char *)cp + 2,
			    (char *)&to->to_ccecho, sizeof(to->to_ccecho));
			NTOHL(to->to_ccecho);
			break;
		}
	}
	/* For SYNs, initialize t_maxseg (mss == 0 if no MSS option seen). */
	if (th->th_flags & TH_SYN)
		tcp_mss(tp, mss);	/* sets t_maxseg */
}
2676
2677 /*
2678 * Pull out of band byte out of a segment so
2679 * it doesn't appear in the user's data queue.
2680 * It is still reflected in the segment length for
2681 * sequencing purposes.
2682 */
2683 static void
2684 tcp_pulloutofband(so, th, m, off)
2685 struct socket *so;
2686 struct tcphdr *th;
2687 register struct mbuf *m;
2688 int off; /* delayed to be droped hdrlen */
2689 {
2690 int cnt = off + th->th_urp - 1;
2691
2692 while (cnt >= 0) {
2693 if (m->m_len > cnt) {
2694 char *cp = mtod(m, caddr_t) + cnt;
2695 struct tcpcb *tp = sototcpcb(so);
2696
2697 tp->t_iobc = *cp;
2698 tp->t_oobflags |= TCPOOB_HAVEDATA;
2699 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2700 m->m_len--;
2701 if (m->m_flags & M_PKTHDR)
2702 m->m_pkthdr.len--;
2703 return;
2704 }
2705 cnt -= m->m_len;
2706 m = m->m_next;
2707 if (m == 0)
2708 break;
2709 }
2710 panic("tcp_pulloutofband");
2711 }
2712
2713 /*
2714 * Collect new round-trip time estimate
2715 * and update averages and current timeout.
2716 */
2717 static void
2718 tcp_xmit_timer(tp, rtt)
2719 register struct tcpcb *tp;
2720 int rtt;
2721 {
2722 register int delta;
2723
2724 tcpstat.tcps_rttupdated++;
2725 tp->t_rttupdated++;
2726 if (tp->t_srtt != 0) {
2727 /*
2728 * srtt is stored as fixed point with 5 bits after the
2729 * binary point (i.e., scaled by 8). The following magic
2730 * is equivalent to the smoothing algorithm in rfc793 with
2731 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2732 * point). Adjust rtt to origin 0.
2733 */
2734 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2735 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2736
2737 if ((tp->t_srtt += delta) <= 0)
2738 tp->t_srtt = 1;
2739
2740 /*
2741 * We accumulate a smoothed rtt variance (actually, a
2742 * smoothed mean difference), then set the retransmit
2743 * timer to smoothed rtt + 4 times the smoothed variance.
2744 * rttvar is stored as fixed point with 4 bits after the
2745 * binary point (scaled by 16). The following is
2746 * equivalent to rfc793 smoothing with an alpha of .75
2747 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2748 * rfc793's wired-in beta.
2749 */
2750 if (delta < 0)
2751 delta = -delta;
2752 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2753 if ((tp->t_rttvar += delta) <= 0)
2754 tp->t_rttvar = 1;
2755 } else {
2756 /*
2757 * No rtt measurement yet - use the unsmoothed rtt.
2758 * Set the variance to half the rtt (so our first
2759 * retransmit happens at 3*rtt).
2760 */
2761 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2762 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2763 }
2764 tp->t_rtttime = 0;
2765 tp->t_rxtshift = 0;
2766
2767 /*
2768 * the retransmit should happen at rtt + 4 * rttvar.
2769 * Because of the way we do the smoothing, srtt and rttvar
2770 * will each average +1/2 tick of bias. When we compute
2771 * the retransmit timer, we want 1/2 tick of rounding and
2772 * 1 extra tick because of +-1/2 tick uncertainty in the
2773 * firing of the timer. The bias will give us exactly the
2774 * 1.5 tick we need. But, because the bias is
2775 * statistical, we have to test that we don't drop below
2776 * the minimum feasible timer (which is 2 ticks).
2777 */
2778 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2779 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2780
2781 /*
2782 * We received an ack for a packet that wasn't retransmitted;
2783 * it is probably safe to discard any error indications we've
2784 * received recently. This isn't quite right, but close enough
2785 * for now (a route might have failed after we sent a segment,
2786 * and the return path might not be symmetrical).
2787 */
2788 tp->t_softerror = 0;
2789 }
2790
2791 /*
2792 * Determine a reasonable value for maxseg size.
2793 * If the route is known, check route for mtu.
2794 * If none, use an mss that can be handled on the outgoing
2795 * interface without forcing IP to fragment; if bigger than
2796 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2797 * to utilize large mbufs. If no route is found, route has no mtu,
2798 * or the destination isn't local, use a default, hopefully conservative
2799 * size (usually 512 or the default IP max size, but no more than the mtu
2800 * of the interface), as we can't discover anything about intervening
2801 * gateways or networks. We also initialize the congestion/slow start
2802 * window to be a single segment if the destination isn't local.
2803 * While looking at the routing entry, we also initialize other path-dependent
2804 * parameters from pre-set or cached values in the routing entry.
2805 *
2806 * Also take into account the space needed for options that we
2807 * send regularly. Make maxseg shorter by that amount to assure
2808 * that we can send maxseg amount of data even when the options
2809 * are present. Store the upper limit of the length of options plus
2810 * data in maxopd.
2811 *
2812 * NOTE that this routine is only called when we process an incoming
2813 * segment, for outgoing segments only tcp_mssopt is called.
2814 *
2815 * In case of T/TCP, we call this routine during implicit connection
2816 * setup as well (offer = -1), to initialize maxseg from the cached
2817 * MSS of our peer.
2818 */
2819 void
2820 tcp_mss(tp, offer)
2821 struct tcpcb *tp;
2822 int offer;
2823 {
2824 register struct rtentry *rt;
2825 struct ifnet *ifp;
2826 register int rtt, mss;
2827 u_long bufsize;
2828 struct inpcb *inp;
2829 struct socket *so;
2830 struct rmxp_tao *taop;
2831 int origoffer = offer;
2832 #if INET6
2833 int isipv6;
2834 int min_protoh;
2835 #endif
2836
2837 inp = tp->t_inpcb;
2838 #if INET6
2839 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2840 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
2841 : sizeof (struct tcpiphdr);
2842 #else
2843 #define min_protoh (sizeof (struct tcpiphdr))
2844 #endif
2845 #if INET6
2846 if (isipv6)
2847 rt = tcp_rtlookup6(inp);
2848 else
2849 #endif /* INET6 */
2850 rt = tcp_rtlookup(inp);
2851 if (rt == NULL) {
2852 tp->t_maxopd = tp->t_maxseg =
2853 #if INET6
2854 isipv6 ? tcp_v6mssdflt :
2855 #endif /* INET6 */
2856 tcp_mssdflt;
2857 return;
2858 }
2859 ifp = rt->rt_ifp;
2860 /*
2861 * Slower link window correction:
2862 * If a value is specificied for slowlink_wsize use it for PPP links
2863 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
2864 * it is the default value adversized by pseudo-devices over ppp.
2865 */
2866 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
2867 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
2868 tp->t_flags |= TF_SLOWLINK;
2869 }
2870 so = inp->inp_socket;
2871
2872 taop = rmx_taop(rt->rt_rmx);
2873 /*
2874 * Offer == -1 means that we didn't receive SYN yet,
2875 * use cached value in that case;
2876 */
2877 if (offer == -1)
2878 offer = taop->tao_mssopt;
2879 /*
2880 * Offer == 0 means that there was no MSS on the SYN segment,
2881 * in this case we use tcp_mssdflt.
2882 */
2883 if (offer == 0)
2884 offer =
2885 #if INET6
2886 isipv6 ? tcp_v6mssdflt :
2887 #endif /* INET6 */
2888 tcp_mssdflt;
2889 else
2890 /*
2891 * Sanity check: make sure that maxopd will be large
2892 * enough to allow some data on segments even is the
2893 * all the option space is used (40bytes). Otherwise
2894 * funny things may happen in tcp_output.
2895 */
2896 offer = max(offer, 64);
2897 taop->tao_mssopt = offer;
2898
2899 /*
2900 * While we're here, check if there's an initial rtt
2901 * or rttvar. Convert from the route-table units
2902 * to scaled multiples of the slow timeout timer.
2903 */
2904 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2905 /*
2906 * XXX the lock bit for RTT indicates that the value
2907 * is also a minimum value; this is subject to time.
2908 */
2909 if (rt->rt_rmx.rmx_locks & RTV_RTT)
2910 tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ);
2911 tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
2912 tcpstat.tcps_usedrtt++;
2913 if (rt->rt_rmx.rmx_rttvar) {
2914 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2915 (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
2916 tcpstat.tcps_usedrttvar++;
2917 } else {
2918 /* default variation is +- 1 rtt */
2919 tp->t_rttvar =
2920 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2921 }
2922 TCPT_RANGESET(tp->t_rxtcur,
2923 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2924 tp->t_rttmin, TCPTV_REXMTMAX);
2925 }
2926 /*
2927 * if there's an mtu associated with the route, use it
2928 * else, use the link mtu.
2929 */
2930 if (rt->rt_rmx.rmx_mtu)
2931 mss = rt->rt_rmx.rmx_mtu - min_protoh;
2932 else
2933 {
2934 mss =
2935 #if INET6
2936 (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
2937 #endif
2938 ifp->if_mtu
2939 #if INET6
2940 )
2941 #endif
2942 - min_protoh;
2943 #if INET6
2944 if (isipv6) {
2945 if (!in6_localaddr(&inp->in6p_faddr))
2946 mss = min(mss, tcp_v6mssdflt);
2947 } else
2948 #endif /* INET6 */
2949 if (!in_localaddr(inp->inp_faddr))
2950 mss = min(mss, tcp_mssdflt);
2951 }
2952 mss = min(mss, offer);
2953 /*
2954 * maxopd stores the maximum length of data AND options
2955 * in a segment; maxseg is the amount of data in a normal
2956 * segment. We need to store this value (maxopd) apart
2957 * from maxseg, because now every segment carries options
2958 * and thus we normally have somewhat less data in segments.
2959 */
2960 tp->t_maxopd = mss;
2961
2962 /*
2963 * In case of T/TCP, origoffer==-1 indicates, that no segments
2964 * were received yet. In this case we just guess, otherwise
2965 * we do the same as before T/TCP.
2966 */
2967 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2968 (origoffer == -1 ||
2969 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2970 mss -= TCPOLEN_TSTAMP_APPA;
2971 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2972 (origoffer == -1 ||
2973 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2974 mss -= TCPOLEN_CC_APPA;
2975
2976 #if (MCLBYTES & (MCLBYTES - 1)) == 0
2977 if (mss > MCLBYTES)
2978 mss &= ~(MCLBYTES-1);
2979 #else
2980 if (mss > MCLBYTES)
2981 mss = mss / MCLBYTES * MCLBYTES;
2982 #endif
2983 /*
2984 * If there's a pipesize, change the socket buffer
2985 * to that size. Make the socket buffers an integral
2986 * number of mss units; if the mss is larger than
2987 * the socket buffer, decrease the mss.
2988 */
2989 #if RTV_SPIPE
2990 if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2991 #endif
2992 bufsize = so->so_snd.sb_hiwat;
2993 if (bufsize < mss)
2994 mss = bufsize;
2995 else {
2996 bufsize = roundup(bufsize, mss);
2997 if (bufsize > sb_max)
2998 bufsize = sb_max;
2999 (void)sbreserve(&so->so_snd, bufsize);
3000 }
3001 tp->t_maxseg = mss;
3002
3003 #if RTV_RPIPE
3004 if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
3005 #endif
3006 bufsize = so->so_rcv.sb_hiwat;
3007 if (bufsize > mss) {
3008 bufsize = roundup(bufsize, mss);
3009 if (bufsize > sb_max)
3010 bufsize = sb_max;
3011 (void)sbreserve(&so->so_rcv, bufsize);
3012 }
3013
3014 /*
3015 * Set the slow-start flight size depending on whether this
3016 * is a local network or not.
3017 */
3018 if (
3019 #if INET6
3020 (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
3021 (!isipv6 &&
3022 #endif
3023 in_localaddr(inp->inp_faddr)
3024 #if INET6
3025 )
3026 #endif
3027 )
3028 tp->snd_cwnd = mss * ss_fltsz_local;
3029 else
3030 tp->snd_cwnd = mss * ss_fltsz;
3031
3032 if (rt->rt_rmx.rmx_ssthresh) {
3033 /*
3034 * There's some sort of gateway or interface
3035 * buffer limit on the path. Use this to set
3036 * the slow start threshhold, but set the
3037 * threshold to no less than 2*mss.
3038 */
3039 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
3040 tcpstat.tcps_usedssthresh++;
3041 }
3042 }
3043
3044 /*
3045 * Determine the MSS option to send on an outgoing SYN.
3046 */
3047 int
3048 tcp_mssopt(tp)
3049 struct tcpcb *tp;
3050 {
3051 struct rtentry *rt;
3052 #if INET6
3053 int isipv6;
3054 int min_protoh;
3055 #endif
3056
3057 #if INET6
3058 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3059 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
3060 : sizeof (struct tcpiphdr);
3061 #else
3062 #define min_protoh (sizeof (struct tcpiphdr))
3063 #endif
3064 #if INET6
3065 if (isipv6)
3066 rt = tcp_rtlookup6(tp->t_inpcb);
3067 else
3068 #endif /* INET6 */
3069 rt = tcp_rtlookup(tp->t_inpcb);
3070 if (rt == NULL)
3071 return
3072 #if INET6
3073 isipv6 ? tcp_v6mssdflt :
3074 #endif /* INET6 */
3075 tcp_mssdflt;
3076 /*
3077 * Slower link window correction:
3078 * If a value is specificied for slowlink_wsize use it for PPP links
3079 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
3080 * it is the default value adversized by pseudo-devices over ppp.
3081 */
3082 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
3083 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
3084 tp->t_flags |= TF_SLOWLINK;
3085 }
3086
3087 return rt->rt_ifp->if_mtu - min_protoh;
3088 }
3089
3090
3091 /*
3092 * Checks for partial ack. If partial ack arrives, force the retransmission
3093 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3094 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to
3095 * be started again. If the ack advances at least to tp->snd_recover, return 0.
3096 */
3097 static int
3098 tcp_newreno(tp, th)
3099 struct tcpcb *tp;
3100 struct tcphdr *th;
3101 {
3102 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
3103 tcp_seq onxt = tp->snd_nxt;
3104 u_long ocwnd = tp->snd_cwnd;
3105 #ifdef __APPLE__
3106 tp->t_timer[TCPT_REXMT] = 0;
3107 #else
3108 callout_stop(tp->tt_rexmt);
3109 #endif
3110 tp->t_rtttime = 0;
3111 tp->snd_nxt = th->th_ack;
3112 /*
3113 * Set snd_cwnd to one segment beyond acknowledged offset
3114 * (tp->snd_una has not yet been updated when this function
3115 * is called)
3116 */
3117 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3118 (void) tcp_output(tp);
3119 tp->snd_cwnd = ocwnd;
3120 if (SEQ_GT(onxt, tp->snd_nxt))
3121 tp->snd_nxt = onxt;
3122 /*
3123 * Partial window deflation. Relies on fact that tp->snd_una
3124 * not updated yet.
3125 */
3126 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
3127 return (1);
3128 }
3129 return (0);
3130 }