]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/tcp_input.c
xnu-344.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_input.c
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
24 * The Regents of the University of California. All rights reserved.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 * must display the following acknowledgement:
36 * This product includes software developed by the University of
37 * California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 * may be used to endorse or promote products derived from this software
40 * without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
55 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
56 */
57
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/kernel.h>
62 #include <sys/sysctl.h>
63 #include <sys/malloc.h>
64 #include <sys/mbuf.h>
65 #include <sys/proc.h> /* for proc0 declaration */
66 #include <sys/protosw.h>
67 #include <sys/socket.h>
68 #include <sys/socketvar.h>
69 #include <sys/syslog.h>
70
71 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
72
73 #include <net/if.h>
74 #include <net/route.h>
75
76 #include <netinet/in.h>
77 #include <netinet/in_systm.h>
78 #include <netinet/ip.h>
79 #include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
80 #include <netinet/in_var.h>
81 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
82 #include <netinet/in_pcb.h>
83 #include <netinet/ip_var.h>
84 #if INET6
85 #include <netinet/ip6.h>
86 #include <netinet/icmp6.h>
87 #include <netinet6/nd6.h>
88 #include <netinet6/ip6_var.h>
89 #include <netinet6/in6_pcb.h>
90 #endif
91 #include <netinet/tcp.h>
92 #include <netinet/tcp_fsm.h>
93 #include <netinet/tcp_seq.h>
94 #include <netinet/tcp_timer.h>
95 #include <netinet/tcp_var.h>
96 #if INET6
97 #include <netinet6/tcp6_var.h>
98 #endif
99 #include <netinet/tcpip.h>
100 #if TCPDEBUG
101 #include <netinet/tcp_debug.h>
102 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
103 struct tcphdr tcp_savetcp;
104 #endif /* TCPDEBUG */
105
106 #if IPSEC
107 #include <netinet6/ipsec.h>
108 #if INET6
109 #include <netinet6/ipsec6.h>
110 #endif
111 #include <netkey/key.h>
112 #endif /*IPSEC*/
113
114 #include <sys/kdebug.h>
115
116 #ifndef __APPLE__
117 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
118 #endif
119
120 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
121 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
122 #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
123 #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
124
125 static int tcprexmtthresh = 3;
126 tcp_cc tcp_ccgen;
127 extern int apple_hwcksum_rx;
128
129 #if IPSEC
130 extern int ipsec_bypass;
131 #endif
132
133 struct tcpstat tcpstat;
134 SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD,
135 &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
136
137 static int log_in_vain = 0;
138 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
139 &log_in_vain, 0, "Log all incoming TCP connections");
140
141 static int blackhole = 0;
142 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
143 &blackhole, 0, "Do not send RST when dropping refused connections");
144
145 int tcp_delack_enabled = 1;
146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
147 &tcp_delack_enabled, 0,
148 "Delay ACK to try and piggyback it onto a data packet");
149
150 int tcp_lq_overflow = 1;
151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW,
152 &tcp_lq_overflow, 0,
153 "Listen Queue Overflow");
154
155 #if TCP_DROP_SYNFIN
156 static int drop_synfin = 0;
157 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
158 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
159 #endif
160
161 u_long tcp_now;
162 struct inpcbhead tcb;
163 #define tcb6 tcb /* for KAME src sync over BSD*'s */
164 struct inpcbinfo tcbinfo;
165
166 static void tcp_dooptions __P((struct tcpcb *,
167 u_char *, int, struct tcphdr *, struct tcpopt *));
168 static void tcp_pulloutofband __P((struct socket *,
169 struct tcphdr *, struct mbuf *, int));
170 static int tcp_reass __P((struct tcpcb *, struct tcphdr *, int *,
171 struct mbuf *));
172 static void tcp_xmit_timer __P((struct tcpcb *, int));
173 static int tcp_newreno __P((struct tcpcb *, struct tcphdr *));
174
/*
 * Neighbor Discovery, Neighbor Unreachability Detection upper-layer hint.
 *
 * With INET6, ND6_HINT(tp) calls nd6_nud_hint() on the connection's cached
 * IPv6 route, provided tp, its inpcb and the route are all non-NULL and the
 * PCB is flagged INP_IPV6.  Without INET6 it does nothing.
 *
 * Both variants expand to exactly one statement, so the macro can be used
 * as the sole body of an if/else in either configuration.  The argument is
 * evaluated several times in the INET6 variant and not at all otherwise,
 * so it must not have side effects.
 */
#if INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_inpcb && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
	    (tp)->t_inpcb->in6p_route.ro_rt) \
		nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp) \
do { \
} while (0)
#endif
187
188 extern u_long *delack_bitmask;
189
/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- delayed acks are enabled (tcp_delack_enabled) and
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window (TF_RXWIN0SENT clear).
 *	  We never want to delay the ack that opens up a 0-sized window.
 *
 * tp may be any expression yielding a struct tcpcb pointer; it is
 * parenthesized on every use so that arguments such as "p + 1" expand
 * correctly.  It is evaluated up to twice, so it must not have side effects.
 */
#define DELAY_ACK(tp) \
	(tcp_delack_enabled && !callout_pending((tp)->tt_delack) && \
	((tp)->t_flags & TF_RXWIN0SENT) == 0)
200
201
202 static int
203 tcp_reass(tp, th, tlenp, m)
204 register struct tcpcb *tp;
205 register struct tcphdr *th;
206 int *tlenp;
207 struct mbuf *m;
208 {
209 struct tseg_qent *q;
210 struct tseg_qent *p = NULL;
211 struct tseg_qent *nq;
212 struct tseg_qent *te;
213 struct socket *so = tp->t_inpcb->inp_socket;
214 int flags;
215
216 /*
217 * Call with th==0 after become established to
218 * force pre-ESTABLISHED data up to user socket.
219 */
220 if (th == 0)
221 goto present;
222
223 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
224 MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
225 M_NOWAIT);
226 if (te == NULL) {
227 tcpstat.tcps_rcvmemdrop++;
228 m_freem(m);
229 return (0);
230 }
231
232 /*
233 * Find a segment which begins after this one does.
234 */
235 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
236 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
237 break;
238 p = q;
239 }
240
241 /*
242 * If there is a preceding segment, it may provide some of
243 * our data already. If so, drop the data from the incoming
244 * segment. If it provides all of our data, drop us.
245 */
246 if (p != NULL) {
247 register int i;
248 /* conversion to int (in i) handles seq wraparound */
249 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
250 if (i > 0) {
251 if (i >= *tlenp) {
252 tcpstat.tcps_rcvduppack++;
253 tcpstat.tcps_rcvdupbyte += *tlenp;
254 m_freem(m);
255 FREE(te, M_TSEGQ);
256 /*
257 * Try to present any queued data
258 * at the left window edge to the user.
259 * This is needed after the 3-WHS
260 * completes.
261 */
262 goto present; /* ??? */
263 }
264 m_adj(m, i);
265 *tlenp -= i;
266 th->th_seq += i;
267 }
268 }
269 tcpstat.tcps_rcvoopack++;
270 tcpstat.tcps_rcvoobyte += *tlenp;
271
272 /*
273 * While we overlap succeeding segments trim them or,
274 * if they are completely covered, dequeue them.
275 */
276 while (q) {
277 register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
278 if (i <= 0)
279 break;
280 if (i < q->tqe_len) {
281 q->tqe_th->th_seq += i;
282 q->tqe_len -= i;
283 m_adj(q->tqe_m, i);
284 break;
285 }
286
287 nq = LIST_NEXT(q, tqe_q);
288 LIST_REMOVE(q, tqe_q);
289 m_freem(q->tqe_m);
290 FREE(q, M_TSEGQ);
291 q = nq;
292 }
293
294 /* Insert the new segment queue entry into place. */
295 te->tqe_m = m;
296 te->tqe_th = th;
297 te->tqe_len = *tlenp;
298
299 if (p == NULL) {
300 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
301 } else {
302 LIST_INSERT_AFTER(p, te, tqe_q);
303 }
304
305 present:
306 /*
307 * Present data to user, advancing rcv_nxt through
308 * completed sequence space.
309 */
310 if (!TCPS_HAVEESTABLISHED(tp->t_state))
311 return (0);
312 q = LIST_FIRST(&tp->t_segq);
313 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
314 return (0);
315 do {
316 tp->rcv_nxt += q->tqe_len;
317 flags = q->tqe_th->th_flags & TH_FIN;
318 nq = LIST_NEXT(q, tqe_q);
319 LIST_REMOVE(q, tqe_q);
320 if (so->so_state & SS_CANTRCVMORE)
321 m_freem(q->tqe_m);
322 else
323 sbappend(&so->so_rcv, q->tqe_m);
324 FREE(q, M_TSEGQ);
325 q = nq;
326 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
327 ND6_HINT(tp);
328
329 #if INET6
330 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
331
332 KERNEL_DEBUG(DBG_LAYER_BEG,
333 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
334 (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
335 (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
336 0,0,0);
337 }
338 else
339 #endif
340 {
341 KERNEL_DEBUG(DBG_LAYER_BEG,
342 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
343 (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
344 (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
345 0,0,0);
346 }
347 sorwakeup(so);
348 return (flags);
349
350 }
351
352
353 /*
354 * TCP input routine, follows pages 65-76 of the
355 * protocol specification dated September, 1981 very closely.
356 */
#if INET6
/*
 * IPv6 entry point for TCP.
 *
 *	mp	address of the mbuf chain pointer holding the packet
 *	offp	address of the offset of the TCP header within the packet
 *	proto	protocol number (not used here)
 *
 * After IP6_EXTHDR_CHECK() has validated that a full TCP header is
 * reachable, the packet is either answered with an ICMPv6 "address
 * unreachable" error (destination is one of our anycast addresses, per
 * draft-itojun-ipv6-tcp-to-anycast) or handed to tcp_input().
 * Always returns IPPROTO_DONE.
 */
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	register struct mbuf *m = *mp;
	struct in6_ifaddr *dstifa;
	struct ip6_hdr *hdr;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	/* XXX better place to put the anycast check? */
	dstifa = ip6_getdstifaddr(m);
	if (dstifa == NULL || (dstifa->ia6_flags & IN6_IFF_ANYCAST) == 0) {
		/* Ordinary destination: regular TCP input processing. */
		tcp_input(m, *offp);
		return IPPROTO_DONE;
	}

	/* Anycast destination: report the destination-address field as the culprit. */
	hdr = mtod(m, struct ip6_hdr *);
	icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (caddr_t)&hdr->ip6_dst - (caddr_t)hdr);
	return IPPROTO_DONE;
}
#endif
386
387 void
388 tcp_input(m, off0)
389 struct mbuf *m;
390 int off0;
391 {
392 register struct tcphdr *th;
393 register struct ip *ip = NULL;
394 register struct ipovly *ipov;
395 register struct inpcb *inp;
396 u_char *optp = NULL;
397 int optlen = 0;
398 int len, tlen, off;
399 int drop_hdrlen;
400 register struct tcpcb *tp = 0;
401 register int thflags;
402 struct socket *so = 0;
403 int todrop, acked, ourfinisacked, needoutput = 0;
404 struct in_addr laddr;
405 #if INET6
406 struct in6_addr laddr6;
407 #endif
408 int dropsocket = 0;
409 int iss = 0;
410 u_long tiwin;
411 struct tcpopt to; /* options in this segment */
412 struct rmxp_tao *taop; /* pointer to our TAO cache entry */
413 struct rmxp_tao tao_noncached; /* in case there's no cached entry */
414 #if TCPDEBUG
415 short ostate = 0;
416 #endif
417 #if INET6
418 struct ip6_hdr *ip6 = NULL;
419 int isipv6;
420 #endif /* INET6 */
421 int rstreason; /* For badport_bandlim accounting purposes */
422 struct proc *proc0=current_proc();
423
424 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
425
426 #if INET6
427 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
428 #endif
429 bzero((char *)&to, sizeof(to));
430
431 tcpstat.tcps_rcvtotal++;
432
433
434
435 #if INET6
436 if (isipv6) {
437 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
438 ip6 = mtod(m, struct ip6_hdr *);
439 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
440 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
441 tcpstat.tcps_rcvbadsum++;
442 goto drop;
443 }
444 th = (struct tcphdr *)((caddr_t)ip6 + off0);
445
446 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
447 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
448 th->th_seq, th->th_ack, th->th_win);
449 /*
450 * Be proactive about unspecified IPv6 address in source.
451 * As we use all-zero to indicate unbounded/unconnected pcb,
452 * unspecified IPv6 address can be used to confuse us.
453 *
454 * Note that packets with unspecified IPv6 destination is
455 * already dropped in ip6_input.
456 */
457 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
458 /* XXX stat */
459 goto drop;
460 }
461 } else
462 #endif /* INET6 */
463 {
464 /*
465 * Get IP and TCP header together in first mbuf.
466 * Note: IP leaves IP header in first mbuf.
467 */
468 if (off0 > sizeof (struct ip)) {
469 ip_stripoptions(m, (struct mbuf *)0);
470 off0 = sizeof(struct ip);
471 if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)
472 m->m_pkthdr.csum_flags = 0; /* invalidate hwcksuming */
473
474 }
475 if (m->m_len < sizeof (struct tcpiphdr)) {
476 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
477 tcpstat.tcps_rcvshort++;
478 return;
479 }
480 }
481 ip = mtod(m, struct ip *);
482 ipov = (struct ipovly *)ip;
483 th = (struct tcphdr *)((caddr_t)ip + off0);
484 tlen = ip->ip_len;
485
486 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
487 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
488 th->th_seq, th->th_ack, th->th_win);
489
490 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
491 if (apple_hwcksum_rx && (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)) {
492 u_short pseudo;
493 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
494 ipov->ih_len = (u_short)tlen;
495 HTONS(ipov->ih_len);
496 pseudo = in_cksum(m, sizeof (struct ip));
497 th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
498 } else {
499 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
500 th->th_sum = m->m_pkthdr.csum_data;
501 else
502 th->th_sum = in_pseudo(ip->ip_src.s_addr,
503 ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
504 ip->ip_len + IPPROTO_TCP));
505 }
506 th->th_sum ^= 0xffff;
507 } else {
508 /*
509 * Checksum extended TCP header and data.
510 */
511 len = sizeof (struct ip) + tlen;
512 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
513 ipov->ih_len = (u_short)tlen;
514 HTONS(ipov->ih_len);
515 th->th_sum = in_cksum(m, len);
516 }
517 if (th->th_sum) {
518 tcpstat.tcps_rcvbadsum++;
519 goto drop;
520 }
521 #if INET6
522 /* Re-initialization for later version check */
523 ip->ip_v = IPVERSION;
524 #endif
525 }
526
527 /*
528 * Check that TCP offset makes sense,
529 * pull out TCP options and adjust length. XXX
530 */
531 off = th->th_off << 2;
532 if (off < sizeof (struct tcphdr) || off > tlen) {
533 tcpstat.tcps_rcvbadoff++;
534 goto drop;
535 }
536 tlen -= off; /* tlen is used instead of ti->ti_len */
537 if (off > sizeof (struct tcphdr)) {
538 #if INET6
539 if (isipv6) {
540 IP6_EXTHDR_CHECK(m, off0, off, );
541 ip6 = mtod(m, struct ip6_hdr *);
542 th = (struct tcphdr *)((caddr_t)ip6 + off0);
543 } else
544 #endif /* INET6 */
545 {
546 if (m->m_len < sizeof(struct ip) + off) {
547 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
548 tcpstat.tcps_rcvshort++;
549 return;
550 }
551 ip = mtod(m, struct ip *);
552 ipov = (struct ipovly *)ip;
553 th = (struct tcphdr *)((caddr_t)ip + off0);
554 }
555 }
556 optlen = off - sizeof (struct tcphdr);
557 optp = (u_char *)(th + 1);
558 /*
559 * Do quick retrieval of timestamp options ("options
560 * prediction?"). If timestamp is the only option and it's
561 * formatted as recommended in RFC 1323 appendix A, we
562 * quickly get the values now and not bother calling
563 * tcp_dooptions(), etc.
564 */
565 if ((optlen == TCPOLEN_TSTAMP_APPA ||
566 (optlen > TCPOLEN_TSTAMP_APPA &&
567 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
568 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
569 (th->th_flags & TH_SYN) == 0) {
570 to.to_flag |= TOF_TS;
571 to.to_tsval = ntohl(*(u_int32_t *)(optp + 4));
572 to.to_tsecr = ntohl(*(u_int32_t *)(optp + 8));
573 optp = NULL; /* we've parsed the options */
574 }
575 }
576 thflags = th->th_flags;
577
578 #if TCP_DROP_SYNFIN
579 /*
580 * If the drop_synfin option is enabled, drop all packets with
581 * both the SYN and FIN bits set. This prevents e.g. nmap from
582 * identifying the TCP/IP stack.
583 *
584 * This is incompatible with RFC1644 extensions (T/TCP).
585 */
586 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
587 goto drop;
588 #endif
589
590 /*
591 * Convert TCP protocol specific fields to host format.
592 */
593 NTOHL(th->th_seq);
594 NTOHL(th->th_ack);
595 NTOHS(th->th_win);
596 NTOHS(th->th_urp);
597
598 /*
599 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
600 * until after ip6_savecontrol() is called and before other functions
601 * which don't want those proto headers.
602 * Because ip6_savecontrol() is going to parse the mbuf to
603 * search for data to be passed up to user-land, it wants mbuf
604 * parameters to be unchanged.
605 */
606 drop_hdrlen = off0 + off;
607
608 /*
609 * Locate pcb for segment.
610 */
611 findpcb:
612 #if IPFIREWALL_FORWARD
613 if (ip_fw_fwd_addr != NULL
614 #if INET6
615 && isipv6 == NULL /* IPv6 support is not yet */
616 #endif /* INET6 */
617 ) {
618 /*
619 * Diverted. Pretend to be the destination.
620 * already got one like this?
621 */
622 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
623 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
624 if (!inp) {
625 /*
626 * No, then it's new. Try find the ambushing socket
627 */
628 if (!ip_fw_fwd_addr->sin_port) {
629 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
630 th->th_sport, ip_fw_fwd_addr->sin_addr,
631 th->th_dport, 1, m->m_pkthdr.rcvif);
632 } else {
633 inp = in_pcblookup_hash(&tcbinfo,
634 ip->ip_src, th->th_sport,
635 ip_fw_fwd_addr->sin_addr,
636 ntohs(ip_fw_fwd_addr->sin_port), 1,
637 m->m_pkthdr.rcvif);
638 }
639 }
640 ip_fw_fwd_addr = NULL;
641 } else
642 #endif /* IPFIREWALL_FORWARD */
643 {
644 #if INET6
645 if (isipv6)
646 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
647 &ip6->ip6_dst, th->th_dport, 1,
648 m->m_pkthdr.rcvif);
649 else
650 #endif /* INET6 */
651 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
652 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
653 }
654
655 #if IPSEC
656 #if INET6
657 if (isipv6) {
658 if (ipsec_bypass == 0 && inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
659 ipsec6stat.in_polvio++;
660 goto drop;
661 }
662 } else
663 #endif /* INET6 */
664 if (ipsec_bypass == 0 && inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
665 ipsecstat.in_polvio++;
666 goto drop;
667 }
668 #endif /*IPSEC*/
669
670 /*
671 * If the state is CLOSED (i.e., TCB does not exist) then
672 * all data in the incoming segment is discarded.
673 * If the TCB exists but is in CLOSED state, it is embryonic,
674 * but should either do a listen or a connect soon.
675 */
676 if (inp == NULL) {
677 if (log_in_vain) {
678 #if INET6
679 char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN];
680 #else /* INET6 */
681 char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"];
682 #endif /* INET6 */
683
684 #if INET6
685 if (isipv6) {
686 strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst));
687 strcpy(sbuf, ip6_sprintf(&ip6->ip6_src));
688 } else
689 #endif
690 {
691 strcpy(dbuf, inet_ntoa(ip->ip_dst));
692 strcpy(sbuf, inet_ntoa(ip->ip_src));
693 }
694 switch (log_in_vain) {
695 case 1:
696 if(thflags & TH_SYN)
697 log(LOG_INFO,
698 "Connection attempt to TCP %s:%d from %s:%d\n",
699 dbuf, ntohs(th->th_dport),
700 sbuf,
701 ntohs(th->th_sport));
702 break;
703 case 2:
704 log(LOG_INFO,
705 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
706 dbuf, ntohs(th->th_dport), sbuf,
707 ntohs(th->th_sport), thflags);
708 break;
709 default:
710 break;
711 }
712 }
713 if (blackhole) {
714 switch (blackhole) {
715 case 1:
716 if (thflags & TH_SYN)
717 goto drop;
718 break;
719 case 2:
720 goto drop;
721 default:
722 goto drop;
723 }
724 }
725 rstreason = BANDLIM_RST_CLOSEDPORT;
726 goto dropwithreset;
727 }
728 tp = intotcpcb(inp);
729 if (tp == 0) {
730 rstreason = BANDLIM_RST_CLOSEDPORT;
731 goto dropwithreset;
732 }
733 if (tp->t_state == TCPS_CLOSED)
734 goto drop;
735
736 #ifdef __APPLE__
737 /*
738 * Bogus state when listening port owned by SharedIP with loopback as the
739 * only configured interface: BlueBox does not filter loopback
740 */
741 if (tp->t_state == TCP_NSTATES)
742 goto drop;
743 #endif
744
745 /* Unscale the window into a 32-bit value. */
746 if ((thflags & TH_SYN) == 0)
747 tiwin = th->th_win << tp->snd_scale;
748 else
749 tiwin = th->th_win;
750
751 so = inp->inp_socket;
752 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
753 #if TCPDEBUG
754 if (so->so_options & SO_DEBUG) {
755 ostate = tp->t_state;
756 #if INET6
757 if (isipv6)
758 bcopy((char *)ip6, (char *)tcp_saveipgen,
759 sizeof(*ip6));
760 else
761 #endif /* INET6 */
762 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
763 tcp_savetcp = *th;
764 }
765 #endif
766 if (so->so_options & SO_ACCEPTCONN) {
767 register struct tcpcb *tp0 = tp;
768 struct socket *so2;
769 #if IPSEC
770 struct socket *oso;
771 #endif
772 #if INET6
773 struct inpcb *oinp = sotoinpcb(so);
774 #endif /* INET6 */
775
776 #if !IPSEC
777 /*
778 * Current IPsec implementation makes incorrect IPsec
779 * cache if this check is done here.
780 * So delay this until duplicated socket is created.
781 */
782 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
783 /*
784 * Note: dropwithreset makes sure we don't
785 * send a RST in response to a RST.
786 */
787 if (thflags & TH_ACK) {
788 tcpstat.tcps_badsyn++;
789 rstreason = BANDLIM_RST_OPENPORT;
790 goto dropwithreset;
791 }
792 goto drop;
793 }
794 #endif
795 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
796
797 #if INET6
798 /*
799 * If deprecated address is forbidden,
800 * we do not accept SYN to deprecated interface
801 * address to prevent any new inbound connection from
802 * getting established.
803 * When we do not accept SYN, we send a TCP RST,
804 * with deprecated source address (instead of dropping
805 * it). We compromise it as it is much better for peer
806 * to send a RST, and RST will be the final packet
807 * for the exchange.
808 *
809 * If we do not forbid deprecated addresses, we accept
810 * the SYN packet. RFC2462 does not suggest dropping
811 * SYN in this case.
812 * If we decipher RFC2462 5.5.4, it says like this:
813 * 1. use of deprecated addr with existing
814 * communication is okay - "SHOULD continue to be
815 * used"
816 * 2. use of it with new communication:
817 * (2a) "SHOULD NOT be used if alternate address
818 * with sufficient scope is available"
819 * (2b) nothing mentioned otherwise.
820 * Here we fall into (2b) case as we have no choice in
821 * our source address selection - we must obey the peer.
822 *
823 * The wording in RFC2462 is confusing, and there are
824 * multiple description text for deprecated address
825 * handling - worse, they are not exactly the same.
826 * I believe 5.5.4 is the best one, so we follow 5.5.4.
827 */
828 if (isipv6 && !ip6_use_deprecated) {
829 struct in6_ifaddr *ia6;
830
831 if ((ia6 = ip6_getdstifaddr(m)) &&
832 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
833 tp = NULL;
834 rstreason = BANDLIM_RST_OPENPORT;
835 goto dropwithreset;
836 }
837 }
838 #endif
839
840 so2 = sonewconn(so, 0);
841 if (so2 == 0) {
842 tcpstat.tcps_listendrop++;
843 so2 = sodropablereq(so);
844 if (so2) {
845 if (tcp_lq_overflow)
846 sototcpcb(so2)->t_flags |=
847 TF_LQ_OVERFLOW;
848 tcp_drop(sototcpcb(so2), ETIMEDOUT);
849 so2 = sonewconn(so, 0);
850 }
851 if (!so2)
852 goto drop;
853 }
854 #if IPSEC
855 oso = so;
856 #endif
857 so = so2;
858 /*
859 * This is ugly, but ....
860 *
861 * Mark socket as temporary until we're
862 * committed to keeping it. The code at
863 * ``drop'' and ``dropwithreset'' check the
864 * flag dropsocket to see if the temporary
865 * socket created here should be discarded.
866 * We mark the socket as discardable until
867 * we're committed to it below in TCPS_LISTEN.
868 */
869 dropsocket++;
870 inp = (struct inpcb *)so->so_pcb;
871 #if INET6
872 if (isipv6)
873 inp->in6p_laddr = ip6->ip6_dst;
874 else {
875 inp->inp_vflag &= ~INP_IPV6;
876 inp->inp_vflag |= INP_IPV4;
877 #endif /* INET6 */
878 inp->inp_laddr = ip->ip_dst;
879 #if INET6
880 }
881 #endif /* INET6 */
882 inp->inp_lport = th->th_dport;
883 if (in_pcbinshash(inp) != 0) {
884 /*
885 * Undo the assignments above if we failed to
886 * put the PCB on the hash lists.
887 */
888 #if INET6
889 if (isipv6)
890 inp->in6p_laddr = in6addr_any;
891 else
892 #endif /* INET6 */
893 inp->inp_laddr.s_addr = INADDR_ANY;
894 inp->inp_lport = 0;
895 goto drop;
896 }
897 #if IPSEC
898 /*
899 * To avoid creating incorrectly cached IPsec
900 * association, this needs to be done here.
901 *
902 * Subject: (KAME-snap 748)
903 * From: Wayne Knowles <w.knowles@niwa.cri.nz>
904 * ftp://ftp.kame.net/pub/mail-list/snap-users/748
905 */
906 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
907 /*
908 * Note: dropwithreset makes sure we don't
909 * send a RST in response to a RST.
910 */
911 if (thflags & TH_ACK) {
912 tcpstat.tcps_badsyn++;
913 rstreason = BANDLIM_RST_OPENPORT;
914 goto dropwithreset;
915 }
916 goto drop;
917 }
918 #endif
919 #if INET6
920 if (isipv6) {
921 /*
922 * Inherit socket options from the listening
923 * socket.
924 * Note that in6p_inputopts are not (even
925 * should not be) copied, since it stores
926 * previously received options and is used to
927 * detect if each new option is different than
928 * the previous one and hence should be passed
929 * to a user.
930 * If we copied in6p_inputopts, a user would
931 * not be able to receive options just after
932 * calling the accept system call.
933 */
934 inp->inp_flags |=
935 oinp->inp_flags & INP_CONTROLOPTS;
936 if (oinp->in6p_outputopts)
937 inp->in6p_outputopts =
938 ip6_copypktopts(oinp->in6p_outputopts,
939 M_NOWAIT);
940 } else
941 #endif /* INET6 */
942 inp->inp_options = ip_srcroute();
943 #if IPSEC
944 /* copy old policy into new socket's */
945 if (sotoinpcb(oso)->inp_sp)
946 {
947 int error = 0;
948 /* Is it a security hole here to silently fail to copy the policy? */
949 if (inp->inp_sp != NULL)
950 error = ipsec_init_policy(so, &inp->inp_sp);
951 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
952 printf("tcp_input: could not copy policy\n");
953 }
954 #endif
955 tp = intotcpcb(inp);
956 tp->t_state = TCPS_LISTEN;
957 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
958
959 /* Compute proper scaling value from buffer space */
960 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
961 TCP_MAXWIN << tp->request_r_scale <
962 so->so_rcv.sb_hiwat)
963 tp->request_r_scale++;
964
965 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
966 }
967 }
968
969 /*
970 * Segment received on connection.
971 * Reset idle time and keep-alive timer.
972 */
973 tp->t_rcvtime = 0;
974 if (TCPS_HAVEESTABLISHED(tp->t_state))
975 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
976
977 /*
978 * Process options if not in LISTEN state,
979 * else do it below (after getting remote address).
980 */
981 if (tp->t_state != TCPS_LISTEN && optp)
982 tcp_dooptions(tp, optp, optlen, th, &to);
983
984 /*
985 * Header prediction: check for the two common cases
986 * of a uni-directional data xfer. If the packet has
987 * no control flags, is in-sequence, the window didn't
988 * change and we're not retransmitting, it's a
989 * candidate. If the length is zero and the ack moved
990 * forward, we're the sender side of the xfer. Just
991 * free the data acked & wake any higher level process
992 * that was blocked waiting for space. If the length
993 * is non-zero and the ack didn't move, we're the
994 * receiver side. If we're getting packets in-order
995 * (the reassembly queue is empty), add the data to
996 * the socket buffer and note that we need a delayed ack.
997 * Make sure that the hidden state-flags are also off.
998 * Since we check for TCPS_ESTABLISHED above, it can only
999 * be TH_NEEDSYN.
1000 */
1001 if (tp->t_state == TCPS_ESTABLISHED &&
1002 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1003 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1004 ((to.to_flag & TOF_TS) == 0 ||
1005 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1006 /*
1007 * Using the CC option is compulsory if once started:
1008 * the segment is OK if no T/TCP was negotiated or
1009 * if the segment has a CC option equal to CCrecv
1010 */
1011 ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
1012 ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
1013 th->th_seq == tp->rcv_nxt &&
1014 tiwin && tiwin == tp->snd_wnd &&
1015 tp->snd_nxt == tp->snd_max) {
1016
1017 /*
1018 * If last ACK falls within this segment's sequence numbers,
1019 * record the timestamp.
1020 * NOTE that the test is modified according to the latest
1021 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1022 */
1023 if ((to.to_flag & TOF_TS) != 0 &&
1024 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1025 tp->ts_recent_age = tcp_now;
1026 tp->ts_recent = to.to_tsval;
1027 }
1028
1029 if (tlen == 0) {
1030 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1031 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1032 tp->snd_cwnd >= tp->snd_wnd &&
1033 tp->t_dupacks < tcprexmtthresh) {
1034 /*
1035 * this is a pure ack for outstanding data.
1036 */
1037 ++tcpstat.tcps_predack;
1038 /*
1039 * "bad retransmit" recovery
1040 */
1041 if (tp->t_rxtshift == 1 &&
1042 tcp_now < tp->t_badrxtwin) {
1043 tp->snd_cwnd = tp->snd_cwnd_prev;
1044 tp->snd_ssthresh =
1045 tp->snd_ssthresh_prev;
1046 tp->snd_nxt = tp->snd_max;
1047 tp->t_badrxtwin = 0;
1048 }
1049 if ((to.to_flag & TOF_TS) != 0)
1050 tcp_xmit_timer(tp,
1051 tcp_now - to.to_tsecr + 1);
1052 else if (tp->t_rtttime &&
1053 SEQ_GT(th->th_ack, tp->t_rtseq))
1054 tcp_xmit_timer(tp, tp->t_rtttime);
1055 acked = th->th_ack - tp->snd_una;
1056 tcpstat.tcps_rcvackpack++;
1057 tcpstat.tcps_rcvackbyte += acked;
1058 sbdrop(&so->so_snd, acked);
1059 tp->snd_una = th->th_ack;
1060 m_freem(m);
1061 ND6_HINT(tp); /* some progress has been done */
1062
1063 /*
1064 * If all outstanding data are acked, stop
1065 * retransmit timer, otherwise restart timer
1066 * using current (possibly backed-off) value.
1067 * If process is waiting for space,
1068 * wakeup/selwakeup/signal. If data
1069 * are ready to send, let tcp_output
1070 * decide between more output or persist.
1071 */
1072 if (tp->snd_una == tp->snd_max)
1073 tp->t_timer[TCPT_REXMT] = 0;
1074 else if (tp->t_timer[TCPT_PERSIST] == 0)
1075 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1076
1077 if (so->so_snd.sb_cc)
1078 (void) tcp_output(tp);
1079 sowwakeup(so);
1080 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1081 return;
1082 }
1083 } else if (th->th_ack == tp->snd_una &&
1084 LIST_EMPTY(&tp->t_segq) &&
1085 tlen <= sbspace(&so->so_rcv)) {
1086 /*
1087 * this is a pure, in-sequence data packet
1088 * with nothing on the reassembly queue and
1089 * we have enough buffer space to take it.
1090 */
1091 ++tcpstat.tcps_preddat;
1092 tp->rcv_nxt += tlen;
1093 tcpstat.tcps_rcvpack++;
1094 tcpstat.tcps_rcvbyte += tlen;
1095 ND6_HINT(tp); /* some progress has been done */
1096 /*
1097 * Add data to socket buffer.
1098 */
1099 m_adj(m, drop_hdrlen); /* delayed header drop */
1100 sbappend(&so->so_rcv, m);
1101 #if INET6
1102 if (isipv6) {
1103 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1104 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1105 th->th_seq, th->th_ack, th->th_win);
1106 }
1107 else
1108 #endif
1109 {
1110 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1111 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1112 th->th_seq, th->th_ack, th->th_win);
1113 }
1114 if (tcp_delack_enabled) {
1115 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
1116 tp->t_flags |= TF_DELACK;
1117 } else {
1118 tp->t_flags |= TF_ACKNOW;
1119 tcp_output(tp);
1120 }
1121 sorwakeup(so);
1122 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1123 return;
1124 }
1125 }
1126
1127 /*
1128 * Calculate amount of space in receive window,
1129 * and then do TCP input processing.
1130 * Receive window is amount of space in rcv queue,
1131 * but not less than advertised window.
1132 */
1133 { int win;
1134
1135 win = sbspace(&so->so_rcv);
1136 if (win < 0)
1137 win = 0;
1138 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1139 }
1140
1141 switch (tp->t_state) {
1142
1143 /*
1144 * If the state is LISTEN then ignore segment if it contains an RST.
1145 * If the segment contains an ACK then it is bad and send a RST.
1146 * If it does not contain a SYN then it is not interesting; drop it.
1147 * If it is from this socket, drop it, it must be forged.
1148 * Don't bother responding if the destination was a broadcast.
1149 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
1150 * tp->iss, and send a segment:
1151 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
1152 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
1153 * Fill in remote peer address fields if not previously specified.
1154 * Enter SYN_RECEIVED state, and process any other fields of this
1155 * segment in this state.
1156 */
1157 case TCPS_LISTEN: {
1158 register struct sockaddr_in *sin;
1159 #if INET6
1160 register struct sockaddr_in6 *sin6;
1161 #endif
1162
1163 if (thflags & TH_RST)
1164 goto drop;
1165 if (thflags & TH_ACK) {
1166 rstreason = BANDLIM_RST_OPENPORT;
1167 goto dropwithreset;
1168 }
1169 if ((thflags & TH_SYN) == 0)
1170 goto drop;
1171 if (th->th_dport == th->th_sport) {
1172 #if INET6
1173 if (isipv6) {
1174 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1175 &ip6->ip6_src))
1176 goto drop;
1177 } else
1178 #endif /* INET6 */
1179 if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1180 goto drop;
1181 }
1182 /*
1183 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1184 * in_broadcast() should never return true on a received
1185 * packet with M_BCAST not set.
1186 *
1187 * Packets with a multicast source address should also
1188 * be discarded.
1189 */
1190 if (m->m_flags & (M_BCAST|M_MCAST))
1191 goto drop;
1192 #if INET6
1193 if (isipv6) {
1194 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1195 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1196 goto drop;
1197 } else
1198 #endif
1199 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1200 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1201 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
1202 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1203 goto drop;
1204 #if INET6
1205 if (isipv6) {
1206 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
1207 M_SONAME, M_NOWAIT);
1208 if (sin6 == NULL)
1209 goto drop;
1210 bzero(sin6, sizeof(*sin6));
1211 sin6->sin6_family = AF_INET6;
1212 sin6->sin6_len = sizeof(*sin6);
1213 sin6->sin6_addr = ip6->ip6_src;
1214 sin6->sin6_port = th->th_sport;
1215 laddr6 = inp->in6p_laddr;
1216 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1217 inp->in6p_laddr = ip6->ip6_dst;
1218 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
1219 proc0)) {
1220 inp->in6p_laddr = laddr6;
1221 FREE(sin6, M_SONAME);
1222 goto drop;
1223 }
1224 FREE(sin6, M_SONAME);
1225 } else
1226 #endif
1227 {
1228 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
1229 M_NOWAIT);
1230 if (sin == NULL)
1231 goto drop;
1232 sin->sin_family = AF_INET;
1233 sin->sin_len = sizeof(*sin);
1234 sin->sin_addr = ip->ip_src;
1235 sin->sin_port = th->th_sport;
1236 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
1237 laddr = inp->inp_laddr;
1238 if (inp->inp_laddr.s_addr == INADDR_ANY)
1239 inp->inp_laddr = ip->ip_dst;
1240 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0)) {
1241 inp->inp_laddr = laddr;
1242 FREE(sin, M_SONAME);
1243 goto drop;
1244 }
1245 FREE(sin, M_SONAME);
1246 }
1247 if ((taop = tcp_gettaocache(inp)) == NULL) {
1248 taop = &tao_noncached;
1249 bzero(taop, sizeof(*taop));
1250 }
1251 tcp_dooptions(tp, optp, optlen, th, &to);
1252 if (iss)
1253 tp->iss = iss;
1254 else {
1255 tp->iss = tcp_new_isn(tp);
1256 }
1257 tp->irs = th->th_seq;
1258 tcp_sendseqinit(tp);
1259 tcp_rcvseqinit(tp);
1260 tp->snd_recover = tp->snd_una;
1261 /*
1262 * Initialization of the tcpcb for transaction;
1263 * set SND.WND = SEG.WND,
1264 * initialize CCsend and CCrecv.
1265 */
1266 tp->snd_wnd = tiwin; /* initial send-window */
1267 tp->cc_send = CC_INC(tcp_ccgen);
1268 tp->cc_recv = to.to_cc;
1269 /*
1270 * Perform TAO test on incoming CC (SEG.CC) option, if any.
1271 * - compare SEG.CC against cached CC from the same host,
1272 * if any.
1273 * - if SEG.CC > cached value, SYN must be new and is accepted
1274 * immediately: save new CC in the cache, mark the socket
1275 * connected, enter ESTABLISHED state, turn on flag to
1276 * send a SYN in the next segment.
1277 * A virtual advertised window is set in rcv_adv to
1278 * initialize SWS prevention. Then enter normal segment
1279 * processing: drop SYN, process data and FIN.
1280 * - otherwise do a normal 3-way handshake.
1281 */
1282 if ((to.to_flag & TOF_CC) != 0) {
1283 if (((tp->t_flags & TF_NOPUSH) != 0) &&
1284 taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) {
1285
1286 taop->tao_cc = to.to_cc;
1287
1288 tp->t_state = TCPS_ESTABLISHED;
1289
1290 /*
1291 * If there is a FIN, or if there is data and the
1292 * connection is local, then delay SYN,ACK(SYN) in
1293 * the hope of piggy-backing it on a response
1294 * segment. Otherwise must send ACK now in case
1295 * the other side is slow starting.
1296 */
1297 if (tcp_delack_enabled && ((thflags & TH_FIN) ||
1298 (tlen != 0 &&
1299 #if INET6
1300 (isipv6 && in6_localaddr(&inp->in6p_faddr))
1301 ||
1302 (!isipv6 &&
1303 #endif /* INET6 */
1304 in_localaddr(inp->inp_faddr)
1305 #if INET6
1306 )
1307 #endif /* INET6 */
1308 ))) {
1309 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
1310 tp->t_flags |= (TF_DELACK | TF_NEEDSYN);
1311 }
1312 else
1313 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1314
1315 /*
1316 * Limit the `virtual advertised window' to TCP_MAXWIN
1317 * here. Even if we requested window scaling, it will
1318 * become effective only later when our SYN is acked.
1319 */
1320 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN);
1321 tcpstat.tcps_connects++;
1322 soisconnected(so);
1323 tp->t_timer[TCPT_KEEP] = tcp_keepinit;
1324 dropsocket = 0; /* committed to socket */
1325 tcpstat.tcps_accepts++;
1326 goto trimthenstep6;
1327 }
1328 /* else do standard 3-way handshake */
1329 } else {
1330 /*
1331 * No CC option, but maybe CC.NEW:
1332 * invalidate cached value.
1333 */
1334 taop->tao_cc = 0;
1335 }
1336 /*
1337 * TAO test failed or there was no CC option,
1338 * do a standard 3-way handshake.
1339 */
1340 tp->t_flags |= TF_ACKNOW;
1341 tp->t_state = TCPS_SYN_RECEIVED;
1342 tp->t_timer[TCPT_KEEP] = tcp_keepinit;
1343 dropsocket = 0; /* committed to socket */
1344 tcpstat.tcps_accepts++;
1345 goto trimthenstep6;
1346 }
1347
1348 /*
1349 * If the state is SYN_RECEIVED:
1350 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
1351 */
1352 case TCPS_SYN_RECEIVED:
1353 if ((thflags & TH_ACK) &&
1354 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1355 SEQ_GT(th->th_ack, tp->snd_max))) {
1356 rstreason = BANDLIM_RST_OPENPORT;
1357 goto dropwithreset;
1358 }
1359 break;
1360
1361 /*
1362 * If the state is SYN_SENT:
1363 * if seg contains an ACK, but not for our SYN, drop the input.
1364 * if seg contains a RST, then drop the connection.
1365 * if seg does not contain SYN, then drop it.
1366 * Otherwise this is an acceptable SYN segment
1367 * initialize tp->rcv_nxt and tp->irs
1368 * if seg contains ack then advance tp->snd_una
1369 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1370 * arrange for segment to be acked (eventually)
1371 * continue processing rest of data/controls, beginning with URG
1372 */
1373 case TCPS_SYN_SENT:
1374 if ((taop = tcp_gettaocache(inp)) == NULL) {
1375 taop = &tao_noncached;
1376 bzero(taop, sizeof(*taop));
1377 }
1378
1379 if ((thflags & TH_ACK) &&
1380 (SEQ_LEQ(th->th_ack, tp->iss) ||
1381 SEQ_GT(th->th_ack, tp->snd_max))) {
1382 /*
1383 * If we have a cached CCsent for the remote host,
1384 * hence we haven't just crashed and restarted,
1385 * do not send a RST. This may be a retransmission
1386 * from the other side after our earlier ACK was lost.
1387 * Our new SYN, when it arrives, will serve as the
1388 * needed ACK.
1389 */
1390 if (taop->tao_ccsent != 0)
1391 goto drop;
1392 else {
1393 rstreason = BANDLIM_UNLIMITED;
1394 goto dropwithreset;
1395 }
1396 }
1397 if (thflags & TH_RST) {
1398 if (thflags & TH_ACK) {
1399 tp = tcp_drop(tp, ECONNREFUSED);
1400 postevent(so, 0, EV_RESET);
1401 }
1402 goto drop;
1403 }
1404 if ((thflags & TH_SYN) == 0)
1405 goto drop;
1406 tp->snd_wnd = th->th_win; /* initial send window */
1407 tp->cc_recv = to.to_cc; /* foreign CC */
1408
1409 tp->irs = th->th_seq;
1410 tcp_rcvseqinit(tp);
1411 if (thflags & TH_ACK) {
1412 /*
1413 * Our SYN was acked. If segment contains CC.ECHO
1414 * option, check it to make sure this segment really
1415 * matches our SYN. If not, just drop it as old
1416 * duplicate, but send an RST if we're still playing
1417 * by the old rules. If no CC.ECHO option, make sure
1418 * we don't get fooled into using T/TCP.
1419 */
1420 if (to.to_flag & TOF_CCECHO) {
1421 if (tp->cc_send != to.to_ccecho) {
1422 if (taop->tao_ccsent != 0)
1423 goto drop;
1424 else {
1425 rstreason = BANDLIM_UNLIMITED;
1426 goto dropwithreset;
1427 }
1428 }
1429 } else
1430 tp->t_flags &= ~TF_RCVD_CC;
1431 tcpstat.tcps_connects++;
1432 soisconnected(so);
1433 /* Do window scaling on this connection? */
1434 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1435 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1436 tp->snd_scale = tp->requested_s_scale;
1437 tp->rcv_scale = tp->request_r_scale;
1438 }
1439 /* Segment is acceptable, update cache if undefined. */
1440 if (taop->tao_ccsent == 0)
1441 taop->tao_ccsent = to.to_ccecho;
1442
1443 tp->rcv_adv += tp->rcv_wnd;
1444 tp->snd_una++; /* SYN is acked */
1445 /*
1446 * If there's data, delay ACK; if there's also a FIN
1447 * ACKNOW will be turned on later.
1448 */
1449 if (tcp_delack_enabled && tlen != 0) {
1450 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
1451 tp->t_flags |= TF_DELACK;
1452 }
1453 else
1454 tp->t_flags |= TF_ACKNOW;
1455 /*
1456 * Received <SYN,ACK> in SYN_SENT[*] state.
1457 * Transitions:
1458 * SYN_SENT --> ESTABLISHED
1459 * SYN_SENT* --> FIN_WAIT_1
1460 */
1461 if (tp->t_flags & TF_NEEDFIN) {
1462 tp->t_state = TCPS_FIN_WAIT_1;
1463 tp->t_flags &= ~TF_NEEDFIN;
1464 thflags &= ~TH_SYN;
1465 } else {
1466 tp->t_state = TCPS_ESTABLISHED;
1467 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
1468 }
1469 } else {
1470 /*
1471 * Received initial SYN in SYN-SENT[*] state => simul-
1472 * taneous open. If segment contains CC option and there is
1473 * a cached CC, apply TAO test; if it succeeds, connection is
1474 * half-synchronized. Otherwise, do 3-way handshake:
1475 * SYN-SENT -> SYN-RECEIVED
1476 * SYN-SENT* -> SYN-RECEIVED*
1477 * If there was no CC option, clear cached CC value.
1478 */
1479 tp->t_flags |= TF_ACKNOW;
1480 tp->t_timer[TCPT_REXMT] = 0;
1481 if (to.to_flag & TOF_CC) {
1482 if (taop->tao_cc != 0 &&
1483 CC_GT(to.to_cc, taop->tao_cc)) {
1484 /*
1485 * update cache and make transition:
1486 * SYN-SENT -> ESTABLISHED*
1487 * SYN-SENT* -> FIN-WAIT-1*
1488 */
1489 taop->tao_cc = to.to_cc;
1490 if (tp->t_flags & TF_NEEDFIN) {
1491 tp->t_state = TCPS_FIN_WAIT_1;
1492 tp->t_flags &= ~TF_NEEDFIN;
1493 } else {
1494 tp->t_state = TCPS_ESTABLISHED;
1495 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
1496 }
1497 tp->t_flags |= TF_NEEDSYN;
1498 } else
1499 tp->t_state = TCPS_SYN_RECEIVED;
1500 } else {
1501 /* CC.NEW or no option => invalidate cache */
1502 taop->tao_cc = 0;
1503 tp->t_state = TCPS_SYN_RECEIVED;
1504 }
1505 }
1506
1507 trimthenstep6:
1508 /*
1509 * Advance th->th_seq to correspond to first data byte.
1510 * If data, trim to stay within window,
1511 * dropping FIN if necessary.
1512 */
1513 th->th_seq++;
1514 if (tlen > tp->rcv_wnd) {
1515 todrop = tlen - tp->rcv_wnd;
1516 m_adj(m, -todrop);
1517 tlen = tp->rcv_wnd;
1518 thflags &= ~TH_FIN;
1519 tcpstat.tcps_rcvpackafterwin++;
1520 tcpstat.tcps_rcvbyteafterwin += todrop;
1521 }
1522 tp->snd_wl1 = th->th_seq - 1;
1523 tp->rcv_up = th->th_seq;
1524 /*
1525 * Client side of transaction: already sent SYN and data.
1526 * If the remote host used T/TCP to validate the SYN,
1527 * our data will be ACK'd; if so, enter normal data segment
1528 * processing in the middle of step 5, ack processing.
1529 * Otherwise, goto step 6.
1530 */
1531 if (thflags & TH_ACK)
1532 goto process_ACK;
1533 goto step6;
1534 /*
1535 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1536 * if segment contains a SYN and CC [not CC.NEW] option:
1537 * if state == TIME_WAIT and connection duration > MSL,
1538 * drop packet and send RST;
1539 *
1540 * if SEG.CC > CCrecv then is new SYN, and can implicitly
1541 * ack the FIN (and data) in retransmission queue.
1542 * Complete close and delete TCPCB. Then reprocess
1543 * segment, hoping to find new TCPCB in LISTEN state;
1544 *
1545 * else must be old SYN; drop it.
1546 * else do normal processing.
1547 */
1548 case TCPS_LAST_ACK:
1549 case TCPS_CLOSING:
1550 case TCPS_TIME_WAIT:
1551 if ((thflags & TH_SYN) &&
1552 (to.to_flag & TOF_CC) && tp->cc_recv != 0) {
1553 if (tp->t_state == TCPS_TIME_WAIT &&
1554 tp->t_starttime > tcp_msl) {
1555 rstreason = BANDLIM_UNLIMITED;
1556 goto dropwithreset;
1557 }
1558 if (CC_GT(to.to_cc, tp->cc_recv)) {
1559 tp = tcp_close(tp);
1560 goto findpcb;
1561 }
1562 else
1563 goto drop;
1564 }
1565 break; /* continue normal processing */
1566 }
1567
1568 /*
1569 * States other than LISTEN or SYN_SENT.
1570 * First check the RST flag and sequence number since reset segments
1571 * are exempt from the timestamp and connection count tests. This
1572 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1573 * below which allowed reset segments in half the sequence space
1574 * to fall through and be processed (which gives forged reset
1575 * segments with a random sequence number a 50 percent chance of
1576 * killing a connection).
1577 * Then check timestamp, if present.
1578 * Then check the connection count, if present.
1579 * Then check that at least some bytes of segment are within
1580 * receive window. If segment begins before rcv_nxt,
1581 * drop leading data (and SYN); if nothing left, just ack.
1582 *
1583 *
1584 * If the RST bit is set, check the sequence number to see
1585 * if this is a valid reset segment.
1586 * RFC 793 page 37:
1587 * In all states except SYN-SENT, all reset (RST) segments
1588 * are validated by checking their SEQ-fields. A reset is
1589 * valid if its sequence number is in the window.
1590 * Note: this does not take into account delayed ACKs, so
1591 * we should test against last_ack_sent instead of rcv_nxt.
1592 * The sequence number in the reset segment is normally an
1593 * echo of our outgoing acknowledgement numbers, but some hosts
1594 * send a reset with the sequence number at the rightmost edge
1595 * of our receive window, and we have to handle this case.
1596 * If we have multiple segments in flight, the initial reset
1597 * segment sequence numbers will be to the left of last_ack_sent,
1598 * but they will eventually catch up.
1599 * In any case, it never made sense to trim reset segments to
1600 * fit the receive window since RFC 1122 says:
1601 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
1602 *
1603 * A TCP SHOULD allow a received RST segment to include data.
1604 *
1605 * DISCUSSION
1606 * It has been suggested that a RST segment could contain
1607 * ASCII text that encoded and explained the cause of the
1608 * RST. No standard has yet been established for such
1609 * data.
1610 *
1611 * If the reset segment passes the sequence number test examine
1612 * the state:
1613 * SYN_RECEIVED STATE:
1614 * If passive open, return to LISTEN state.
1615 * If active open, inform user that connection was refused.
1616 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1617 * Inform user that connection was reset, and close tcb.
1618 * CLOSING, LAST_ACK STATES:
1619 * Close the tcb.
1620 * TIME_WAIT STATE:
1621 * Drop the segment - see Stevens, vol. 2, p. 964 and
1622 * RFC 1337.
1623 */
1624 if (thflags & TH_RST) {
1625 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1626 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1627 switch (tp->t_state) {
1628
1629 case TCPS_SYN_RECEIVED:
1630 so->so_error = ECONNREFUSED;
1631 goto close;
1632
1633 case TCPS_ESTABLISHED:
1634 case TCPS_FIN_WAIT_1:
1635 case TCPS_CLOSE_WAIT:
1636 /*
1637 Drop through ...
1638 */
1639 case TCPS_FIN_WAIT_2:
1640 so->so_error = ECONNRESET;
1641 close:
1642 postevent(so, 0, EV_RESET);
1643 tp->t_state = TCPS_CLOSED;
1644 tcpstat.tcps_drops++;
1645 tp = tcp_close(tp);
1646 break;
1647
1648 case TCPS_CLOSING:
1649 case TCPS_LAST_ACK:
1650 tp = tcp_close(tp);
1651 break;
1652
1653 case TCPS_TIME_WAIT:
1654 break;
1655 }
1656 }
1657 goto drop;
1658 }
1659
1660 /*
1661 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1662 * and it's less than ts_recent, drop it.
1663 */
1664 if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent &&
1665 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1666
1667 /* Check to see if ts_recent is over 24 days old. */
1668 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1669 /*
1670 * Invalidate ts_recent. If this segment updates
1671 * ts_recent, the age will be reset later and ts_recent
1672 * will get a valid value. If it does not, setting
1673 * ts_recent to zero will at least satisfy the
1674 * requirement that zero be placed in the timestamp
1675 * echo reply when ts_recent isn't valid. The
1676 * age isn't reset until we get a valid ts_recent
1677 * because we don't want out-of-order segments to be
1678 * dropped when ts_recent is old.
1679 */
1680 tp->ts_recent = 0;
1681 } else {
1682 tcpstat.tcps_rcvduppack++;
1683 tcpstat.tcps_rcvdupbyte += tlen;
1684 tcpstat.tcps_pawsdrop++;
1685 goto dropafterack;
1686 }
1687 }
1688
1689 /*
1690 * T/TCP mechanism
1691 * If T/TCP was negotiated and the segment doesn't have CC,
1692 * or if its CC is wrong then drop the segment.
1693 * RST segments do not have to comply with this.
1694 */
1695 if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
1696 ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
1697 goto dropafterack;
1698
1699 /*
1700 * In the SYN-RECEIVED state, validate that the packet belongs to
1701 * this connection before trimming the data to fit the receive
1702 * window. Check the sequence number versus IRS since we know
1703 * the sequence numbers haven't wrapped. This is a partial fix
1704 * for the "LAND" DoS attack.
1705 */
1706 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1707 rstreason = BANDLIM_RST_OPENPORT;
1708 goto dropwithreset;
1709 }
1710
1711 todrop = tp->rcv_nxt - th->th_seq;
1712 if (todrop > 0) {
1713 if (thflags & TH_SYN) {
1714 thflags &= ~TH_SYN;
1715 th->th_seq++;
1716 if (th->th_urp > 1)
1717 th->th_urp--;
1718 else
1719 thflags &= ~TH_URG;
1720 todrop--;
1721 }
1722 /*
1723 * Following if statement from Stevens, vol. 2, p. 960.
1724 */
1725 if (todrop > tlen
1726 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1727 /*
1728 * Any valid FIN must be to the left of the window.
1729 * At this point the FIN must be a duplicate or out
1730 * of sequence; drop it.
1731 */
1732 thflags &= ~TH_FIN;
1733
1734 /*
1735 * Send an ACK to resynchronize and drop any data.
1736 * But keep on processing for RST or ACK.
1737 */
1738 tp->t_flags |= TF_ACKNOW;
1739 todrop = tlen;
1740 tcpstat.tcps_rcvduppack++;
1741 tcpstat.tcps_rcvdupbyte += todrop;
1742 } else {
1743 tcpstat.tcps_rcvpartduppack++;
1744 tcpstat.tcps_rcvpartdupbyte += todrop;
1745 }
1746 drop_hdrlen += todrop; /* drop from the top afterwards */
1747 th->th_seq += todrop;
1748 tlen -= todrop;
1749 if (th->th_urp > todrop)
1750 th->th_urp -= todrop;
1751 else {
1752 thflags &= ~TH_URG;
1753 th->th_urp = 0;
1754 }
1755 }
1756
1757 /*
1758 * If new data are received on a connection after the
1759 * user processes are gone, then RST the other end.
1760 */
1761 if ((so->so_state & SS_NOFDREF) &&
1762 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1763 tp = tcp_close(tp);
1764 tcpstat.tcps_rcvafterclose++;
1765 rstreason = BANDLIM_UNLIMITED;
1766 goto dropwithreset;
1767 }
1768
1769 /*
1770 * If segment ends after window, drop trailing data
1771 * (and PUSH and FIN); if nothing left, just ACK.
1772 */
1773 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1774 if (todrop > 0) {
1775 tcpstat.tcps_rcvpackafterwin++;
1776 if (todrop >= tlen) {
1777 tcpstat.tcps_rcvbyteafterwin += tlen;
1778 /*
1779 * If a new connection request is received
1780 * while in TIME_WAIT, drop the old connection
1781 * and start over if the sequence numbers
1782 * are above the previous ones.
1783 */
1784 if (thflags & TH_SYN &&
1785 tp->t_state == TCPS_TIME_WAIT &&
1786 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1787 iss = tcp_new_isn(tp);
1788 tp = tcp_close(tp);
1789 goto findpcb;
1790 }
1791 /*
1792 * If window is closed can only take segments at
1793 * window edge, and have to drop data and PUSH from
1794 * incoming segments. Continue processing, but
1795 * remember to ack. Otherwise, drop segment
1796 * and ack.
1797 */
1798 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1799 tp->t_flags |= TF_ACKNOW;
1800 tcpstat.tcps_rcvwinprobe++;
1801 } else
1802 goto dropafterack;
1803 } else
1804 tcpstat.tcps_rcvbyteafterwin += todrop;
1805 m_adj(m, -todrop);
1806 tlen -= todrop;
1807 thflags &= ~(TH_PUSH|TH_FIN);
1808 }
1809
1810 /*
1811 * If last ACK falls within this segment's sequence numbers,
1812 * record its timestamp.
1813 * NOTE that the test is modified according to the latest
1814 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1815 */
1816 if ((to.to_flag & TOF_TS) != 0 &&
1817 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1818 tp->ts_recent_age = tcp_now;
1819 tp->ts_recent = to.to_tsval;
1820 }
1821
1822 /*
1823 * If a SYN is in the window, then this is an
1824 * error and we send an RST and drop the connection.
1825 */
1826 if (thflags & TH_SYN) {
1827 tp = tcp_drop(tp, ECONNRESET);
1828 rstreason = BANDLIM_UNLIMITED;
1829 postevent(so, 0, EV_RESET);
1830 goto dropwithreset;
1831 }
1832
1833 /*
1834 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
1835 * flag is on (half-synchronized state), then queue data for
1836 * later processing; else drop segment and return.
1837 */
1838 if ((thflags & TH_ACK) == 0) {
1839 if (tp->t_state == TCPS_SYN_RECEIVED ||
1840 (tp->t_flags & TF_NEEDSYN))
1841 goto step6;
1842 else
1843 goto drop;
1844 }
1845
1846 /*
1847 * Ack processing.
1848 */
1849 switch (tp->t_state) {
1850
1851 /*
1852 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1853 * ESTABLISHED state and continue processing.
1854 * The ACK was checked above.
1855 */
1856 case TCPS_SYN_RECEIVED:
1857
1858 tcpstat.tcps_connects++;
1859 soisconnected(so);
1860
1861 /* Do window scaling? */
1862 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1863 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1864 tp->snd_scale = tp->requested_s_scale;
1865 tp->rcv_scale = tp->request_r_scale;
1866 }
1867 /*
1868 * Upon successful completion of 3-way handshake,
1869 * update cache.CC if it was undefined, pass any queued
1870 * data to the user, and advance state appropriately.
1871 */
1872 if ((taop = tcp_gettaocache(inp)) != NULL &&
1873 taop->tao_cc == 0)
1874 taop->tao_cc = tp->cc_recv;
1875
1876 /*
1877 * Make transitions:
1878 * SYN-RECEIVED -> ESTABLISHED
1879 * SYN-RECEIVED* -> FIN-WAIT-1
1880 */
1881 if (tp->t_flags & TF_NEEDFIN) {
1882 tp->t_state = TCPS_FIN_WAIT_1;
1883 tp->t_flags &= ~TF_NEEDFIN;
1884 } else {
1885 tp->t_state = TCPS_ESTABLISHED;
1886 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
1887 }
1888 /*
1889 * If segment contains data or FIN, will call tcp_reass()
1890 * later; if not, do so now to pass queued data to user.
1891 */
1892 if (tlen == 0 && (thflags & TH_FIN) == 0)
1893 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
1894 (struct mbuf *)0);
1895 tp->snd_wl1 = th->th_seq - 1;
1896 /* fall into ... */
1897
1898 /*
1899 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1900 * ACKs. If the ack is in the range
1901 * tp->snd_una < th->th_ack <= tp->snd_max
1902 * then advance tp->snd_una to th->th_ack and drop
1903 * data from the retransmission queue. If this ACK reflects
1904 * more up to date window information we update our window information.
1905 */
1906 case TCPS_ESTABLISHED:
1907 case TCPS_FIN_WAIT_1:
1908 case TCPS_FIN_WAIT_2:
1909 case TCPS_CLOSE_WAIT:
1910 case TCPS_CLOSING:
1911 case TCPS_LAST_ACK:
1912 case TCPS_TIME_WAIT:
1913
1914 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1915 if (tlen == 0 && tiwin == tp->snd_wnd) {
1916 tcpstat.tcps_rcvdupack++;
1917 /*
1918 * If we have outstanding data (other than
1919 * a window probe), this is a completely
1920 * duplicate ack (ie, window info didn't
1921 * change), the ack is the biggest we've
1922 * seen and we've seen exactly our rexmt
1923 * threshold of them, assume a packet
1924 * has been dropped and retransmit it.
1925 * Kludge snd_nxt & the congestion
1926 * window so we send only this one
1927 * packet.
1928 *
1929 * We know we're losing at the current
1930 * window size so do congestion avoidance
1931 * (set ssthresh to half the current window
1932 * and pull our congestion window back to
1933 * the new ssthresh).
1934 *
1935 * Dup acks mean that packets have left the
1936 * network (they're now cached at the receiver)
1937 * so bump cwnd by the amount in the receiver
1938 * to keep a constant cwnd packets in the
1939 * network.
1940 */
1941 if (tp->t_timer[TCPT_REXMT] == 0 ||
1942 th->th_ack != tp->snd_una)
1943 tp->t_dupacks = 0;
1944 else if (++tp->t_dupacks == tcprexmtthresh) {
1945 tcp_seq onxt = tp->snd_nxt;
1946 u_int win =
1947 min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1948 tp->t_maxseg;
1949 if (tcp_do_newreno && SEQ_LT(th->th_ack,
1950 tp->snd_recover)) {
1951 /* False retransmit, should not
1952 * cut window
1953 */
1954 tp->snd_cwnd += tp->t_maxseg;
1955 tp->t_dupacks = 0;
1956 (void) tcp_output(tp);
1957 goto drop;
1958 }
1959 if (win < 2)
1960 win = 2;
1961 tp->snd_ssthresh = win * tp->t_maxseg;
1962 tp->snd_recover = tp->snd_max;
1963 tp->t_timer[TCPT_REXMT] = 0;
1964 tp->t_rtttime = 0;
1965 tp->snd_nxt = th->th_ack;
1966 tp->snd_cwnd = tp->t_maxseg;
1967 (void) tcp_output(tp);
1968 tp->snd_cwnd = tp->snd_ssthresh +
1969 tp->t_maxseg * tp->t_dupacks;
1970 if (SEQ_GT(onxt, tp->snd_nxt))
1971 tp->snd_nxt = onxt;
1972 goto drop;
1973 } else if (tp->t_dupacks > tcprexmtthresh) {
1974 tp->snd_cwnd += tp->t_maxseg;
1975 (void) tcp_output(tp);
1976 goto drop;
1977 }
1978 } else
1979 tp->t_dupacks = 0;
1980 break;
1981 }
1982 /*
1983 * If the congestion window was inflated to account
1984 * for the other side's cached packets, retract it.
1985 */
1986 if (tcp_do_newreno == 0) {
1987 if (tp->t_dupacks >= tcprexmtthresh &&
1988 tp->snd_cwnd > tp->snd_ssthresh)
1989 tp->snd_cwnd = tp->snd_ssthresh;
1990 tp->t_dupacks = 0;
1991 } else if (tp->t_dupacks >= tcprexmtthresh &&
1992 !tcp_newreno(tp, th)) {
1993 /*
1994 * Window inflation should have left us with approx.
1995 * snd_ssthresh outstanding data. But in case we
1996 * would be inclined to send a burst, better to do
1997 * it via the slow start mechanism.
1998 */
1999 if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
2000 tp->snd_cwnd =
2001 tp->snd_max - th->th_ack + tp->t_maxseg;
2002 else
2003 tp->snd_cwnd = tp->snd_ssthresh;
2004 tp->t_dupacks = 0;
2005 }
2006
2007 if (tp->t_dupacks < tcprexmtthresh)
2008 tp->t_dupacks = 0;
2009
2010 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2011 tcpstat.tcps_rcvacktoomuch++;
2012 goto dropafterack;
2013 }
2014 /*
2015 * If we reach this point, ACK is not a duplicate,
2016 * i.e., it ACKs something we sent.
2017 */
2018 if (tp->t_flags & TF_NEEDSYN) {
2019 /*
2020 * T/TCP: Connection was half-synchronized, and our
2021 * SYN has been ACK'd (so connection is now fully
2022 * synchronized). Go to non-starred state,
2023 * increment snd_una for ACK of SYN, and check if
2024 * we can do window scaling.
2025 */
2026 tp->t_flags &= ~TF_NEEDSYN;
2027 tp->snd_una++;
2028 /* Do window scaling? */
2029 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2030 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2031 tp->snd_scale = tp->requested_s_scale;
2032 tp->rcv_scale = tp->request_r_scale;
2033 }
2034 }
2035
2036 process_ACK:
2037 acked = th->th_ack - tp->snd_una;
2038 tcpstat.tcps_rcvackpack++;
2039 tcpstat.tcps_rcvackbyte += acked;
2040
2041 /*
2042 * If we just performed our first retransmit, and the ACK
2043 * arrives within our recovery window, then it was a mistake
2044 * to do the retransmit in the first place. Recover our
2045 * original cwnd and ssthresh, and proceed to transmit where
2046 * we left off.
2047 */
2048 if (tp->t_rxtshift == 1 && tcp_now < tp->t_badrxtwin) {
2049 tp->snd_cwnd = tp->snd_cwnd_prev;
2050 tp->snd_ssthresh = tp->snd_ssthresh_prev;
2051 tp->snd_nxt = tp->snd_max;
2052 tp->t_badrxtwin = 0; /* XXX probably not required */
2053 }
2054
2055 /*
2056 * If we have a timestamp reply, update smoothed
2057 * round trip time. If no timestamp is present but
2058 * transmit timer is running and timed sequence
2059 * number was acked, update smoothed round trip time.
2060 * Since we now have an rtt measurement, cancel the
2061 * timer backoff (cf., Phil Karn's retransmit alg.).
2062 * Recompute the initial retransmit timer.
2063 */
2064 if (to.to_flag & TOF_TS)
2065 tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1);
2066 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2067 tcp_xmit_timer(tp, tp->t_rtttime);
2068
2069 /*
2070 * If all outstanding data is acked, stop retransmit
2071 * timer and remember to restart (more output or persist).
2072 * If there is more data to be acked, restart retransmit
2073 * timer, using current (possibly backed-off) value.
2074 */
2075 if (th->th_ack == tp->snd_max) {
2076 tp->t_timer[TCPT_REXMT] = 0;
2077 needoutput = 1;
2078 } else if (tp->t_timer[TCPT_PERSIST] == 0)
2079 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
2080
2081 /*
2082 * If no data (only SYN) was ACK'd,
2083 * skip rest of ACK processing.
2084 */
2085 if (acked == 0)
2086 goto step6;
2087
2088 /*
2089 * When new data is acked, open the congestion window.
2090 * If the window gives us less than ssthresh packets
2091 * in flight, open exponentially (maxseg per packet).
2092 * Otherwise open linearly: maxseg per window
2093 * (maxseg^2 / cwnd per packet).
2094 */
2095 {
2096 register u_int cw = tp->snd_cwnd;
2097 register u_int incr = tp->t_maxseg;
2098
2099 if (cw > tp->snd_ssthresh)
2100 incr = incr * incr / cw;
2101 /*
2102 * If t_dupacks != 0 here, it indicates that we are still
2103 * in NewReno fast recovery mode, so we leave the congestion
2104 * window alone.
2105 */
2106 if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
2107 tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
2108 }
2109 if (acked > so->so_snd.sb_cc) {
2110 tp->snd_wnd -= so->so_snd.sb_cc;
2111 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2112 ourfinisacked = 1;
2113 } else {
2114 sbdrop(&so->so_snd, acked);
2115 tp->snd_wnd -= acked;
2116 ourfinisacked = 0;
2117 }
2118 sowwakeup(so);
2119 tp->snd_una = th->th_ack;
2120 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2121 tp->snd_nxt = tp->snd_una;
2122
2123 switch (tp->t_state) {
2124
2125 /*
2126 * In FIN_WAIT_1 STATE in addition to the processing
2127 * for the ESTABLISHED state if our FIN is now acknowledged
2128 * then enter FIN_WAIT_2.
2129 */
2130 case TCPS_FIN_WAIT_1:
2131 if (ourfinisacked) {
2132 /*
2133 * If we can't receive any more
2134 * data, then closing user can proceed.
2135 * Starting the timer is contrary to the
2136 * specification, but if we don't get a FIN
2137 * we'll hang forever.
2138 */
2139 if (so->so_state & SS_CANTRCVMORE) {
2140 soisdisconnected(so);
2141 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
2142 }
2143 add_to_time_wait(tp);
2144 tp->t_state = TCPS_FIN_WAIT_2;
2145 }
2146 break;
2147
2148 /*
2149 * In CLOSING STATE in addition to the processing for
2150 * the ESTABLISHED state if the ACK acknowledges our FIN
2151 * then enter the TIME-WAIT state, otherwise ignore
2152 * the segment.
2153 */
2154 case TCPS_CLOSING:
2155 if (ourfinisacked) {
2156 tp->t_state = TCPS_TIME_WAIT;
2157 tcp_canceltimers(tp);
2158 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2159 if (tp->cc_recv != 0 &&
2160 tp->t_starttime < tcp_msl)
2161 tp->t_timer[TCPT_2MSL] =
2162 tp->t_rxtcur * TCPTV_TWTRUNC;
2163 else
2164 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2165 add_to_time_wait(tp);
2166 soisdisconnected(so);
2167 }
2168 break;
2169
2170 /*
2171 * In LAST_ACK, we may still be waiting for data to drain
2172 * and/or to be acked, as well as for the ack of our FIN.
2173 * If our FIN is now acknowledged, delete the TCB,
2174 * enter the closed state and return.
2175 */
2176 case TCPS_LAST_ACK:
2177 if (ourfinisacked) {
2178 tp = tcp_close(tp);
2179 goto drop;
2180 }
2181 break;
2182
2183 /*
2184 * In TIME_WAIT state the only thing that should arrive
2185 * is a retransmission of the remote FIN. Acknowledge
2186 * it and restart the finack timer.
2187 */
2188 case TCPS_TIME_WAIT:
2189 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2190 add_to_time_wait(tp);
2191 goto dropafterack;
2192 }
2193 }
2194
2195 step6:
2196 /*
2197 * Update window information.
2198 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2199 */
2200 if ((thflags & TH_ACK) &&
2201 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2202 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2203 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2204 /* keep track of pure window updates */
2205 if (tlen == 0 &&
2206 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2207 tcpstat.tcps_rcvwinupd++;
2208 tp->snd_wnd = tiwin;
2209 tp->snd_wl1 = th->th_seq;
2210 tp->snd_wl2 = th->th_ack;
2211 if (tp->snd_wnd > tp->max_sndwnd)
2212 tp->max_sndwnd = tp->snd_wnd;
2213 needoutput = 1;
2214 }
2215
2216 /*
2217 * Process segments with URG.
2218 */
2219 if ((thflags & TH_URG) && th->th_urp &&
2220 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2221 /*
2222 * This is a kludge, but if we receive and accept
2223 * random urgent pointers, we'll crash in
2224 * soreceive. It's hard to imagine someone
2225 * actually wanting to send this much urgent data.
2226 */
2227 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2228 th->th_urp = 0; /* XXX */
2229 thflags &= ~TH_URG; /* XXX */
2230 goto dodata; /* XXX */
2231 }
2232 /*
2233 * If this segment advances the known urgent pointer,
2234 * then mark the data stream. This should not happen
2235 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2236 * a FIN has been received from the remote side.
2237 * In these states we ignore the URG.
2238 *
2239 * According to RFC961 (Assigned Protocols),
2240 * the urgent pointer points to the last octet
2241 * of urgent data. We continue, however,
2242 * to consider it to indicate the first octet
2243 * of data past the urgent section as the original
2244 * spec states (in one of two places).
2245 */
2246 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2247 tp->rcv_up = th->th_seq + th->th_urp;
2248 so->so_oobmark = so->so_rcv.sb_cc +
2249 (tp->rcv_up - tp->rcv_nxt) - 1;
2250 if (so->so_oobmark == 0) {
2251 so->so_state |= SS_RCVATMARK;
2252 postevent(so, 0, EV_OOB);
2253 }
2254 sohasoutofband(so);
2255 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2256 }
2257 /*
2258 * Remove out of band data so doesn't get presented to user.
2259 * This can happen independent of advancing the URG pointer,
2260 * but if two URG's are pending at once, some out-of-band
2261 * data may creep in... ick.
2262 */
2263 if (th->th_urp <= (u_long)tlen
2264 #if SO_OOBINLINE
2265 && (so->so_options & SO_OOBINLINE) == 0
2266 #endif
2267 )
2268 tcp_pulloutofband(so, th, m,
2269 drop_hdrlen); /* hdr drop is delayed */
2270 } else
2271 /*
2272 * If no out of band data is expected,
2273 * pull receive urgent pointer along
2274 * with the receive window.
2275 */
2276 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2277 tp->rcv_up = tp->rcv_nxt;
2278 dodata: /* XXX */
2279
2280 /*
2281 * Process the segment text, merging it into the TCP sequencing queue,
2282 * and arranging for acknowledgment of receipt if necessary.
2283 * This process logically involves adjusting tp->rcv_wnd as data
2284 * is presented to the user (this happens in tcp_usrreq.c,
2285 * case PRU_RCVD). If a FIN has already been received on this
2286 * connection then we just ignore the text.
2287 */
2288 if ((tlen || (thflags&TH_FIN)) &&
2289 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2290 m_adj(m, drop_hdrlen); /* delayed header drop */
2291 /*
2292 * Insert segment which inludes th into reassembly queue of tcp with
2293 * control block tp. Return TH_FIN if reassembly now includes
2294 * a segment with FIN. This handle the common case inline (segment
2295 * is the next to be received on an established connection, and the
2296 * queue is empty), avoiding linkage into and removal from the queue
2297 * and repetition of various conversions.
2298 * Set DELACK for segments received in order, but ack immediately
2299 * when segments are out of order (so fast retransmit can work).
2300 */
2301 if (th->th_seq == tp->rcv_nxt &&
2302 LIST_EMPTY(&tp->t_segq) &&
2303 TCPS_HAVEESTABLISHED(tp->t_state)) {
2304 #ifdef __APPLE__
2305 if (tcp_delack_enabled) {
2306 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
2307 tp->t_flags |= TF_DELACK;
2308 }
2309 #else
2310 if (DELAY_ACK(tp))
2311 callout_reset(tp->tt_delack, tcp_delacktime,
2312 tcp_timer_delack, tp);
2313 #endif
2314 else
2315 tp->t_flags |= TF_ACKNOW;
2316 tp->rcv_nxt += tlen;
2317 thflags = th->th_flags & TH_FIN;
2318 tcpstat.tcps_rcvpack++;
2319 tcpstat.tcps_rcvbyte += tlen;
2320 ND6_HINT(tp);
2321 sbappend(&so->so_rcv, m);
2322 sorwakeup(so);
2323 } else {
2324 thflags = tcp_reass(tp, th, &tlen, m);
2325 tp->t_flags |= TF_ACKNOW;
2326 }
2327
2328 if (tp->t_flags & TF_DELACK)
2329 {
2330 #if INET6
2331 if (isipv6) {
2332 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2333 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2334 th->th_seq, th->th_ack, th->th_win);
2335 }
2336 else
2337 #endif
2338 {
2339 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2340 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2341 th->th_seq, th->th_ack, th->th_win);
2342 }
2343
2344 }
2345 /*
2346 * Note the amount of data that peer has sent into
2347 * our window, in order to estimate the sender's
2348 * buffer size.
2349 */
2350 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2351 } else {
2352 m_freem(m);
2353 thflags &= ~TH_FIN;
2354 }
2355
2356 /*
2357 * If FIN is received ACK the FIN and let the user know
2358 * that the connection is closing.
2359 */
2360 if (thflags & TH_FIN) {
2361 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2362 socantrcvmore(so);
2363 postevent(so, 0, EV_FIN);
2364 /*
2365 * If connection is half-synchronized
2366 * (ie NEEDSYN flag on) then delay ACK,
2367 * so it may be piggybacked when SYN is sent.
2368 * Otherwise, since we received a FIN then no
2369 * more input can be expected, send ACK now.
2370 */
2371 if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN)) {
2372 TCP_DELACK_BITSET(tp->t_inpcb->hash_element);
2373 tp->t_flags |= TF_DELACK;
2374 }
2375 else
2376 tp->t_flags |= TF_ACKNOW;
2377 tp->rcv_nxt++;
2378 }
2379 switch (tp->t_state) {
2380
2381 /*
2382 * In SYN_RECEIVED and ESTABLISHED STATES
2383 * enter the CLOSE_WAIT state.
2384 */
2385 case TCPS_SYN_RECEIVED:
2386 /*FALLTHROUGH*/
2387 case TCPS_ESTABLISHED:
2388 tp->t_state = TCPS_CLOSE_WAIT;
2389 break;
2390
2391 /*
2392 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2393 * enter the CLOSING state.
2394 */
2395 case TCPS_FIN_WAIT_1:
2396 tp->t_state = TCPS_CLOSING;
2397 break;
2398
2399 /*
2400 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2401 * starting the time-wait timer, turning off the other
2402 * standard timers.
2403 */
2404 case TCPS_FIN_WAIT_2:
2405 tp->t_state = TCPS_TIME_WAIT;
2406 tcp_canceltimers(tp);
2407 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2408 if (tp->cc_recv != 0 &&
2409 tp->t_starttime < tcp_msl) {
2410 tp->t_timer[TCPT_2MSL] =
2411 tp->t_rxtcur * TCPTV_TWTRUNC;
2412 /* For transaction client, force ACK now. */
2413 tp->t_flags |= TF_ACKNOW;
2414 }
2415 else
2416 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2417
2418 add_to_time_wait(tp);
2419 soisdisconnected(so);
2420 break;
2421
2422 /*
2423 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2424 */
2425 case TCPS_TIME_WAIT:
2426 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2427 add_to_time_wait(tp);
2428 break;
2429 }
2430 }
2431 #if TCPDEBUG
2432 if (so->so_options & SO_DEBUG)
2433 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2434 &tcp_savetcp, 0);
2435 #endif
2436
2437 /*
2438 * Return any desired output.
2439 */
2440 if (needoutput || (tp->t_flags & TF_ACKNOW))
2441 (void) tcp_output(tp);
2442 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2443 return;
2444
2445 dropafterack:
2446 /*
2447 * Generate an ACK dropping incoming segment if it occupies
2448 * sequence space, where the ACK reflects our state.
2449 *
2450 * We can now skip the test for the RST flag since all
2451 * paths to this code happen after packets containing
2452 * RST have been dropped.
2453 *
2454 * In the SYN-RECEIVED state, don't send an ACK unless the
2455 * segment we received passes the SYN-RECEIVED ACK test.
2456 * If it fails send a RST. This breaks the loop in the
2457 * "LAND" DoS attack, and also prevents an ACK storm
2458 * between two listening ports that have been sent forged
2459 * SYN segments, each with the source address of the other.
2460 */
2461 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2462 (SEQ_GT(tp->snd_una, th->th_ack) ||
2463 SEQ_GT(th->th_ack, tp->snd_max)) ) {
2464 rstreason = BANDLIM_RST_OPENPORT;
2465 goto dropwithreset;
2466 }
2467 #if TCPDEBUG
2468 if (so->so_options & SO_DEBUG)
2469 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2470 &tcp_savetcp, 0);
2471 #endif
2472 m_freem(m);
2473 tp->t_flags |= TF_ACKNOW;
2474 (void) tcp_output(tp);
2475 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2476 return;
2477
2478 dropwithreset:
2479 /*
2480 * Generate a RST, dropping incoming segment.
2481 * Make ACK acceptable to originator of segment.
2482 * Don't bother to respond if destination was broadcast/multicast.
2483 */
2484 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2485 goto drop;
2486 #if INET6
2487 if (isipv6) {
2488 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2489 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2490 goto drop;
2491 } else
2492 #endif /* INET6 */
2493 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2494 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2495 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2496 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2497 goto drop;
2498 /* IPv6 anycast check is done at tcp6_input() */
2499
2500 /*
2501 * Perform bandwidth limiting.
2502 */
2503 #if ICMP_BANDLIM
2504 if (badport_bandlim(rstreason) < 0)
2505 goto drop;
2506 #endif
2507
2508 #if TCPDEBUG
2509 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2510 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2511 &tcp_savetcp, 0);
2512 #endif
2513 if (thflags & TH_ACK)
2514 /* mtod() below is safe as long as hdr dropping is delayed */
2515 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
2516 TH_RST);
2517 else {
2518 if (thflags & TH_SYN)
2519 tlen++;
2520 /* mtod() below is safe as long as hdr dropping is delayed */
2521 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2522 (tcp_seq)0, TH_RST|TH_ACK);
2523 }
2524 /* destroy temporarily created socket */
2525 if (dropsocket)
2526 (void) soabort(so);
2527 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2528 return;
2529
2530 drop:
2531 /*
2532 * Drop space held by incoming segment and return.
2533 */
2534 #if TCPDEBUG
2535 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2536 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2537 &tcp_savetcp, 0);
2538 #endif
2539 m_freem(m);
2540 /* destroy temporarily created socket */
2541 if (dropsocket)
2542 (void) soabort(so);
2543 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2544 return;
2545 }
2546
2547 static void
2548 tcp_dooptions(tp, cp, cnt, th, to)
2549 struct tcpcb *tp;
2550 u_char *cp;
2551 int cnt;
2552 struct tcphdr *th;
2553 struct tcpopt *to;
2554 {
2555 u_short mss = 0;
2556 int opt, optlen;
2557
2558 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2559 opt = cp[0];
2560 if (opt == TCPOPT_EOL)
2561 break;
2562 if (opt == TCPOPT_NOP)
2563 optlen = 1;
2564 else {
2565 if (cnt < 2)
2566 break;
2567 optlen = cp[1];
2568 if (optlen < 2 || optlen > cnt)
2569 break;
2570 }
2571 switch (opt) {
2572
2573 default:
2574 continue;
2575
2576 case TCPOPT_MAXSEG:
2577 if (optlen != TCPOLEN_MAXSEG)
2578 continue;
2579 if (!(th->th_flags & TH_SYN))
2580 continue;
2581 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
2582 NTOHS(mss);
2583 break;
2584
2585 case TCPOPT_WINDOW:
2586 if (optlen != TCPOLEN_WINDOW)
2587 continue;
2588 if (!(th->th_flags & TH_SYN))
2589 continue;
2590 tp->t_flags |= TF_RCVD_SCALE;
2591 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2592 break;
2593
2594 case TCPOPT_TIMESTAMP:
2595 if (optlen != TCPOLEN_TIMESTAMP)
2596 continue;
2597 to->to_flag |= TOF_TS;
2598 bcopy((char *)cp + 2,
2599 (char *)&to->to_tsval, sizeof(to->to_tsval));
2600 NTOHL(to->to_tsval);
2601 bcopy((char *)cp + 6,
2602 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2603 NTOHL(to->to_tsecr);
2604
2605 /*
2606 * A timestamp received in a SYN makes
2607 * it ok to send timestamp requests and replies.
2608 */
2609 if (th->th_flags & TH_SYN) {
2610 tp->t_flags |= TF_RCVD_TSTMP;
2611 tp->ts_recent = to->to_tsval;
2612 tp->ts_recent_age = tcp_now;
2613 }
2614 break;
2615 case TCPOPT_CC:
2616 if (optlen != TCPOLEN_CC)
2617 continue;
2618 to->to_flag |= TOF_CC;
2619 bcopy((char *)cp + 2,
2620 (char *)&to->to_cc, sizeof(to->to_cc));
2621 NTOHL(to->to_cc);
2622 /*
2623 * A CC or CC.new option received in a SYN makes
2624 * it ok to send CC in subsequent segments.
2625 */
2626 if (th->th_flags & TH_SYN)
2627 tp->t_flags |= TF_RCVD_CC;
2628 break;
2629 case TCPOPT_CCNEW:
2630 if (optlen != TCPOLEN_CC)
2631 continue;
2632 if (!(th->th_flags & TH_SYN))
2633 continue;
2634 to->to_flag |= TOF_CCNEW;
2635 bcopy((char *)cp + 2,
2636 (char *)&to->to_cc, sizeof(to->to_cc));
2637 NTOHL(to->to_cc);
2638 /*
2639 * A CC or CC.new option received in a SYN makes
2640 * it ok to send CC in subsequent segments.
2641 */
2642 tp->t_flags |= TF_RCVD_CC;
2643 break;
2644 case TCPOPT_CCECHO:
2645 if (optlen != TCPOLEN_CC)
2646 continue;
2647 if (!(th->th_flags & TH_SYN))
2648 continue;
2649 to->to_flag |= TOF_CCECHO;
2650 bcopy((char *)cp + 2,
2651 (char *)&to->to_ccecho, sizeof(to->to_ccecho));
2652 NTOHL(to->to_ccecho);
2653 break;
2654 }
2655 }
2656 if (th->th_flags & TH_SYN)
2657 tcp_mss(tp, mss); /* sets t_maxseg */
2658 }
2659
2660 /*
2661 * Pull out of band byte out of a segment so
2662 * it doesn't appear in the user's data queue.
2663 * It is still reflected in the segment length for
2664 * sequencing purposes.
2665 */
2666 static void
2667 tcp_pulloutofband(so, th, m, off)
2668 struct socket *so;
2669 struct tcphdr *th;
2670 register struct mbuf *m;
2671 int off; /* delayed to be droped hdrlen */
2672 {
2673 int cnt = off + th->th_urp - 1;
2674
2675 while (cnt >= 0) {
2676 if (m->m_len > cnt) {
2677 char *cp = mtod(m, caddr_t) + cnt;
2678 struct tcpcb *tp = sototcpcb(so);
2679
2680 tp->t_iobc = *cp;
2681 tp->t_oobflags |= TCPOOB_HAVEDATA;
2682 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2683 m->m_len--;
2684 if (m->m_flags & M_PKTHDR)
2685 m->m_pkthdr.len--;
2686 return;
2687 }
2688 cnt -= m->m_len;
2689 m = m->m_next;
2690 if (m == 0)
2691 break;
2692 }
2693 panic("tcp_pulloutofband");
2694 }
2695
2696 /*
2697 * Collect new round-trip time estimate
2698 * and update averages and current timeout.
2699 */
2700 static void
2701 tcp_xmit_timer(tp, rtt)
2702 register struct tcpcb *tp;
2703 int rtt;
2704 {
2705 register int delta;
2706
2707 tcpstat.tcps_rttupdated++;
2708 tp->t_rttupdated++;
2709 if (tp->t_srtt != 0) {
2710 /*
2711 * srtt is stored as fixed point with 5 bits after the
2712 * binary point (i.e., scaled by 8). The following magic
2713 * is equivalent to the smoothing algorithm in rfc793 with
2714 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2715 * point). Adjust rtt to origin 0.
2716 */
2717 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2718 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2719
2720 if ((tp->t_srtt += delta) <= 0)
2721 tp->t_srtt = 1;
2722
2723 /*
2724 * We accumulate a smoothed rtt variance (actually, a
2725 * smoothed mean difference), then set the retransmit
2726 * timer to smoothed rtt + 4 times the smoothed variance.
2727 * rttvar is stored as fixed point with 4 bits after the
2728 * binary point (scaled by 16). The following is
2729 * equivalent to rfc793 smoothing with an alpha of .75
2730 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2731 * rfc793's wired-in beta.
2732 */
2733 if (delta < 0)
2734 delta = -delta;
2735 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2736 if ((tp->t_rttvar += delta) <= 0)
2737 tp->t_rttvar = 1;
2738 } else {
2739 /*
2740 * No rtt measurement yet - use the unsmoothed rtt.
2741 * Set the variance to half the rtt (so our first
2742 * retransmit happens at 3*rtt).
2743 */
2744 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2745 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2746 }
2747 tp->t_rtttime = 0;
2748 tp->t_rxtshift = 0;
2749
2750 /*
2751 * the retransmit should happen at rtt + 4 * rttvar.
2752 * Because of the way we do the smoothing, srtt and rttvar
2753 * will each average +1/2 tick of bias. When we compute
2754 * the retransmit timer, we want 1/2 tick of rounding and
2755 * 1 extra tick because of +-1/2 tick uncertainty in the
2756 * firing of the timer. The bias will give us exactly the
2757 * 1.5 tick we need. But, because the bias is
2758 * statistical, we have to test that we don't drop below
2759 * the minimum feasible timer (which is 2 ticks).
2760 */
2761 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2762 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2763
2764 /*
2765 * We received an ack for a packet that wasn't retransmitted;
2766 * it is probably safe to discard any error indications we've
2767 * received recently. This isn't quite right, but close enough
2768 * for now (a route might have failed after we sent a segment,
2769 * and the return path might not be symmetrical).
2770 */
2771 tp->t_softerror = 0;
2772 }
2773
2774 /*
2775 * Determine a reasonable value for maxseg size.
2776 * If the route is known, check route for mtu.
2777 * If none, use an mss that can be handled on the outgoing
2778 * interface without forcing IP to fragment; if bigger than
2779 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2780 * to utilize large mbufs. If no route is found, route has no mtu,
2781 * or the destination isn't local, use a default, hopefully conservative
2782 * size (usually 512 or the default IP max size, but no more than the mtu
2783 * of the interface), as we can't discover anything about intervening
2784 * gateways or networks. We also initialize the congestion/slow start
2785 * window to be a single segment if the destination isn't local.
2786 * While looking at the routing entry, we also initialize other path-dependent
2787 * parameters from pre-set or cached values in the routing entry.
2788 *
2789 * Also take into account the space needed for options that we
2790 * send regularly. Make maxseg shorter by that amount to assure
2791 * that we can send maxseg amount of data even when the options
2792 * are present. Store the upper limit of the length of options plus
2793 * data in maxopd.
2794 *
2795 * NOTE that this routine is only called when we process an incoming
2796 * segment, for outgoing segments only tcp_mssopt is called.
2797 *
2798 * In case of T/TCP, we call this routine during implicit connection
2799 * setup as well (offer = -1), to initialize maxseg from the cached
2800 * MSS of our peer.
2801 */
2802 void
2803 tcp_mss(tp, offer)
2804 struct tcpcb *tp;
2805 int offer;
2806 {
2807 register struct rtentry *rt;
2808 struct ifnet *ifp;
2809 register int rtt, mss;
2810 u_long bufsize;
2811 struct inpcb *inp;
2812 struct socket *so;
2813 struct rmxp_tao *taop;
2814 int origoffer = offer;
2815 #if INET6
2816 int isipv6;
2817 int min_protoh;
2818 #endif
2819
2820 inp = tp->t_inpcb;
2821 #if INET6
2822 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2823 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
2824 : sizeof (struct tcpiphdr);
2825 #else
2826 #define min_protoh (sizeof (struct tcpiphdr))
2827 #endif
2828 #if INET6
2829 if (isipv6)
2830 rt = tcp_rtlookup6(inp);
2831 else
2832 #endif /* INET6 */
2833 rt = tcp_rtlookup(inp);
2834 if (rt == NULL) {
2835 tp->t_maxopd = tp->t_maxseg =
2836 #if INET6
2837 isipv6 ? tcp_v6mssdflt :
2838 #endif /* INET6 */
2839 tcp_mssdflt;
2840 return;
2841 }
2842 ifp = rt->rt_ifp;
2843 so = inp->inp_socket;
2844
2845 taop = rmx_taop(rt->rt_rmx);
2846 /*
2847 * Offer == -1 means that we didn't receive SYN yet,
2848 * use cached value in that case;
2849 */
2850 if (offer == -1)
2851 offer = taop->tao_mssopt;
2852 /*
2853 * Offer == 0 means that there was no MSS on the SYN segment,
2854 * in this case we use tcp_mssdflt.
2855 */
2856 if (offer == 0)
2857 offer =
2858 #if INET6
2859 isipv6 ? tcp_v6mssdflt :
2860 #endif /* INET6 */
2861 tcp_mssdflt;
2862 else
2863 /*
2864 * Sanity check: make sure that maxopd will be large
2865 * enough to allow some data on segments even is the
2866 * all the option space is used (40bytes). Otherwise
2867 * funny things may happen in tcp_output.
2868 */
2869 offer = max(offer, 64);
2870 taop->tao_mssopt = offer;
2871
2872 /*
2873 * While we're here, check if there's an initial rtt
2874 * or rttvar. Convert from the route-table units
2875 * to scaled multiples of the slow timeout timer.
2876 */
2877 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2878 /*
2879 * XXX the lock bit for RTT indicates that the value
2880 * is also a minimum value; this is subject to time.
2881 */
2882 if (rt->rt_rmx.rmx_locks & RTV_RTT)
2883 tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ);
2884 tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
2885 tcpstat.tcps_usedrtt++;
2886 if (rt->rt_rmx.rmx_rttvar) {
2887 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2888 (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
2889 tcpstat.tcps_usedrttvar++;
2890 } else {
2891 /* default variation is +- 1 rtt */
2892 tp->t_rttvar =
2893 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2894 }
2895 TCPT_RANGESET(tp->t_rxtcur,
2896 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2897 tp->t_rttmin, TCPTV_REXMTMAX);
2898 }
2899 /*
2900 * if there's an mtu associated with the route, use it
2901 * else, use the link mtu.
2902 */
2903 if (rt->rt_rmx.rmx_mtu)
2904 mss = rt->rt_rmx.rmx_mtu - min_protoh;
2905 else
2906 {
2907 mss =
2908 #if INET6
2909 (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
2910 #endif
2911 ifp->if_mtu
2912 #if INET6
2913 )
2914 #endif
2915 - min_protoh;
2916 #if INET6
2917 if (isipv6) {
2918 if (!in6_localaddr(&inp->in6p_faddr))
2919 mss = min(mss, tcp_v6mssdflt);
2920 } else
2921 #endif /* INET6 */
2922 if (!in_localaddr(inp->inp_faddr))
2923 mss = min(mss, tcp_mssdflt);
2924 }
2925 mss = min(mss, offer);
2926 /*
2927 * maxopd stores the maximum length of data AND options
2928 * in a segment; maxseg is the amount of data in a normal
2929 * segment. We need to store this value (maxopd) apart
2930 * from maxseg, because now every segment carries options
2931 * and thus we normally have somewhat less data in segments.
2932 */
2933 tp->t_maxopd = mss;
2934
2935 /*
2936 * In case of T/TCP, origoffer==-1 indicates, that no segments
2937 * were received yet. In this case we just guess, otherwise
2938 * we do the same as before T/TCP.
2939 */
2940 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2941 (origoffer == -1 ||
2942 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2943 mss -= TCPOLEN_TSTAMP_APPA;
2944 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2945 (origoffer == -1 ||
2946 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2947 mss -= TCPOLEN_CC_APPA;
2948
2949 #if (MCLBYTES & (MCLBYTES - 1)) == 0
2950 if (mss > MCLBYTES)
2951 mss &= ~(MCLBYTES-1);
2952 #else
2953 if (mss > MCLBYTES)
2954 mss = mss / MCLBYTES * MCLBYTES;
2955 #endif
2956 /*
2957 * If there's a pipesize, change the socket buffer
2958 * to that size. Make the socket buffers an integral
2959 * number of mss units; if the mss is larger than
2960 * the socket buffer, decrease the mss.
2961 */
2962 #if RTV_SPIPE
2963 if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2964 #endif
2965 bufsize = so->so_snd.sb_hiwat;
2966 if (bufsize < mss)
2967 mss = bufsize;
2968 else {
2969 bufsize = roundup(bufsize, mss);
2970 if (bufsize > sb_max)
2971 bufsize = sb_max;
2972 (void)sbreserve(&so->so_snd, bufsize);
2973 }
2974 tp->t_maxseg = mss;
2975
2976 #if RTV_RPIPE
2977 if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
2978 #endif
2979 bufsize = so->so_rcv.sb_hiwat;
2980 if (bufsize > mss) {
2981 bufsize = roundup(bufsize, mss);
2982 if (bufsize > sb_max)
2983 bufsize = sb_max;
2984 (void)sbreserve(&so->so_rcv, bufsize);
2985 }
2986
2987 /*
2988 * Set the slow-start flight size depending on whether this
2989 * is a local network or not.
2990 */
2991 if (
2992 #if INET6
2993 (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
2994 (!isipv6 &&
2995 #endif
2996 in_localaddr(inp->inp_faddr)
2997 #if INET6
2998 )
2999 #endif
3000 )
3001 tp->snd_cwnd = mss * ss_fltsz_local;
3002 else
3003 tp->snd_cwnd = mss * ss_fltsz;
3004
3005 if (rt->rt_rmx.rmx_ssthresh) {
3006 /*
3007 * There's some sort of gateway or interface
3008 * buffer limit on the path. Use this to set
3009 * the slow start threshhold, but set the
3010 * threshold to no less than 2*mss.
3011 */
3012 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
3013 tcpstat.tcps_usedssthresh++;
3014 }
3015 }
3016
3017 /*
3018 * Determine the MSS option to send on an outgoing SYN.
3019 */
3020 int
3021 tcp_mssopt(tp)
3022 struct tcpcb *tp;
3023 {
3024 struct rtentry *rt;
3025 #if INET6
3026 int isipv6;
3027 int min_protoh;
3028 #endif
3029
3030 #if INET6
3031 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3032 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
3033 : sizeof (struct tcpiphdr);
3034 #else
3035 #define min_protoh (sizeof (struct tcpiphdr))
3036 #endif
3037 #if INET6
3038 if (isipv6)
3039 rt = tcp_rtlookup6(tp->t_inpcb);
3040 else
3041 #endif /* INET6 */
3042 rt = tcp_rtlookup(tp->t_inpcb);
3043 if (rt == NULL)
3044 return
3045 #if INET6
3046 isipv6 ? tcp_v6mssdflt :
3047 #endif /* INET6 */
3048 tcp_mssdflt;
3049
3050 return rt->rt_ifp->if_mtu - min_protoh;
3051 }
3052
3053
3054 /*
3055 * Checks for partial ack. If partial ack arrives, force the retransmission
3056 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3057 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to
3058 * be started again. If the ack advances at least to tp->snd_recover, return 0.
3059 */
3060 static int
3061 tcp_newreno(tp, th)
3062 struct tcpcb *tp;
3063 struct tcphdr *th;
3064 {
3065 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
3066 tcp_seq onxt = tp->snd_nxt;
3067 u_long ocwnd = tp->snd_cwnd;
3068 #ifdef __APPLE__
3069 tp->t_timer[TCPT_REXMT] = 0;
3070 #else
3071 callout_stop(tp->tt_rexmt);
3072 #endif
3073 tp->t_rtttime = 0;
3074 tp->snd_nxt = th->th_ack;
3075 /*
3076 * Set snd_cwnd to one segment beyond acknowledged offset
3077 * (tp->snd_una has not yet been updated when this function
3078 * is called)
3079 */
3080 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3081 (void) tcp_output(tp);
3082 tp->snd_cwnd = ocwnd;
3083 if (SEQ_GT(onxt, tp->snd_nxt))
3084 tp->snd_nxt = onxt;
3085 /*
3086 * Partial window deflation. Relies on fact that tp->snd_una
3087 * not updated yet.
3088 */
3089 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
3090 return (1);
3091 }
3092 return (0);
3093 }