/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>

#include <kern/cpu_number.h>	/* before tcp_seq.h, for tcp_random18() */

#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* for ICMP_BANDLIM */
#include <netinet/in_var.h>
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#if INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#if INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
u_char tcp_saveipgen[40]; /* sized for the largest IP header, currently IPv6 */
struct tcphdr tcp_savetcp;
#endif /* TCPDEBUG */

#if IPSEC
#include <netinet6/ipsec.h>
#if INET6
#include <netinet6/ipsec6.h>
#endif
#include <netkey/key.h>
#endif /*IPSEC*/

#include <sys/kdebug.h>

#ifndef __APPLE__
MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
#endif

#define DBG_LAYER_BEG		NETDBG_CODE(DBG_NETTCP, 0)
#define DBG_LAYER_END		NETDBG_CODE(DBG_NETTCP, 2)
#define DBG_FNC_TCP_INPUT	NETDBG_CODE(DBG_NETTCP, (3 << 8))
#define DBG_FNC_TCP_NEWCONN	NETDBG_CODE(DBG_NETTCP, (7 << 8))
static int	tcprexmtthresh = 3;
tcp_cc	tcp_ccgen;
extern int apple_hwcksum_rx;

#if IPSEC
extern int ipsec_bypass;
extern lck_mtx_t *sadb_mutex;
#endif

struct	tcpstat tcpstat;

static int log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &log_in_vain, 0, "Log all incoming TCP connections");

static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
    &blackhole, 0, "Do not send RST when dropping refused connections");

int tcp_delack_enabled = 3;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
    &tcp_delack_enabled, 0,
    "Delay ACK to try and piggyback it onto a data packet");

int tcp_lq_overflow = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW,
    &tcp_lq_overflow, 0,
    "Listen Queue Overflow");

#if TCP_DROP_SYNFIN
static int drop_synfin = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
    &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
#endif

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
    "TCP Segment Reassembly Queue");

__private_extern__ int tcp_reass_maxseg = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW,
    &tcp_reass_maxseg, 0,
    "Global maximum number of TCP Segments in Reassembly Queue");

__private_extern__ int tcp_reass_qsize = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
    &tcp_reass_qsize, 0,
    "Global number of TCP Segments currently in Reassembly Queue");

static int tcp_reass_overflows = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
    &tcp_reass_overflows, 0,
    "Global number of TCP Segment Reassembly Queue Overflows");

__private_extern__ int slowlink_wsize = 8192;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW,
    &slowlink_wsize, 0, "Maximum advertised window size for slowlink");


u_long	tcp_now;
struct inpcbhead tcb;
#define	tcb6	tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;

static void	tcp_dooptions(struct tcpcb *,
	    u_char *, int, struct tcphdr *, struct tcpopt *);
static void	tcp_pulloutofband(struct socket *,
	    struct tcphdr *, struct mbuf *, int);
static int	tcp_reass(struct tcpcb *, struct tcphdr *, int *,
	    struct mbuf *);
static void	tcp_xmit_timer(struct tcpcb *, int);

/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#if INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_inpcb && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
	    (tp)->t_inpcb->in6p_route.ro_rt) \
		nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif

extern u_long *delack_bitmask;

extern void ipfwsyslog(int level, char *format, ...);
extern int ChkAddressOK(__uint32_t dstaddr, __uint32_t srcaddr);
extern int fw_verbose;

#define log_in_vain_log(a) {					\
	if ((log_in_vain == 3) && (fw_verbose == 2)) {		\
		/* Apple logging, log to ipfw.log */		\
		ipfwsyslog a;					\
	} else							\
		log a;						\
}

/*
 * Indicate whether this ack should be delayed.
 * We can delay the ack if:
 *  - delayed acks are enabled (set to 1) and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *  - delayed acks are enabled (set to 2, "more compatible") and
 *	- our last ack wasn't a 0-sized window.
 *	- the peer hasn't sent us a TH_PUSH data packet (this solves 3649245).
 *	  If it has, take that as a clue that we need to ACK without delay:
 *	  this helps higher-level protocols that won't send us more data,
 *	  even though the window is open, until their last "segment" has
 *	  been ACKed.
 *  - delayed acks are enabled (set to 3, "streaming detection") and
 *	- if we receive more than 4 full packets per second on this socket,
 *	  we're streaming and this acts as "1".
 *	- if we don't meet that criteria, this acts like "2", allowing
 *	  faster acking while browsing, for example.
 */
#define DELAY_ACK(tp) \
	(((tcp_delack_enabled == 1) && ((tp->t_flags & TF_RXWIN0SENT) == 0)) || \
	 (((tcp_delack_enabled == 2) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \
	  ((thflags & TH_PUSH) == 0) && ((tp->t_flags & TF_DELACK) == 0)) || \
	 (((tcp_delack_enabled == 3) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \
	  (((tp->t_rcvtime == 0) && (tp->rcv_byps > (4 * tp->t_maxseg))) || \
	   (((thflags & TH_PUSH) == 0) && ((tp->t_flags & TF_DELACK) == 0)))))
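
/*
 * Worked example (illustrative, derived from the macro above): for a data
 * segment arriving with TH_PUSH set, and assuming the last ACK did not
 * advertise a zero window:
 *	tcp_delack_enabled == 1: delay the ACK (the push bit is ignored);
 *	tcp_delack_enabled == 2: ACK immediately (push bit present);
 *	tcp_delack_enabled == 3: delay only if rcv_byps shows we are
 *	    streaming (more than 4 * t_maxseg bytes received within the
 *	    current one-second window); otherwise behave like mode 2.
 */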


static int	tcpdropdropablreq(struct socket *head);
static void	tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);


static int
tcp_reass(tp, th, tlenp, m)
	register struct tcpcb *tp;
	register struct tcphdr *th;
	int *tlenp;
	struct mbuf *m;
{
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;
	int dowakeup = 0;

	/*
	 * Call with th == NULL after becoming established to
	 * force pre-ESTABLISHED data up to the user socket.
	 */
	if (th == NULL)
		goto present;

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let through the missing segment that caused this
	 * queue to form.  Always keep one global queue entry spare to be
	 * able to process that missing segment.
	 */
	if (th->th_seq != tp->rcv_nxt &&
	    tcp_reass_qsize + 1 >= tcp_reass_maxseg) {
		tcp_reass_overflows++;
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	/* Allocate a new queue entry.  If we can't, just drop the pkt.  XXX */
	MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
	       M_NOWAIT);
	if (te == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}
	tcp_reass_qsize++;

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		register int i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlenp;
				m_freem(m);
				FREE(te, M_TSEGQ);
				tcp_reass_qsize--;
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
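	/*
	 * Worked example (illustrative): if the preceding segment covers
	 * [1000, 1500) (tqe_th->th_seq == 1000, tqe_len == 500) and the new
	 * segment starts at th_seq == 1300 with *tlenp == 400, then
	 * i == 1000 + 500 - 1300 == 200: the first 200 bytes are duplicates,
	 * so m_adj() trims them and the segment becomes [1500, 1700).
	 * Had *tlenp been <= 200, the whole segment would be a duplicate
	 * and would have been dropped above.
	 */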
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlenp;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
		if (i <= 0)
			break;
		if (i < q->tqe_len) {
			q->tqe_th->th_seq += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, i);
			break;
		}

		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
		tcp_reass_qsize--;
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = th;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	q = LIST_FIRST(&tp->t_segq);
	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
		return (0);
	do {
		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_th->th_flags & TH_FIN;
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tqe_m);
		else {
			if (sbappend(&so->so_rcv, q->tqe_m))
				dowakeup = 1;
		}
		FREE(q, M_TSEGQ);
		tcp_reass_qsize--;
		q = nq;
	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	ND6_HINT(tp);

#if INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
		    (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
		     (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
		    0, 0, 0);
	}
	else
#endif
	{
		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
		    (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
		     (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
		    0, 0, 0);
	}
	if (dowakeup)
		sorwakeup(so);	/* done with socket lock held */
	return (flags);
}


/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
#if INET6
int
tcp6_input(mp, offp)
	struct mbuf **mp;
	int *offp;
{
	register struct mbuf *m = *mp;
	struct in6_ifaddr *ia6;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * (is there a better place to put this?)
	 */
	ia6 = ip6_getdstifaddr(m);
	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
		struct ip6_hdr *ip6;

		ip6 = mtod(m, struct ip6_hdr *);
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
		    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp);
	return IPPROTO_DONE;
}
#endif

void
tcp_input(m, off0)
	struct mbuf *m;
	int off0;
{
	register struct tcphdr *th;
	register struct ip *ip = NULL;
	register struct ipovly *ipov;
	register struct inpcb *inp;
	u_char *optp = NULL;
	int optlen = 0;
	int len, tlen, off;
	int drop_hdrlen;
	register struct tcpcb *tp = 0;
	register int thflags;
	struct socket *so = 0;
	int todrop, acked, ourfinisacked, needoutput = 0;
	struct in_addr laddr;
#if INET6
	struct in6_addr laddr6;
#endif
	int dropsocket = 0;
	int iss = 0;
	int nosock = 0;
	u_long tiwin;
	struct tcpopt to;		/* options in this segment */
	struct rmxp_tao *taop;		/* pointer to our TAO cache entry */
	struct rmxp_tao	tao_noncached;	/* in case there's no cached entry */
	struct sockaddr_in *next_hop = NULL;
#if TCPDEBUG
	short ostate = 0;
#endif
	struct m_tag *fwd_tag;
#if INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
#endif /* INET6 */
	int rstreason;			/* For badport_bandlim accounting purposes */
	struct proc *proc0 = current_proc();

	/* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
	fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
	    KERNEL_TAG_TYPE_IPFORWARD, NULL);
	if (fwd_tag != NULL) {
		struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag + 1);

		next_hop = ipfwd_tag->next_hop;
		m_tag_delete(m, fwd_tag);
	}

	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

#if INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
	bzero((char *)&to, sizeof(to));

	tcpstat.tcps_rcvtotal++;

#if INET6
	if (isipv6) {
		/* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
		ip6 = mtod(m, struct ip6_hdr *);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto dropnosock;
		}
		th = (struct tcphdr *)((caddr_t)ip6 + off0);

		KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
		    (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
		    th->th_seq, th->th_ack, th->th_win);
		/*
		 * Be proactive about an unspecified IPv6 source address.
		 * Since we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 source address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto dropnosock;
		}
	} else
#endif /* INET6 */
	{
		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
		if (off0 > sizeof (struct ip)) {
			ip_stripoptions(m, (struct mbuf *)0);
			off0 = sizeof(struct ip);
			if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)
				m->m_pkthdr.csum_flags = 0; /* invalidate hw checksumming */
		}
		if (m->m_len < sizeof (struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
				tcpstat.tcps_rcvshort++;
				return;
			}
		}
		ip = mtod(m, struct ip *);
		ipov = (struct ipovly *)ip;
		th = (struct tcphdr *)((caddr_t)ip + off0);
		tlen = ip->ip_len;

		KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
		    (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
		    th->th_seq, th->th_ack, th->th_win);

		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
			if (apple_hwcksum_rx && (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)) {
				u_short pseudo;
				char b[9];
				*(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
				*(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
				*(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];

				bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
				ipov->ih_len = (u_short)tlen;
				HTONS(ipov->ih_len);
				pseudo = in_cksum(m, sizeof (struct ip));

				*(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
				*(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
				*(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];

				th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
			} else {
				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
					th->th_sum = m->m_pkthdr.csum_data;
				else
					th->th_sum = in_pseudo(ip->ip_src.s_addr,
					    ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
					    ip->ip_len + IPPROTO_TCP));
			}
			th->th_sum ^= 0xffff;
		} else {
			char b[9];
			/*
			 * Checksum extended TCP header and data.
			 */
			*(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
			*(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
			*(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];

			len = sizeof (struct ip) + tlen;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = (u_short)tlen;
			HTONS(ipov->ih_len);
			th->th_sum = in_cksum(m, len);

			*(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
			*(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
			*(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];
		}
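		/*
		 * Either way th_sum now holds the folded checksum of the
		 * pseudo-header { src, dst, zero, IPPROTO_TCP, length } plus
		 * the TCP header and data.  Roughly: the hardware-assist path
		 * takes the NIC's 16-bit sum of the TCP bytes from csum_data,
		 * builds the pseudo-header sum over the ih_x1/ih_len overlay
		 * with in_cksum()/in_pseudo(), combines the halves with
		 * in_addword() and complements the result; the software path
		 * lets in_cksum() cover the whole span directly.  A valid
		 * segment therefore leaves zero for the single test below.
		 */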
		if (th->th_sum) {
			tcpstat.tcps_rcvbadsum++;
			goto dropnosock;
		}
#if INET6
		/* Re-initialization for later version check */
		ip->ip_v = IPVERSION;
#endif
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto dropnosock;
	}
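	/*
	 * Example: th_off counts 32-bit words, so a bare TCP header has
	 * th_off == 5 (off == 20 bytes), and a SYN carrying 12 bytes of
	 * options has th_off == 8 (off == 32 bytes, optlen == 12).
	 */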
	tlen -= off;	/* tlen is used instead of ti->ti_len */
	if (off > sizeof (struct tcphdr)) {
#if INET6
		if (isipv6) {
			IP6_EXTHDR_CHECK(m, off0, off, return);
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)((caddr_t)ip6 + off0);
		} else
#endif /* INET6 */
		{
			if (m->m_len < sizeof(struct ip) + off) {
				if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
					tcpstat.tcps_rcvshort++;
					return;
				}
				ip = mtod(m, struct ip *);
				ipov = (struct ipovly *)ip;
				th = (struct tcphdr *)((caddr_t)ip + off0);
			}
		}
		optlen = off - sizeof (struct tcphdr);
		optp = (u_char *)(th + 1);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		     (optlen > TCPOLEN_TSTAMP_APPA &&
		      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (th->th_flags & TH_SYN) == 0) {
			to.to_flags |= TOF_TS;
			to.to_tsval = ntohl(*(u_int32_t *)(optp + 4));
			to.to_tsecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	thflags = th->th_flags;

#if TCP_DROP_SYNFIN
	/*
	 * If the drop_synfin option is enabled, drop all packets with
	 * both the SYN and FIN bits set.  This prevents e.g. nmap from
	 * identifying the TCP/IP stack.
	 *
	 * This is a violation of the TCP specification.
	 */
	if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
		goto dropnosock;
#endif

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(th->th_seq);
	NTOHL(th->th_ack);
	NTOHS(th->th_win);
	NTOHS(th->th_urp);

	/*
	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
	 * until after ip6_savecontrol() is called and before other functions
	 * which don't want those proto headers.
	 * Because ip6_savecontrol() is going to parse the mbuf to
	 * search for data to be passed up to user-land, it wants mbuf
	 * parameters to be unchanged.
	 */
	drop_hdrlen = off0 + off;

	/*
	 * Locate pcb for segment.
	 */
findpcb:
#if IPFIREWALL_FORWARD
	if (next_hop != NULL
#if INET6
	    && isipv6 == 0 /* no IPv6 support yet */
#endif /* INET6 */
	    ) {
		/*
		 * Diverted.  Pretend to be the destination.
		 * already got one like this?
		 */
		inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
		if (!inp) {
			/*
			 * No, then it's new.  Try to find the ambushing socket.
			 */
			if (!next_hop->sin_port) {
				inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
				    th->th_sport, next_hop->sin_addr,
				    th->th_dport, 1, m->m_pkthdr.rcvif);
			} else {
				inp = in_pcblookup_hash(&tcbinfo,
				    ip->ip_src, th->th_sport,
				    next_hop->sin_addr,
				    ntohs(next_hop->sin_port), 1,
				    m->m_pkthdr.rcvif);
			}
		}
	} else
#endif /* IPFIREWALL_FORWARD */
	{
#if INET6
		if (isipv6)
			inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
			    &ip6->ip6_dst, th->th_dport, 1,
			    m->m_pkthdr.rcvif);
		else
#endif /* INET6 */
			inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
			    ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
	}

#if IPSEC
	if (ipsec_bypass == 0) {
		lck_mtx_lock(sadb_mutex);
#if INET6
		if (isipv6) {
			if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
				ipsec6stat.in_polvio++;
				lck_mtx_unlock(sadb_mutex);
				goto dropnosock;
			}
		} else
#endif /* INET6 */
		if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
			ipsecstat.in_polvio++;
			lck_mtx_unlock(sadb_mutex);
			goto dropnosock;
		}
		lck_mtx_unlock(sadb_mutex);
	}
#endif /*IPSEC*/

	/*
	 * If the state is CLOSED (i.e., TCB does not exist) then
	 * all data in the incoming segment is discarded.
	 * If the TCB exists but is in CLOSED state, it is embryonic,
	 * but should either do a listen or a connect soon.
	 */
	if (inp == NULL) {
		if (log_in_vain) {
#if INET6
			char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
#else /* INET6 */
			char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
#endif /* INET6 */

#if INET6
			if (isipv6) {
				inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
				inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
			} else
#endif
			{
				inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
				inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
			}
			switch (log_in_vain) {
			case 1:
				if (thflags & TH_SYN)
					log(LOG_INFO,
					    "Connection attempt to TCP %s:%d from %s:%d\n",
					    dbuf, ntohs(th->th_dport),
					    sbuf,
					    ntohs(th->th_sport));
				break;
			case 2:
				log(LOG_INFO,
				    "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
				    dbuf, ntohs(th->th_dport), sbuf,
				    ntohs(th->th_sport), thflags);
				break;
			case 3:
				if ((thflags & TH_SYN) &&
				    !(m->m_flags & (M_BCAST | M_MCAST)) &&
#if INET6
				    ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
				     (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
#else
				    ip->ip_dst.s_addr != ip->ip_src.s_addr
#endif
				    )
					log_in_vain_log((LOG_INFO,
					    "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
					    dbuf, ntohs(th->th_dport),
					    sbuf,
					    ntohs(th->th_sport)));
				break;
			default:
				break;
			}
		}
		if (blackhole) {
			if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
				switch (blackhole) {
				case 1:
					if (thflags & TH_SYN)
						goto dropnosock;
					break;
				case 2:
					goto dropnosock;
				default:
					goto dropnosock;
				}
		}
		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithresetnosock;
	}
	so = inp->inp_socket;
	if (so == NULL) {
		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
			inp = NULL;	// pretend we didn't find it
#if TEMPDEBUG
		printf("tcp_input: no more socket for inp=%x\n", inp);
#endif
		goto dropnosock;
	}

#ifdef __APPLE__
	/*
	 * Bogus state when the listening port is owned by SharedIP with
	 * loopback as the only configured interface: BlueBox does not
	 * filter loopback.
	 */
	if (so == &tcbinfo.nat_dummy_socket)
		goto drop;

#endif
	tcp_lock(so, 1, 2);
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		tcp_unlock(so, 1, 2);
		inp = NULL;	// pretend we didn't find it
		goto dropnosock;
	}

	tp = intotcpcb(inp);
	if (tp == 0) {
		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((thflags & TH_SYN) == 0)
		tiwin = th->th_win << tp->snd_scale;
	else
		tiwin = th->th_win;
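	/*
	 * Example: with snd_scale == 3, an advertised th_win of 8192
	 * unscales to a 65536-byte send window.  SYN segments are never
	 * scaled (RFC 1323), hence the TH_SYN check above.
	 */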

	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
#if TCPDEBUG
		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
#if INET6
			if (isipv6)
				bcopy((char *)ip6, (char *)tcp_saveipgen,
				    sizeof(*ip6));
			else
#endif /* INET6 */
				bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
			tcp_savetcp = *th;
		}
#endif
		if (so->so_options & SO_ACCEPTCONN) {
			register struct tcpcb *tp0 = tp;
			struct socket *so2;
			struct socket *oso;
			struct sockaddr_storage from;
#if INET6
			struct inpcb *oinp = sotoinpcb(so);
#endif /* INET6 */
			int ogencnt = so->so_gencnt;

#if !IPSEC
			/*
			 * The current IPsec implementation creates an
			 * incorrect IPsec cache if this check is done here.
			 * So delay this until the duplicated socket is created.
			 */
			if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				/*
				 * Note: dropwithreset makes sure we don't
				 * send a RST in response to a RST.
				 */
				if (thflags & TH_ACK) {
					tcpstat.tcps_badsyn++;
					rstreason = BANDLIM_RST_OPENPORT;
					goto dropwithreset;
				}
				goto drop;
			}
#endif
			KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0);

#if INET6
			/*
			 * If deprecated addresses are forbidden,
			 * we do not accept a SYN to a deprecated interface
			 * address, to prevent any new inbound connection from
			 * getting established.
			 * When we do not accept the SYN, we send a TCP RST
			 * with the deprecated source address (instead of
			 * dropping it).  We compromise here because it is much
			 * better for the peer to receive a RST, and the RST
			 * will be the final packet for the exchange.
			 *
			 * If we do not forbid deprecated addresses, we accept
			 * the SYN packet.  RFC2462 does not suggest dropping a
			 * SYN in this case.
			 * Reading RFC2462 5.5.4 closely, it says:
			 * 1. use of a deprecated addr with existing
			 *    communication is okay - "SHOULD continue to be
			 *    used"
			 * 2. use of it with new communication:
			 *   (2a) "SHOULD NOT be used if alternate address
			 *        with sufficient scope is available"
			 *   (2b) nothing mentioned otherwise.
			 * Here we fall into the (2b) case as we have no choice
			 * in our source address selection - we must obey the
			 * peer.
			 *
			 * The wording in RFC2462 is confusing, and there are
			 * multiple descriptions of deprecated address
			 * handling - worse, they are not exactly the same.
			 * I believe 5.5.4 is the best one, so we follow 5.5.4.
			 */
			if (isipv6 && !ip6_use_deprecated) {
				struct in6_ifaddr *ia6;

				if ((ia6 = ip6_getdstifaddr(m)) &&
				    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
					tp = NULL;
					rstreason = BANDLIM_RST_OPENPORT;
					goto dropwithreset;
				}
			}
#endif
			if (so->so_filt) {
				if (isipv6) {
					struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&from;

					sin6->sin6_len = sizeof(*sin6);
					sin6->sin6_family = AF_INET6;
					sin6->sin6_port = th->th_sport;
					sin6->sin6_flowinfo = 0;
					sin6->sin6_addr = ip6->ip6_src;
					sin6->sin6_scope_id = 0;
				} else {
					struct sockaddr_in *sin = (struct sockaddr_in *)&from;

					sin->sin_len = sizeof(*sin);
					sin->sin_family = AF_INET;
					sin->sin_port = th->th_sport;
					sin->sin_addr = ip->ip_src;
				}
				so2 = sonewconn(so, 0, (struct sockaddr *)&from);
			} else {
				so2 = sonewconn(so, 0, NULL);
			}
			if (so2 == 0) {
				tcpstat.tcps_listendrop++;
				if (tcpdropdropablreq(so)) {
					if (so->so_filt)
						so2 = sonewconn(so, 0, (struct sockaddr *)&from);
					else
						so2 = sonewconn(so, 0, NULL);
				}
				if (!so2)
					goto drop;
			}
			/*
			 * Make sure the listening socket did not get closed
			 * during socket allocation; not only is that incorrect,
			 * it is known to cause a panic.
			 */
			if (so->so_gencnt != ogencnt)
				goto drop;

			oso = so;
			tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */

			so = so2;
			tcp_lock(so, 1, 0);
			/*
			 * This is ugly, but ....
			 *
			 * Mark socket as temporary until we're
			 * committed to keeping it.  The code at
			 * ``drop'' and ``dropwithreset'' check the
			 * flag dropsocket to see if the temporary
			 * socket created here should be discarded.
			 * We mark the socket as discardable until
			 * we're committed to it below in TCPS_LISTEN.
			 */
			dropsocket++;
			inp = (struct inpcb *)so->so_pcb;
#if INET6
			if (isipv6)
				inp->in6p_laddr = ip6->ip6_dst;
			else {
				inp->inp_vflag &= ~INP_IPV6;
				inp->inp_vflag |= INP_IPV4;
#endif /* INET6 */
				inp->inp_laddr = ip->ip_dst;
#if INET6
			}
#endif /* INET6 */
			inp->inp_lport = th->th_dport;
			if (in_pcbinshash(inp, 0) != 0) {
				/*
				 * Undo the assignments above if we failed to
				 * put the PCB on the hash lists.
				 */
#if INET6
				if (isipv6)
					inp->in6p_laddr = in6addr_any;
				else
#endif /* INET6 */
					inp->inp_laddr.s_addr = INADDR_ANY;
				inp->inp_lport = 0;
				tcp_lock(oso, 0, 0);	/* release ref on parent */
				tcp_unlock(oso, 1, 0);
				goto drop;
			}
#if IPSEC
			/*
			 * To avoid creating an incorrectly cached IPsec
			 * association, this needs to be done here.
			 *
			 * Subject: (KAME-snap 748)
			 * From: Wayne Knowles <w.knowles@niwa.cri.nz>
			 * ftp://ftp.kame.net/pub/mail-list/snap-users/748
			 */
			if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				/*
				 * Note: dropwithreset makes sure we don't
				 * send a RST in response to a RST.
				 */
				tcp_lock(oso, 0, 0);	/* release ref on parent */
				tcp_unlock(oso, 1, 0);
				if (thflags & TH_ACK) {
					tcpstat.tcps_badsyn++;
					rstreason = BANDLIM_RST_OPENPORT;
					goto dropwithreset;
				}
				goto drop;
			}
#endif
#if INET6
			if (isipv6) {
				/*
				 * Inherit socket options from the listening
				 * socket.
				 * Note that in6p_inputopts are not (and
				 * should not be) copied, since they store
				 * previously received options and are used to
				 * detect if each new option is different from
				 * the previous one and hence should be passed
				 * to a user.
				 * If we copied in6p_inputopts, a user would
				 * not be able to receive options just after
				 * calling the accept system call.
				 */
				inp->inp_flags |=
				    oinp->inp_flags & INP_CONTROLOPTS;
				if (oinp->in6p_outputopts)
					inp->in6p_outputopts =
					    ip6_copypktopts(oinp->in6p_outputopts,
						M_NOWAIT);
			} else
#endif /* INET6 */
				inp->inp_options = ip_srcroute();
			tcp_lock(oso, 0, 0);
#if IPSEC
			/* copy old policy into new socket's */
			if (sotoinpcb(oso)->inp_sp)
			{
				int error = 0;
				lck_mtx_lock(sadb_mutex);
				/* Is it a security hole here to silently fail to copy the policy? */
				if (inp->inp_sp != NULL)
					error = ipsec_init_policy(so, &inp->inp_sp);
				if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
					printf("tcp_input: could not copy policy\n");
				lck_mtx_unlock(sadb_mutex);
			}
#endif
			tcp_unlock(oso, 1, 0); /* now drop the reference on the listener */
			tp = intotcpcb(inp);
			tp->t_state = TCPS_LISTEN;
			tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
			tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
			/* Compute proper scaling value from buffer space */
			while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
			    TCP_MAXWIN << tp->request_r_scale <
			    so->so_rcv.sb_hiwat)
				tp->request_r_scale++;
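			/*
			 * Example: with a 256KB receive buffer the loop stops
			 * at request_r_scale == 3, since TCP_MAXWIN << 2
			 * (262140) is still smaller than 262144 while
			 * TCP_MAXWIN << 3 covers it.
			 */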

			KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0);
		}
	}

#if 1
	lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
#endif
	/*
	 * Radar 3529618
	 * This is the second part of the MSS DoS prevention code (after
	 * minmss on the sending side) and it deals with too many too small
	 * tcp packets in a too short timeframe (1 second).
	 *
	 * For every full second we count the number of received packets
	 * and bytes.  If we get a lot of packets per second for this
	 * connection (tcp_minmssoverload) we take a closer look at it and
	 * compute the average packet size for the past second.  If that is
	 * less than tcp_minmss we get too many packets with a very small
	 * payload, which is not good and burdens our system (and every
	 * packet generates a wakeup to the process connected to our
	 * socket).  We can reasonably expect this to be a small-packet DoS
	 * attack trying to exhaust our CPU cycles.
	 *
	 * Care has to be taken with the minimum packet overload value.
	 * This value defines the minimum number of packets per second
	 * before we start to worry.  It must not be set too low, to avoid
	 * killing, for example, interactive connections with many small
	 * packets like telnet or SSH.
	 *
	 * Account for the packet if it carries payload; skip over ACKs, etc.
	 *
	 * The packets-per-second count is maintained all the time and is
	 * also used by "DELAY_ACK" to detect streaming situations.
	 */
	if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
		if (tp->rcv_reset > tcp_now) {
			tp->rcv_pps++;
			tp->rcv_byps += tlen + off;
			/*
			 * Setting either tcp_minmssoverload or tcp_minmss
			 * to "0" disables the check.
			 */
			if (tcp_minmss && tcp_minmssoverload && tp->rcv_pps > tcp_minmssoverload) {
				if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
					char	ipstrbuf[MAX_IPv6_STR_LEN];
					printf("too many small tcp packets from "
					       "%s:%u, av. %lubyte/packet, "
					       "dropping connection\n",
#if INET6
					       isipv6 ?
					       inet_ntop(AF_INET6, &inp->in6p_faddr, ipstrbuf,
							 sizeof(ipstrbuf)) :
#endif
					       inet_ntop(AF_INET, &inp->inp_faddr, ipstrbuf,
							 sizeof(ipstrbuf)),
					       inp->inp_fport,
					       tp->rcv_byps / tp->rcv_pps);
					tp = tcp_drop(tp, ECONNRESET);
					/* tcpstat.tcps_minmssdrops++; */
					goto drop;
				}
			}
		} else {
			tp->rcv_reset = tcp_now + PR_SLOWHZ;
			tp->rcv_pps = 1;
			tp->rcv_byps = tlen + off;
		}
	}
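	/*
	 * Example of the accounting above: a peer blasting 1200 packets of
	 * 40 payload bytes within one second yields rcv_pps == 1200 and an
	 * average (rcv_byps / rcv_pps) of only a few dozen bytes; if that
	 * average is below tcp_minmss and rcv_pps exceeds
	 * tcp_minmssoverload, the connection is dropped as a suspected
	 * small-packet DoS.  A bulk transfer at the same packet rate
	 * carries full-sized segments and passes the test.
	 */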

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = 0;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);

	/*
	 * Process options if not in LISTEN state,
	 * else do it below (after getting remote address).
	 */
	if (tp->t_state != TCPS_LISTEN && optp)
		tcp_dooptions(tp, optp, optlen, th, &to);

	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		if (to.to_flags & TOF_SCALE) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->requested_s_scale = to.to_requested_s_scale;
		}
		if (to.to_flags & TOF_TS) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = tcp_now;
		}
		if (to.to_flags & TOF_MSS)
			tcp_mss(tp, to.to_mss);
		if (tp->sack_enable) {
			if (!(to.to_flags & TOF_SACK))
				tp->sack_enable = 0;
			else
				tp->t_flags |= TF_SACK_PERMIT;
		}
	}

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED above, it can only
	 * be TH_NEEDSYN.
	 */
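	/*
	 * Concretely: a pure ACK that only advances snd_una takes the first
	 * branch below (sender side of a one-way transfer); an in-order data
	 * segment that changes nothing else takes the second (receiver
	 * side).  Anything more complicated falls through to the full state
	 * machine.
	 */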
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
	    ((to.to_flags & TOF_TS) == 0 ||
	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
	    th->th_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * NOTE that the test is modified according to the latest
		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
		 */
		if ((to.to_flags & TOF_TS) != 0 &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = to.to_tsval;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    ((!tcp_do_newreno && !tp->sack_enable &&
			      tp->t_dupacks < tcprexmtthresh) ||
			     ((tcp_do_newreno || tp->sack_enable) &&
			      !IN_FASTRECOVERY(tp) && to.to_nsacks == 0 &&
			      TAILQ_EMPTY(&tp->snd_holes)))) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				/*
				 * "bad retransmit" recovery
				 */
				if (tp->t_rxtshift == 1 &&
				    tcp_now < tp->t_badrxtwin) {
					tp->snd_cwnd = tp->snd_cwnd_prev;
					tp->snd_ssthresh =
					    tp->snd_ssthresh_prev;
					tp->snd_recover = tp->snd_recover_prev;
					if (tp->t_flags & TF_WASFRECOVERY)
						ENTER_FASTRECOVERY(tp);
					tp->snd_nxt = tp->snd_max;
					tp->t_badrxtwin = 0;
				}
				if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0)) /* Makes sure we already have a TS */
					tcp_xmit_timer(tp,
					    tcp_now - to.to_tsecr + 1);
				else if (tp->t_rtttime &&
					 SEQ_GT(th->th_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, tp->t_rtttime);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				sbdrop(&so->so_snd, acked);
				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
				    SEQ_LEQ(th->th_ack, tp->snd_recover))
					tp->snd_recover = th->th_ack - 1;
				tp->snd_una = th->th_ack;
				/*
				 * pull snd_wl2 up to prevent seq wrap relative
				 * to th_ack.
				 */
				tp->snd_wl2 = th->th_ack;
				tp->t_dupacks = 0;
				m_freem(m);
				ND6_HINT(tp); /* some progress has been done */

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					tp->t_timer[TCPT_REXMT] = 0;
				else if (tp->t_timer[TCPT_PERSIST] == 0)
					tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

				sowwakeup(so); /* has to be done with socket lock held */
				if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW))
					(void) tcp_output(tp);
				tcp_unlock(so, 1, 0);
				KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
				return;
			}
		} else if (th->th_ack == tp->snd_una &&
		    LIST_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			/*
			 * this is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			/*
			 * Pull snd_wl1 up to prevent seq wrap relative to
			 * th_seq.
			 */
			tp->snd_wl1 = th->th_seq;
			/*
			 * Pull rcv_up up to prevent seq wrap relative to
			 * rcv_nxt.
			 */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);	/* some progress has been done */
			/*
			 * Add data to socket buffer.
			 */
			m_adj(m, drop_hdrlen);	/* delayed header drop */
			if (sbappend(&so->so_rcv, m))
				sorwakeup(so);
#if INET6
			if (isipv6) {
				KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
				    (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
				    th->th_seq, th->th_ack, th->th_win);
			}
			else
#endif
			{
				KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
				    (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
				    th->th_seq, th->th_ack, th->th_win);
			}
			if (DELAY_ACK(tp)) {
				tp->t_flags |= TF_DELACK;
			} else {
				tp->t_flags |= TF_ACKNOW;
				tcp_output(tp);
			}
			tcp_unlock(so, 1, 0);
			KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
			return;
		}
	}

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
#if 1
	lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
#endif
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	else {	/* clip rcv window to 4K for modems */
		if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
			win = min(win, slowlink_wsize);
	}
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
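	/*
	 * Example: if only 16KB of socket buffer is free but we previously
	 * advertised up to rcv_adv == rcv_nxt + 32768, rcv_wnd stays at
	 * 32768 so the window is never seen to shrink (RFC 793/1122).
	 */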

	switch (tp->t_state) {

	/*
	 * If the state is LISTEN then ignore segment if it contains an RST.
	 * If the segment contains an ACK then it is bad and send a RST.
	 * If it does not contain a SYN then it is not interesting; drop it.
	 * If it is from this socket, drop it, it must be forged.
	 * Don't bother responding if the destination was a broadcast.
	 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
	 * tp->iss, and send a segment:
	 *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
	 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
	 * Fill in remote peer address fields if not previously specified.
	 * Enter SYN_RECEIVED state, and process any other fields of this
	 * segment in this state.
	 */
	case TCPS_LISTEN: {
		register struct sockaddr_in *sin;
#if INET6
		register struct sockaddr_in6 *sin6;
#endif

#if 1
		lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
#endif
		if (thflags & TH_RST)
			goto drop;
		if (thflags & TH_ACK) {
			rstreason = BANDLIM_RST_OPENPORT;
			goto dropwithreset;
		}
		if ((thflags & TH_SYN) == 0)
			goto drop;
		if (th->th_dport == th->th_sport) {
#if INET6
			if (isipv6) {
				if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
				    &ip6->ip6_src))
					goto drop;
			} else
#endif /* INET6 */
			if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
				goto drop;
		}
		/*
		 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
		 * in_broadcast() should never return true on a received
		 * packet with M_BCAST not set.
		 *
		 * Packets with a multicast source address should also
		 * be discarded.
		 */
		if (m->m_flags & (M_BCAST|M_MCAST))
			goto drop;
#if INET6
		if (isipv6) {
			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
				goto drop;
		} else
#endif
		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
#if INET6
		if (isipv6) {
			MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
			       M_SONAME, M_NOWAIT);
			if (sin6 == NULL)
				goto drop;
			bzero(sin6, sizeof(*sin6));
			sin6->sin6_family = AF_INET6;
			sin6->sin6_len = sizeof(*sin6);
			sin6->sin6_addr = ip6->ip6_src;
			sin6->sin6_port = th->th_sport;
			laddr6 = inp->in6p_laddr;
			if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
				inp->in6p_laddr = ip6->ip6_dst;
			if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
			    proc0)) {
				inp->in6p_laddr = laddr6;
				FREE(sin6, M_SONAME);
				goto drop;
			}
			FREE(sin6, M_SONAME);
		} else
#endif
		{
#if 1
			lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
#endif
			MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
			       M_NOWAIT);
			if (sin == NULL)
				goto drop;
			sin->sin_family = AF_INET;
			sin->sin_len = sizeof(*sin);
			sin->sin_addr = ip->ip_src;
			sin->sin_port = th->th_sport;
			bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
			laddr = inp->inp_laddr;
			if (inp->inp_laddr.s_addr == INADDR_ANY)
				inp->inp_laddr = ip->ip_dst;
			if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0)) {
				inp->inp_laddr = laddr;
				FREE(sin, M_SONAME);
				goto drop;
			}
			FREE(sin, M_SONAME);
		}

		tcp_dooptions(tp, optp, optlen, th, &to);

		if (tp->sack_enable) {
			if (!(to.to_flags & TOF_SACK))
				tp->sack_enable = 0;
			else
				tp->t_flags |= TF_SACK_PERMIT;
		}

		if (iss)
			tp->iss = iss;
		else {
			tp->iss = tcp_new_isn(tp);
		}
		tp->irs = th->th_seq;
		tcp_sendseqinit(tp);
		tcp_rcvseqinit(tp);
		tp->snd_recover = tp->snd_una;
		/*
		 * Initialization of the tcpcb for transaction;
		 *   set SND.WND = SEG.WND,
		 *   initialize CCsend and CCrecv.
		 */
		tp->snd_wnd = tiwin;	/* initial send-window */
		tp->t_flags |= TF_ACKNOW;
		tp->t_state = TCPS_SYN_RECEIVED;
		tp->t_timer[TCPT_KEEP] = tcp_keepinit;
		dropsocket = 0;		/* committed to socket */
		tcpstat.tcps_accepts++;
		goto trimthenstep6;
		}

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
	 */
	case TCPS_SYN_RECEIVED:
		if ((thflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
		     SEQ_GT(th->th_ack, tp->snd_max))) {
			rstreason = BANDLIM_RST_OPENPORT;
			goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((thflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max))) {
			rstreason = BANDLIM_UNLIMITED;
			goto dropwithreset;
		}
		if (thflags & TH_RST) {
			if (thflags & TH_ACK) {
				tp = tcp_drop(tp, ECONNREFUSED);
				postevent(so, 0, EV_RESET);
			}
			goto drop;
		}
		if ((thflags & TH_SYN) == 0)
			goto drop;
		tp->snd_wnd = th->th_win;	/* initial send window */

		tp->irs = th->th_seq;
		tcp_rcvseqinit(tp);
		if (thflags & TH_ACK) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tp->rcv_adv += tp->rcv_wnd;
			tp->snd_una++;		/* SYN is acked */
			/*
			 * If there's data, delay ACK; if there's also a FIN
			 * ACKNOW will be turned on later.
			 */
			if (DELAY_ACK(tp) && tlen != 0) {
				tp->t_flags |= TF_DELACK;
			}
			else {
				tp->t_flags |= TF_ACKNOW;
			}
			/*
			 * Received <SYN,ACK> in SYN_SENT[*] state.
			 * Transitions:
			 *	SYN_SENT  --> ESTABLISHED
			 *	SYN_SENT* --> FIN_WAIT_1
			 */
			tp->t_starttime = 0;
			if (tp->t_flags & TF_NEEDFIN) {
				tp->t_state = TCPS_FIN_WAIT_1;
				tp->t_flags &= ~TF_NEEDFIN;
				thflags &= ~TH_SYN;
			} else {
				tp->t_state = TCPS_ESTABLISHED;
				tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
			}
		} else {
			/*
			 * Received initial SYN in SYN-SENT[*] state => simul-
			 * taneous open.  If segment contains CC option and there is
			 * a cached CC, apply TAO test; if it succeeds, connection is
			 * half-synchronized.  Otherwise, do 3-way handshake:
			 *        SYN-SENT  -> SYN-RECEIVED
			 *        SYN-SENT* -> SYN-RECEIVED*
			 */
			tp->t_flags |= TF_ACKNOW;
			tp->t_timer[TCPT_REXMT] = 0;
			tp->t_state = TCPS_SYN_RECEIVED;
		}

trimthenstep6:
		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			thflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		/*
		 * Client side of transaction: already sent SYN and data.
		 * If the remote host used T/TCP to validate the SYN,
		 * our data will be ACK'd; if so, enter normal data segment
		 * processing in the middle of step 5, ack processing.
		 * Otherwise, goto step 6.
		 */
		if (thflags & TH_ACK)
			goto process_ACK;
		goto step6;
	/*
	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
	 *	do normal processing.
	 *
	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
	 */
	case TCPS_LAST_ACK:
	case TCPS_CLOSING:
	case TCPS_TIME_WAIT:
		break;			/* continue normal processing */

	/*
	 * Received a SYN while the connection is already established.
	 * This is the "half open connection and other anomalies" case
	 * described in RFC793 page 34; send an ACK so the remote side
	 * either resets the connection or recovers by adjusting its
	 * sequence numbering.
	 */
	case TCPS_ESTABLISHED:
		if (thflags & TH_SYN)
			goto dropafterack;
		break;
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check the RST flag and sequence number since reset segments
	 * are exempt from the timestamp and connection count tests.  This
	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
	 * below which allowed reset segments in half the sequence space
	 * to fall through and be processed (which gives forged reset
	 * segments with a random sequence number a 50 percent chance of
	 * killing a connection).
	 * Then check timestamp, if present.
	 * Then check the connection count, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 *
	 * If the RST bit is set, check the sequence number to see
	 * if this is a valid reset segment.
	 * RFC 793 page 37:
	 *   In all states except SYN-SENT, all reset (RST) segments
	 *   are validated by checking their SEQ-fields.  A reset is
	 *   valid if its sequence number is in the window.
	 * Note: this does not take into account delayed ACKs, so
	 *   we should test against last_ack_sent instead of rcv_nxt.
	 *   The sequence number in the reset segment is normally an
	 *   echo of our outgoing acknowledgement numbers, but some hosts
	 *   send a reset with the sequence number at the rightmost edge
	 *   of our receive window, and we have to handle this case.
	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
	 *   that brute force RST attacks are possible.  To combat this,
	 *   we use a much stricter check while in the ESTABLISHED state,
	 *   only accepting RSTs where the sequence number is equal to
	 *   last_ack_sent.  In all other states (the states in which a
	 *   RST is more likely), the more permissive check is used.
	 * If we have multiple segments in flight, the initial reset
	 * segment sequence numbers will be to the left of last_ack_sent,
	 * but they will eventually catch up.
	 * In any case, it never made sense to trim reset segments to
	 * fit the receive window since RFC 1122 says:
	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
	 *
	 *     A TCP SHOULD allow a received RST segment to include data.
	 *
	 *     DISCUSSION
	 *          It has been suggested that a RST segment could contain
	 *          ASCII text that encoded and explained the cause of the
	 *          RST.  No standard has yet been established for such
	 *          data.
	 *
	 * If the reset segment passes the sequence number test examine
	 * the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK STATES:
	 *	Close the tcb.
	 *    TIME_WAIT STATE:
	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
	 *      RFC 1337.
	 */
1788 if (thflags & TH_RST) {
1789 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1790 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1791 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1792 switch (tp->t_state) {
1793
1794 case TCPS_SYN_RECEIVED:
1795 so->so_error = ECONNREFUSED;
1796 goto close;
1797
1798 case TCPS_ESTABLISHED:
1799 if (tp->last_ack_sent != th->th_seq) {
1800 goto drop;
1801 }
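			/* FALLTHROUGH */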
1802 case TCPS_FIN_WAIT_1:
1803 case TCPS_CLOSE_WAIT:
			/* FALLTHROUGH */
1807 case TCPS_FIN_WAIT_2:
1808 so->so_error = ECONNRESET;
1809 close:
1810 postevent(so, 0, EV_RESET);
1811 tp->t_state = TCPS_CLOSED;
1812 tcpstat.tcps_drops++;
1813 tp = tcp_close(tp);
1814 break;
1815
1816 case TCPS_CLOSING:
1817 case TCPS_LAST_ACK:
1818 tp = tcp_close(tp);
1819 break;
1820
1821 case TCPS_TIME_WAIT:
1822 break;
1823 }
1824 }
1825 goto drop;
1826 }
1827
1828 #if 1
1829 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1830 #endif
1831 /*
1832 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1833 * and it's less than ts_recent, drop it.
1834 */
1835 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
1836 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1837
1838 /* Check to see if ts_recent is over 24 days old. */
1839 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1840 /*
1841 * Invalidate ts_recent. If this segment updates
1842 * ts_recent, the age will be reset later and ts_recent
1843 * will get a valid value. If it does not, setting
1844 * ts_recent to zero will at least satisfy the
1845 * requirement that zero be placed in the timestamp
1846 * echo reply when ts_recent isn't valid. The
1847 * age isn't reset until we get a valid ts_recent
1848 * because we don't want out-of-order segments to be
1849 * dropped when ts_recent is old.
1850 */
1851 tp->ts_recent = 0;
1852 } else {
1853 tcpstat.tcps_rcvduppack++;
1854 tcpstat.tcps_rcvdupbyte += tlen;
1855 tcpstat.tcps_pawsdrop++;
1856 if (tlen)
1857 goto dropafterack;
1858 goto drop;
1859 }
1860 }
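	/*
	 * Example (illustrative values): if ts_recent is 150000 and a
	 * segment arrives carrying to_tsval 149000, TSTMP_LT() marks it
	 * older than the last accepted timestamp; unless ts_recent is
	 * more than TCP_PAWS_IDLE (24 days) stale, the segment is
	 * treated as a PAWS duplicate -- acked if it carried data,
	 * silently dropped otherwise.
	 */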
1861
1862 /*
1863 * In the SYN-RECEIVED state, validate that the packet belongs to
1864 * this connection before trimming the data to fit the receive
1865 * window. Check the sequence number versus IRS since we know
1866 * the sequence numbers haven't wrapped. This is a partial fix
1867 * for the "LAND" DoS attack.
1868 */
1869 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1870 rstreason = BANDLIM_RST_OPENPORT;
1871 goto dropwithreset;
1872 }
1873
1874 todrop = tp->rcv_nxt - th->th_seq;
1875 if (todrop > 0) {
1876 if (thflags & TH_SYN) {
1877 thflags &= ~TH_SYN;
1878 th->th_seq++;
1879 if (th->th_urp > 1)
1880 th->th_urp--;
1881 else
1882 thflags &= ~TH_URG;
1883 todrop--;
1884 }
1885 /*
1886 * Following if statement from Stevens, vol. 2, p. 960.
1887 */
1888 if (todrop > tlen
1889 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1890 /*
1891 * Any valid FIN must be to the left of the window.
1892 * At this point the FIN must be a duplicate or out
1893 * of sequence; drop it.
1894 */
1895 thflags &= ~TH_FIN;
1896
1897 /*
1898 * Send an ACK to resynchronize and drop any data.
1899 * But keep on processing for RST or ACK.
1900 */
1901 tp->t_flags |= TF_ACKNOW;
1902 todrop = tlen;
1903 tcpstat.tcps_rcvduppack++;
1904 tcpstat.tcps_rcvdupbyte += todrop;
1905 } else {
1906 tcpstat.tcps_rcvpartduppack++;
1907 tcpstat.tcps_rcvpartdupbyte += todrop;
1908 }
1909 drop_hdrlen += todrop; /* drop from the top afterwards */
1910 th->th_seq += todrop;
1911 tlen -= todrop;
1912 if (th->th_urp > todrop)
1913 th->th_urp -= todrop;
1914 else {
1915 thflags &= ~TH_URG;
1916 th->th_urp = 0;
1917 }
1918 }
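	/*
	 * Example of the trimming above: if rcv_nxt is 1000 and a
	 * 500-byte segment arrives with th_seq 900, todrop is 100; the
	 * first 100 bytes are duplicates, so drop_hdrlen grows by 100,
	 * th_seq advances to 1000, and tlen shrinks to 400 so that only
	 * new data remains.
	 */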
1919
1920 /*
1921 * If new data are received on a connection after the
1922 * user processes are gone, then RST the other end.
1923 */
1924 if ((so->so_state & SS_NOFDREF) &&
1925 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1926 tp = tcp_close(tp);
1927 tcpstat.tcps_rcvafterclose++;
1928 rstreason = BANDLIM_UNLIMITED;
1929 goto dropwithreset;
1930 }
1931
1932 /*
1933 * If segment ends after window, drop trailing data
1934 * (and PUSH and FIN); if nothing left, just ACK.
1935 */
1936 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1937 if (todrop > 0) {
1938 tcpstat.tcps_rcvpackafterwin++;
1939 if (todrop >= tlen) {
1940 tcpstat.tcps_rcvbyteafterwin += tlen;
1941 /*
1942 * If a new connection request is received
1943 * while in TIME_WAIT, drop the old connection
1944 * and start over if the sequence numbers
1945 * are above the previous ones.
1946 */
1947 if (thflags & TH_SYN &&
1948 tp->t_state == TCPS_TIME_WAIT &&
1949 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1950 iss = tcp_new_isn(tp);
1951 tp = tcp_close(tp);
1952 tcp_unlock(so, 1, 0);
1953 goto findpcb;
1954 }
1955 /*
1956 * If window is closed can only take segments at
1957 * window edge, and have to drop data and PUSH from
1958 * incoming segments. Continue processing, but
1959 * remember to ack. Otherwise, drop segment
1960 * and ack.
1961 */
1962 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1963 tp->t_flags |= TF_ACKNOW;
1964 tcpstat.tcps_rcvwinprobe++;
1965 } else
1966 goto dropafterack;
1967 } else
1968 tcpstat.tcps_rcvbyteafterwin += todrop;
1969 m_adj(m, -todrop);
1970 tlen -= todrop;
1971 thflags &= ~(TH_PUSH|TH_FIN);
1972 }
1973
1974 /*
1975 * If last ACK falls within this segment's sequence numbers,
1976 * record its timestamp.
1977 * NOTE:
1978 * 1) That the test incorporates suggestions from the latest
1979 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1980 * 2) That updating only on newer timestamps interferes with
1981 * our earlier PAWS tests, so this check should be solely
1982 * predicated on the sequence space of this segment.
1983 * 3) That we modify the segment boundary check to be
1984 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
1985 * instead of RFC1323's
1986 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
1987 * This modified check allows us to overcome RFC1323's
1988 * limitations as described in Stevens TCP/IP Illustrated
1989 * Vol. 2 p.869. In such cases, we can still calculate the
1990 * RTT correctly when RCV.NXT == Last.ACK.Sent.
1991 */
1992 if ((to.to_flags & TOF_TS) != 0 &&
1993 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1994 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1995 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
1996 tp->ts_recent_age = tcp_now;
1997 tp->ts_recent = to.to_tsval;
1998 }
1999
2000 /*
2001 * If a SYN is in the window, then this is an
2002 * error and we send an RST and drop the connection.
2003 */
2004 if (thflags & TH_SYN) {
2005 tp = tcp_drop(tp, ECONNRESET);
2006 rstreason = BANDLIM_UNLIMITED;
2007 postevent(so, 0, EV_RESET);
2008 goto dropwithreset;
2009 }
2010
2011 /*
	 * If the ACK bit is off: if in SYN-RECEIVED state or the
	 * TF_NEEDSYN flag is on (half-synchronized state), then queue
	 * data for later processing; else drop the segment and return.
2015 */
2016 if ((thflags & TH_ACK) == 0) {
2017 if (tp->t_state == TCPS_SYN_RECEIVED ||
2018 (tp->t_flags & TF_NEEDSYN))
2019 goto step6;
2020 else
2021 goto drop;
2022 }
2023
2024 /*
2025 * Ack processing.
2026 */
2027 switch (tp->t_state) {
2028
2029 /*
2030 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
2031 * ESTABLISHED state and continue processing.
2032 * The ACK was checked above.
2033 */
2034 case TCPS_SYN_RECEIVED:
2035
2036 tcpstat.tcps_connects++;
2037 soisconnected(so);
2038
2039 /* Do window scaling? */
2040 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2041 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2042 tp->snd_scale = tp->requested_s_scale;
2043 tp->rcv_scale = tp->request_r_scale;
2044 }
2045 /*
2046 * Make transitions:
2047 * SYN-RECEIVED -> ESTABLISHED
2048 * SYN-RECEIVED* -> FIN-WAIT-1
2049 */
2050 tp->t_starttime = 0;
2051 if (tp->t_flags & TF_NEEDFIN) {
2052 tp->t_state = TCPS_FIN_WAIT_1;
2053 tp->t_flags &= ~TF_NEEDFIN;
2054 } else {
2055 tp->t_state = TCPS_ESTABLISHED;
2056 tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
2057 }
2058 /*
2059 * If segment contains data or ACK, will call tcp_reass()
2060 * later; if not, do so now to pass queued data to user.
2061 */
2062 if (tlen == 0 && (thflags & TH_FIN) == 0)
2063 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
2064 (struct mbuf *)0);
2065 tp->snd_wl1 = th->th_seq - 1;
2066 /* FALLTHROUGH */
2067
2068 /*
	 * In ESTABLISHED state: drop duplicate ACKs and ACK out-of-range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < th->th_ack <= tp->snd_max
	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up-to-date window information, we update our window
	 * information.
2075 */
2076 case TCPS_ESTABLISHED:
2077 case TCPS_FIN_WAIT_1:
2078 case TCPS_FIN_WAIT_2:
2079 case TCPS_CLOSE_WAIT:
2080 case TCPS_CLOSING:
2081 case TCPS_LAST_ACK:
2082 case TCPS_TIME_WAIT:
2083 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2084 tcpstat.tcps_rcvacktoomuch++;
2085 goto dropafterack;
2086 }
2087 if (tp->sack_enable &&
2088 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
2089 tcp_sack_doack(tp, &to, th->th_ack);
2090 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2091 if (tlen == 0 && tiwin == tp->snd_wnd) {
2092 tcpstat.tcps_rcvdupack++;
2093 /*
2094 * If we have outstanding data (other than
2095 * a window probe), this is a completely
2096 * duplicate ack (ie, window info didn't
2097 * change), the ack is the biggest we've
2098 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
2100 * has been dropped and retransmit it.
2101 * Kludge snd_nxt & the congestion
2102 * window so we send only this one
2103 * packet.
2104 *
2105 * We know we're losing at the current
2106 * window size so do congestion avoidance
2107 * (set ssthresh to half the current window
2108 * and pull our congestion window back to
2109 * the new ssthresh).
2110 *
2111 * Dup acks mean that packets have left the
2112 * network (they're now cached at the receiver)
2113 * so bump cwnd by the amount in the receiver
2114 * to keep a constant cwnd packets in the
2115 * network.
2116 */
2117 if (tp->t_timer[TCPT_REXMT] == 0 ||
2118 th->th_ack != tp->snd_una)
2119 tp->t_dupacks = 0;
2120 else if (++tp->t_dupacks > tcprexmtthresh ||
2121 ((tcp_do_newreno || tp->sack_enable) &&
2122 IN_FASTRECOVERY(tp))) {
2123 if (tp->sack_enable && IN_FASTRECOVERY(tp)) {
2124 int awnd;
2125
2126 /*
2127 * Compute the amount of data in flight first.
2128 * We can inject new data into the pipe iff
2129 * we have less than 1/2 the original window's
2130 * worth of data in flight.
2131 */
2132 awnd = (tp->snd_nxt - tp->snd_fack) +
2133 tp->sackhint.sack_bytes_rexmit;
2134 if (awnd < tp->snd_ssthresh) {
2135 tp->snd_cwnd += tp->t_maxseg;
2136 if (tp->snd_cwnd > tp->snd_ssthresh)
2137 tp->snd_cwnd = tp->snd_ssthresh;
2138 }
2139 } else
2140 tp->snd_cwnd += tp->t_maxseg;
2141 (void) tcp_output(tp);
2142 goto drop;
2143 } else if (tp->t_dupacks == tcprexmtthresh) {
2144 tcp_seq onxt = tp->snd_nxt;
2145 u_int win;
2146
2147 /*
2148 * If we're doing sack, check to
2149 * see if we're already in sack
2150 * recovery. If we're not doing sack,
2151 * check to see if we're in newreno
2152 * recovery.
2153 */
2154 if (tp->sack_enable) {
2155 if (IN_FASTRECOVERY(tp)) {
2156 tp->t_dupacks = 0;
2157 break;
2158 }
2159 } else if (tcp_do_newreno) {
2160 if (SEQ_LEQ(th->th_ack,
2161 tp->snd_recover)) {
2162 tp->t_dupacks = 0;
2163 break;
2164 }
2165 }
2166 win = min(tp->snd_wnd, tp->snd_cwnd) /
2167 2 / tp->t_maxseg;
2168 if (win < 2)
2169 win = 2;
2170 tp->snd_ssthresh = win * tp->t_maxseg;
2171 ENTER_FASTRECOVERY(tp);
2172 tp->snd_recover = tp->snd_max;
2173 tp->t_timer[TCPT_REXMT] = 0;
2174 tp->t_rtttime = 0;
2175 if (tp->sack_enable) {
2176 tcpstat.tcps_sack_recovery_episode++;
2177 tp->sack_newdata = tp->snd_nxt;
2178 tp->snd_cwnd = tp->t_maxseg;
2179 (void) tcp_output(tp);
2180 goto drop;
2181 }
2182 tp->snd_nxt = th->th_ack;
2183 tp->snd_cwnd = tp->t_maxseg;
2184 (void) tcp_output(tp);
2185 tp->snd_cwnd = tp->snd_ssthresh +
2186 tp->t_maxseg * tp->t_dupacks;
2187 if (SEQ_GT(onxt, tp->snd_nxt))
2188 tp->snd_nxt = onxt;
2189 goto drop;
2190 }
2191 } else
2192 tp->t_dupacks = 0;
2193 break;
2194 }
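		/*
		 * Example (non-SACK path): with snd_wnd = snd_cwnd = 64240
		 * and t_maxseg = 1460, the third duplicate ACK sets
		 * ssthresh to 64240 / 2 / 1460 = 22 segments (32120
		 * bytes), retransmits one segment with cwnd = 1460, then
		 * inflates cwnd to 32120 + 3 * 1460 = 36500; each further
		 * duplicate ACK adds another t_maxseg.
		 */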
2195 /*
2196 * If the congestion window was inflated to account
2197 * for the other side's cached packets, retract it.
2198 */
2199 if (tcp_do_newreno || tp->sack_enable) {
2200 if (IN_FASTRECOVERY(tp)) {
2201 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2202 if (tp->sack_enable)
2203 tcp_sack_partialack(tp, th);
2204 else
2205 tcp_newreno_partial_ack(tp, th);
2206 } else {
2207 /*
2208 * Out of fast recovery.
2209 * Window inflation should have left us
2210 * with approximately snd_ssthresh
2211 * outstanding data.
2212 * But in case we would be inclined to
2213 * send a burst, better to do it via
2214 * the slow start mechanism.
2215 */
2216 if (SEQ_GT(th->th_ack +
2217 tp->snd_ssthresh,
2218 tp->snd_max))
2219 tp->snd_cwnd = tp->snd_max -
2220 th->th_ack +
2221 tp->t_maxseg;
2222 else
2223 tp->snd_cwnd = tp->snd_ssthresh;
2224 }
2225 }
2226 } else {
2227 if (tp->t_dupacks >= tcprexmtthresh &&
2228 tp->snd_cwnd > tp->snd_ssthresh)
2229 tp->snd_cwnd = tp->snd_ssthresh;
2230 }
2231 tp->t_dupacks = 0;
2232 /*
2233 * If we reach this point, ACK is not a duplicate,
2234 * i.e., it ACKs something we sent.
2235 */
2236 if (tp->t_flags & TF_NEEDSYN) {
2237 /*
2238 * T/TCP: Connection was half-synchronized, and our
2239 * SYN has been ACK'd (so connection is now fully
2240 * synchronized). Go to non-starred state,
2241 * increment snd_una for ACK of SYN, and check if
2242 * we can do window scaling.
2243 */
2244 tp->t_flags &= ~TF_NEEDSYN;
2245 tp->snd_una++;
2246 /* Do window scaling? */
2247 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2248 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2249 tp->snd_scale = tp->requested_s_scale;
2250 tp->rcv_scale = tp->request_r_scale;
2251 }
2252 }
2253
2254 process_ACK:
2255 acked = th->th_ack - tp->snd_una;
2256 tcpstat.tcps_rcvackpack++;
2257 tcpstat.tcps_rcvackbyte += acked;
2258
2259 /*
2260 * If we just performed our first retransmit, and the ACK
2261 * arrives within our recovery window, then it was a mistake
2262 * to do the retransmit in the first place. Recover our
2263 * original cwnd and ssthresh, and proceed to transmit where
2264 * we left off.
2265 */
2266 if (tp->t_rxtshift == 1 && tcp_now < tp->t_badrxtwin) {
2267 tp->snd_cwnd = tp->snd_cwnd_prev;
2268 tp->snd_ssthresh = tp->snd_ssthresh_prev;
2269 tp->snd_recover = tp->snd_recover_prev;
2270 if (tp->t_flags & TF_WASFRECOVERY)
2271 ENTER_FASTRECOVERY(tp);
2272 tp->snd_nxt = tp->snd_max;
2273 tp->t_badrxtwin = 0; /* XXX probably not required */
2274 }
2275
2276 /*
2277 * If we have a timestamp reply, update smoothed
2278 * round trip time. If no timestamp is present but
2279 * transmit timer is running and timed sequence
2280 * number was acked, update smoothed round trip time.
2281 * Since we now have an rtt measurement, cancel the
2282 * timer backoff (cf., Phil Karn's retransmit alg.).
2283 * Recompute the initial retransmit timer.
2284 * Also makes sure we have a valid time stamp in hand
2285 */
2286 if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0))
2287 tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1);
2288 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2289 tcp_xmit_timer(tp, tp->t_rtttime);
2290
2291 /*
2292 * If all outstanding data is acked, stop retransmit
2293 * timer and remember to restart (more output or persist).
2294 * If there is more data to be acked, restart retransmit
2295 * timer, using current (possibly backed-off) value.
2296 */
2297 if (th->th_ack == tp->snd_max) {
2298 tp->t_timer[TCPT_REXMT] = 0;
2299 needoutput = 1;
2300 } else if (tp->t_timer[TCPT_PERSIST] == 0)
2301 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
2302
2303 /*
2304 * If no data (only SYN) was ACK'd,
2305 * skip rest of ACK processing.
2306 */
2307 if (acked == 0)
2308 goto step6;
2309
2310 /*
2311 * When new data is acked, open the congestion window.
2312 * If the window gives us less than ssthresh packets
2313 * in flight, open exponentially (maxseg per packet).
2314 * Otherwise open linearly: maxseg per window
2315 * (maxseg^2 / cwnd per packet).
2316 */
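	/*
	 * Example: with t_maxseg = 1460, below ssthresh each ACK grows
	 * cwnd by a full 1460 bytes (slow start); above ssthresh with
	 * cwnd = 29200, each ACK adds only 1460 * 1460 / 29200 = 73
	 * bytes, i.e. about one segment per window's worth of ACKs.
	 */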
2317 if ((!tcp_do_newreno && !tp->sack_enable) ||
2318 !IN_FASTRECOVERY(tp)) {
2319 register u_int cw = tp->snd_cwnd;
2320 register u_int incr = tp->t_maxseg;
2321 if (cw > tp->snd_ssthresh)
2322 incr = incr * incr / cw;
2323 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
2324 }
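	/*
	 * If the ACK covers more sequence space than is sitting in the
	 * send buffer, the extra octet can only be our FIN, so note
	 * that our FIN has been acknowledged.
	 */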
2325 if (acked > so->so_snd.sb_cc) {
2326 tp->snd_wnd -= so->so_snd.sb_cc;
2327 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2328 ourfinisacked = 1;
2329 } else {
2330 sbdrop(&so->so_snd, acked);
2331 tp->snd_wnd -= acked;
2332 ourfinisacked = 0;
2333 }
2334 sowwakeup(so);
2335 /* detect una wraparound */
2336 if ((tcp_do_newreno || tp->sack_enable) &&
2337 !IN_FASTRECOVERY(tp) &&
2338 SEQ_GT(tp->snd_una, tp->snd_recover) &&
2339 SEQ_LEQ(th->th_ack, tp->snd_recover))
2340 tp->snd_recover = th->th_ack - 1;
2341 if ((tcp_do_newreno || tp->sack_enable) &&
2342 IN_FASTRECOVERY(tp) &&
2343 SEQ_GEQ(th->th_ack, tp->snd_recover))
2344 EXIT_FASTRECOVERY(tp);
2345 tp->snd_una = th->th_ack;
2346 if (tp->sack_enable) {
2347 if (SEQ_GT(tp->snd_una, tp->snd_recover))
2348 tp->snd_recover = tp->snd_una;
2349 }
2350 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2351 tp->snd_nxt = tp->snd_una;
2352
2353 switch (tp->t_state) {
2354
2355 /*
2356 * In FIN_WAIT_1 STATE in addition to the processing
2357 * for the ESTABLISHED state if our FIN is now acknowledged
2358 * then enter FIN_WAIT_2.
2359 */
2360 case TCPS_FIN_WAIT_1:
2361 if (ourfinisacked) {
2362 /*
2363 * If we can't receive any more
2364 * data, then closing user can proceed.
2365 * Starting the timer is contrary to the
2366 * specification, but if we don't get a FIN
2367 * we'll hang forever.
2368 */
2369 if (so->so_state & SS_CANTRCVMORE) {
2370 soisdisconnected(so);
2371 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
2372 }
2373 add_to_time_wait(tp);
2374 tp->t_state = TCPS_FIN_WAIT_2;
2375 goto drop;
2376 }
2377 break;
2378
2379 /*
2380 * In CLOSING STATE in addition to the processing for
2381 * the ESTABLISHED state if the ACK acknowledges our FIN
2382 * then enter the TIME-WAIT state, otherwise ignore
2383 * the segment.
2384 */
2385 case TCPS_CLOSING:
2386 if (ourfinisacked) {
2387 tp->t_state = TCPS_TIME_WAIT;
2388 tcp_canceltimers(tp);
2389 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2390 if (tp->cc_recv != 0 &&
2391 tp->t_starttime < tcp_msl)
2392 tp->t_timer[TCPT_2MSL] =
2393 tp->t_rxtcur * TCPTV_TWTRUNC;
2394 else
2395 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2396 add_to_time_wait(tp);
2397 soisdisconnected(so);
2398 }
2399 break;
2400
2401 /*
2402 * In LAST_ACK, we may still be waiting for data to drain
2403 * and/or to be acked, as well as for the ack of our FIN.
2404 * If our FIN is now acknowledged, delete the TCB,
2405 * enter the closed state and return.
2406 */
2407 case TCPS_LAST_ACK:
2408 if (ourfinisacked) {
2409 tp = tcp_close(tp);
2410 goto drop;
2411 }
2412 break;
2413
2414 /*
2415 * In TIME_WAIT state the only thing that should arrive
2416 * is a retransmission of the remote FIN. Acknowledge
2417 * it and restart the finack timer.
2418 */
2419 case TCPS_TIME_WAIT:
2420 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2421 add_to_time_wait(tp);
2422 goto dropafterack;
2423 }
2424 }
2425
2426 step6:
2427 /*
2428 * Update window information.
	 * Don't look at window if no ACK: TACs send garbage on first SYN.
2430 */
2431 if ((thflags & TH_ACK) &&
2432 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2433 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2434 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2435 /* keep track of pure window updates */
2436 if (tlen == 0 &&
2437 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2438 tcpstat.tcps_rcvwinupd++;
2439 tp->snd_wnd = tiwin;
2440 tp->snd_wl1 = th->th_seq;
2441 tp->snd_wl2 = th->th_ack;
2442 if (tp->snd_wnd > tp->max_sndwnd)
2443 tp->max_sndwnd = tp->snd_wnd;
2444 needoutput = 1;
2445 }
2446
2447 /*
2448 * Process segments with URG.
2449 */
2450 if ((thflags & TH_URG) && th->th_urp &&
2451 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2452 /*
2453 * This is a kludge, but if we receive and accept
2454 * random urgent pointers, we'll crash in
2455 * soreceive. It's hard to imagine someone
2456 * actually wanting to send this much urgent data.
2457 */
2458 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2459 th->th_urp = 0; /* XXX */
2460 thflags &= ~TH_URG; /* XXX */
2461 goto dodata; /* XXX */
2462 }
2463 /*
2464 * If this segment advances the known urgent pointer,
2465 * then mark the data stream. This should not happen
2466 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2467 * a FIN has been received from the remote side.
2468 * In these states we ignore the URG.
2469 *
2470 * According to RFC961 (Assigned Protocols),
2471 * the urgent pointer points to the last octet
2472 * of urgent data. We continue, however,
2473 * to consider it to indicate the first octet
2474 * of data past the urgent section as the original
2475 * spec states (in one of two places).
2476 */
2477 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2478 tp->rcv_up = th->th_seq + th->th_urp;
2479 so->so_oobmark = so->so_rcv.sb_cc +
2480 (tp->rcv_up - tp->rcv_nxt) - 1;
2481 if (so->so_oobmark == 0) {
2482 so->so_state |= SS_RCVATMARK;
2483 postevent(so, 0, EV_OOB);
2484 }
2485 sohasoutofband(so);
2486 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2487 }
		/*
		 * Remove out-of-band data so it doesn't get presented to
		 * the user.  This can happen independently of advancing
		 * the URG pointer, but if two URGs are pending at once,
		 * some out-of-band data may creep in... ick.
		 */
2494 if (th->th_urp <= (u_long)tlen
2495 #if SO_OOBINLINE
2496 && (so->so_options & SO_OOBINLINE) == 0
2497 #endif
2498 )
2499 tcp_pulloutofband(so, th, m,
2500 drop_hdrlen); /* hdr drop is delayed */
2501 } else
2502 /*
2503 * If no out of band data is expected,
2504 * pull receive urgent pointer along
2505 * with the receive window.
2506 */
2507 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2508 tp->rcv_up = tp->rcv_nxt;
2509 dodata: /* XXX */
2510
2511 /*
2512 * Process the segment text, merging it into the TCP sequencing queue,
2513 * and arranging for acknowledgment of receipt if necessary.
2514 * This process logically involves adjusting tp->rcv_wnd as data
2515 * is presented to the user (this happens in tcp_usrreq.c,
2516 * case PRU_RCVD). If a FIN has already been received on this
2517 * connection then we just ignore the text.
2518 */
2519 if ((tlen || (thflags&TH_FIN)) &&
2520 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2521 tcp_seq save_start = th->th_seq;
2522 tcp_seq save_end = th->th_seq + tlen;
2523 m_adj(m, drop_hdrlen); /* delayed header drop */
2524 /*
2525 * Insert segment which includes th into TCP reassembly queue
2526 * with control block tp. Set thflags to whether reassembly now
2527 * includes a segment with FIN. This handles the common case
2528 * inline (segment is the next to be received on an established
2529 * connection, and the queue is empty), avoiding linkage into
2530 * and removal from the queue and repetition of various
2531 * conversions.
2532 * Set DELACK for segments received in order, but ack
2533 * immediately when segments are out of order (so
2534 * fast retransmit can work).
2535 */
2536 if (th->th_seq == tp->rcv_nxt &&
2537 LIST_EMPTY(&tp->t_segq) &&
2538 TCPS_HAVEESTABLISHED(tp->t_state)) {
2539 if (DELAY_ACK(tp) && ((tp->t_flags & TF_ACKNOW) == 0)) {
2540 tp->t_flags |= TF_DELACK;
2541 }
2542 else {
2543 tp->t_flags |= TF_ACKNOW;
2544 }
2545 tp->rcv_nxt += tlen;
2546 thflags = th->th_flags & TH_FIN;
2547 tcpstat.tcps_rcvpack++;
2548 tcpstat.tcps_rcvbyte += tlen;
2549 ND6_HINT(tp);
2550 if (sbappend(&so->so_rcv, m))
2551 sorwakeup(so);
2552 } else {
2553 thflags = tcp_reass(tp, th, &tlen, m);
2554 tp->t_flags |= TF_ACKNOW;
2555 }
2556
2557 if (tlen > 0 && tp->sack_enable)
2558 tcp_update_sack_list(tp, save_start, save_end);
2559
2560 if (tp->t_flags & TF_DELACK)
2561 {
2562 #if INET6
2563 if (isipv6) {
2564 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2565 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2566 th->th_seq, th->th_ack, th->th_win);
2567 }
2568 else
2569 #endif
2570 {
2571 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2572 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2573 th->th_seq, th->th_ack, th->th_win);
2574 }
2575
2576 }
2577 /*
2578 * Note the amount of data that peer has sent into
2579 * our window, in order to estimate the sender's
2580 * buffer size.
2581 */
2582 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2583 } else {
2584 m_freem(m);
2585 thflags &= ~TH_FIN;
2586 }
2587
2588 /*
2589 * If FIN is received ACK the FIN and let the user know
2590 * that the connection is closing.
2591 */
2592 if (thflags & TH_FIN) {
2593 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2594 socantrcvmore(so);
2595 postevent(so, 0, EV_FIN);
2596 /*
2597 * If connection is half-synchronized
2598 * (ie NEEDSYN flag on) then delay ACK,
2601 * so it may be piggybacked when SYN is sent.
2602 * Otherwise, since we received a FIN then no
2603 * more input can be expected, send ACK now.
2604 */
2605 if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) {
2606 tp->t_flags |= TF_DELACK;
2607 }
2608 else {
2609 tp->t_flags |= TF_ACKNOW;
2610 }
2611 tp->rcv_nxt++;
2612 }
2613 switch (tp->t_state) {
2614
2615 /*
2616 * In SYN_RECEIVED and ESTABLISHED STATES
2617 * enter the CLOSE_WAIT state.
2618 */
2619 case TCPS_SYN_RECEIVED:
2620 tp->t_starttime = 0;
2621 case TCPS_ESTABLISHED:
2622 tp->t_state = TCPS_CLOSE_WAIT;
2623 break;
2624
2625 /*
2626 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2627 * enter the CLOSING state.
2628 */
2629 case TCPS_FIN_WAIT_1:
2630 tp->t_state = TCPS_CLOSING;
2631 break;
2632
2633 /*
2634 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2635 * starting the time-wait timer, turning off the other
2636 * standard timers.
2637 */
2638 case TCPS_FIN_WAIT_2:
2639 tp->t_state = TCPS_TIME_WAIT;
2640 tcp_canceltimers(tp);
2641 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2642 if (tp->cc_recv != 0 &&
2643 tp->t_starttime < tcp_msl) {
2644 tp->t_timer[TCPT_2MSL] =
2645 tp->t_rxtcur * TCPTV_TWTRUNC;
2646 /* For transaction client, force ACK now. */
2647 tp->t_flags |= TF_ACKNOW;
2648 }
2649 else
2650 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2651
2652 add_to_time_wait(tp);
2653 soisdisconnected(so);
2654 break;
2655
2656 /*
2657 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2658 */
2659 case TCPS_TIME_WAIT:
2660 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2661 add_to_time_wait(tp);
2662 break;
2663 }
2664 }
2665 #if TCPDEBUG
2666 if (so->so_options & SO_DEBUG)
2667 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2668 &tcp_savetcp, 0);
2669 #endif
2670
2671 /*
2672 * Return any desired output.
2673 */
2674 if (needoutput || (tp->t_flags & TF_ACKNOW))
2675 (void) tcp_output(tp);
2676 tcp_unlock(so, 1, 0);
2677 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2678 return;
2679
2680 dropafterack:
2681 /*
2682 * Generate an ACK dropping incoming segment if it occupies
2683 * sequence space, where the ACK reflects our state.
2684 *
2685 * We can now skip the test for the RST flag since all
2686 * paths to this code happen after packets containing
2687 * RST have been dropped.
2688 *
2689 * In the SYN-RECEIVED state, don't send an ACK unless the
2690 * segment we received passes the SYN-RECEIVED ACK test.
2691 * If it fails send a RST. This breaks the loop in the
2692 * "LAND" DoS attack, and also prevents an ACK storm
2693 * between two listening ports that have been sent forged
2694 * SYN segments, each with the source address of the other.
2695 */
2696 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2697 (SEQ_GT(tp->snd_una, th->th_ack) ||
2698 SEQ_GT(th->th_ack, tp->snd_max)) ) {
2699 rstreason = BANDLIM_RST_OPENPORT;
2700 goto dropwithreset;
2701 }
2702 #if TCPDEBUG
2703 if (so->so_options & SO_DEBUG)
2704 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2705 &tcp_savetcp, 0);
2706 #endif
2707 m_freem(m);
2708 tp->t_flags |= TF_ACKNOW;
2709 (void) tcp_output(tp);
2710 tcp_unlock(so, 1, 0);
2711 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2712 return;
2713 dropwithresetnosock:
2714 nosock = 1;
2715 dropwithreset:
2716 /*
2717 * Generate a RST, dropping incoming segment.
2718 * Make ACK acceptable to originator of segment.
2719 * Don't bother to respond if destination was broadcast/multicast.
2720 */
2721 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2722 goto drop;
2723 #if INET6
2724 if (isipv6) {
2725 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2726 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2727 goto drop;
2728 } else
2729 #endif /* INET6 */
2730 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2731 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2732 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2733 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2734 goto drop;
2735 /* IPv6 anycast check is done at tcp6_input() */
2736
2737 /*
2738 * Perform bandwidth limiting.
2739 */
2740 #if ICMP_BANDLIM
2741 if (badport_bandlim(rstreason) < 0)
2742 goto drop;
2743 #endif
2744
2745 #if TCPDEBUG
2746 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2747 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2748 &tcp_savetcp, 0);
2749 #endif
2750 if (thflags & TH_ACK)
2751 /* mtod() below is safe as long as hdr dropping is delayed */
2752 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
2753 TH_RST);
2754 else {
2755 if (thflags & TH_SYN)
2756 tlen++;
2757 /* mtod() below is safe as long as hdr dropping is delayed */
2758 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2759 (tcp_seq)0, TH_RST|TH_ACK);
2760 }
2761 /* destroy temporarily created socket */
2762 if (dropsocket) {
2763 (void) soabort(so);
2764 tcp_unlock(so, 1, 0);
2765 }
2766 else
2767 if ((inp != NULL) && (nosock == 0))
2768 tcp_unlock(so, 1, 0);
2769 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2770 return;
2771 dropnosock:
2772 nosock = 1;
2773 drop:
2774 /*
2775 * Drop space held by incoming segment and return.
2776 */
2777 #if TCPDEBUG
2778 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2779 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2780 &tcp_savetcp, 0);
2781 #endif
2782 m_freem(m);
2783 /* destroy temporarily created socket */
2784 if (dropsocket) {
2785 (void) soabort(so);
2786 tcp_unlock(so, 1, 0);
2787 }
2788 else
2789 if (nosock == 0)
2790 tcp_unlock(so, 1, 0);
2791 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2792 return;
2793 }
2794
/*
 * Parse TCP options and place in tcpopt.
 */
static void
tcp_dooptions(tp, cp, cnt, th, to)
	struct tcpcb *tp;
	u_char *cp;
	int cnt;
	struct tcphdr *th;
	struct tcpopt *to;
2805 {
2806 u_short mss = 0;
2807 int opt, optlen;
2808
2809 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2810 opt = cp[0];
2811 if (opt == TCPOPT_EOL)
2812 break;
2813 if (opt == TCPOPT_NOP)
2814 optlen = 1;
2815 else {
2816 if (cnt < 2)
2817 break;
2818 optlen = cp[1];
2819 if (optlen < 2 || optlen > cnt)
2820 break;
2821 }
2822 switch (opt) {
2823
2824 default:
2825 continue;
2826
2827 case TCPOPT_MAXSEG:
2828 if (optlen != TCPOLEN_MAXSEG)
2829 continue;
2830 if (!(th->th_flags & TH_SYN))
2831 continue;
2832 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
2833 NTOHS(mss);
2834 break;
2835
2836 case TCPOPT_WINDOW:
2837 if (optlen != TCPOLEN_WINDOW)
2838 continue;
2839 if (!(th->th_flags & TH_SYN))
2840 continue;
2841 tp->t_flags |= TF_RCVD_SCALE;
2842 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2843 break;
2844
2845 case TCPOPT_TIMESTAMP:
2846 if (optlen != TCPOLEN_TIMESTAMP)
2847 continue;
2848 to->to_flags |= TOF_TS;
2849 bcopy((char *)cp + 2,
2850 (char *)&to->to_tsval, sizeof(to->to_tsval));
2851 NTOHL(to->to_tsval);
2852 bcopy((char *)cp + 6,
2853 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2854 NTOHL(to->to_tsecr);
2855
2856 /*
2857 * A timestamp received in a SYN makes
2858 * it ok to send timestamp requests and replies.
2859 */
2860 if (th->th_flags & TH_SYN) {
2861 tp->t_flags |= TF_RCVD_TSTMP;
2862 tp->ts_recent = to->to_tsval;
2863 tp->ts_recent_age = tcp_now;
2864 }
2865 break;
2866 case TCPOPT_SACK_PERMITTED:
2867 if (!tcp_do_sack ||
2868 optlen != TCPOLEN_SACK_PERMITTED)
2869 continue;
2870 if (th->th_flags & TH_SYN)
2871 to->to_flags |= TOF_SACK;
2872 break;
2873 case TCPOPT_SACK:
2874 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2875 continue;
2876 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
2877 to->to_sacks = cp + 2;
2878 tcpstat.tcps_sack_rcv_blocks++;
2879
2880 break;
2881 }
2882 }
2883 if (th->th_flags & TH_SYN)
2884 tcp_mss(tp, mss); /* sets t_maxseg */
2885 }
2886
2887 /*
2888 * Pull out of band byte out of a segment so
2889 * it doesn't appear in the user's data queue.
2890 * It is still reflected in the segment length for
2891 * sequencing purposes.
2892 */
2893 static void
2894 tcp_pulloutofband(so, th, m, off)
2895 struct socket *so;
2896 struct tcphdr *th;
2897 register struct mbuf *m;
	int off;		/* delayed-to-be-dropped hdrlen */
2899 {
2900 int cnt = off + th->th_urp - 1;
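	/*
	 * cnt indexes the out-of-band byte within the mbuf chain: the
	 * still-undropped header (off bytes) plus the urgent offset,
	 * minus one because we treat the urgent pointer as pointing
	 * just past the urgent byte (the BSD interpretation noted
	 * above).
	 */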
2901
2902 while (cnt >= 0) {
2903 if (m->m_len > cnt) {
2904 char *cp = mtod(m, caddr_t) + cnt;
2905 struct tcpcb *tp = sototcpcb(so);
2906
2907 tp->t_iobc = *cp;
2908 tp->t_oobflags |= TCPOOB_HAVEDATA;
2909 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2910 m->m_len--;
2911 if (m->m_flags & M_PKTHDR)
2912 m->m_pkthdr.len--;
2913 return;
2914 }
2915 cnt -= m->m_len;
2916 m = m->m_next;
2917 if (m == 0)
2918 break;
2919 }
2920 panic("tcp_pulloutofband");
2921 }
2922
2923 /*
2924 * Collect new round-trip time estimate
2925 * and update averages and current timeout.
2926 */
2927 static void
2928 tcp_xmit_timer(tp, rtt)
2929 register struct tcpcb *tp;
2930 int rtt;
2931 {
2932 register int delta;
2933
2934 tcpstat.tcps_rttupdated++;
2935 tp->t_rttupdated++;
2936 if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
2944 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2945 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2946
2947 if ((tp->t_srtt += delta) <= 0)
2948 tp->t_srtt = 1;
2949
2950 /*
2951 * We accumulate a smoothed rtt variance (actually, a
2952 * smoothed mean difference), then set the retransmit
2953 * timer to smoothed rtt + 4 times the smoothed variance.
2954 * rttvar is stored as fixed point with 4 bits after the
2955 * binary point (scaled by 16). The following is
2956 * equivalent to rfc793 smoothing with an alpha of .75
2957 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2958 * rfc793's wired-in beta.
2959 */
2960 if (delta < 0)
2961 delta = -delta;
2962 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2963 if ((tp->t_rttvar += delta) <= 0)
2964 tp->t_rttvar = 1;
2965 } else {
2966 /*
2967 * No rtt measurement yet - use the unsmoothed rtt.
2968 * Set the variance to half the rtt (so our first
2969 * retransmit happens at 3*rtt).
2970 */
2971 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2972 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2973 }
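	/*
	 * Worked example, assuming the usual shift values
	 * (TCP_RTT_SHIFT = 5, TCP_DELTA_SHIFT = 2): with srtt = 320
	 * (10 ticks scaled by 32) and a new sample of 14 ticks,
	 *	delta = (14 - 1) * 4 - 320 / 8 = 12,
	 * so srtt becomes 332 (10.375 ticks): the estimate moves 1/8
	 * of the way from 10 toward the origin-adjusted sample of 13.
	 */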
2974 tp->t_rtttime = 0;
2975 tp->t_rxtshift = 0;
2976
2977 /*
	 * The retransmit should happen at rtt + 4 * rttvar.
2979 * Because of the way we do the smoothing, srtt and rttvar
2980 * will each average +1/2 tick of bias. When we compute
2981 * the retransmit timer, we want 1/2 tick of rounding and
2982 * 1 extra tick because of +-1/2 tick uncertainty in the
2983 * firing of the timer. The bias will give us exactly the
2984 * 1.5 tick we need. But, because the bias is
2985 * statistical, we have to test that we don't drop below
2986 * the minimum feasible timer (which is 2 ticks).
2987 */
2988 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2989 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2990
2991 /*
2992 * We received an ack for a packet that wasn't retransmitted;
2993 * it is probably safe to discard any error indications we've
2994 * received recently. This isn't quite right, but close enough
2995 * for now (a route might have failed after we sent a segment,
2996 * and the return path might not be symmetrical).
2997 */
2998 tp->t_softerror = 0;
2999 }
3000
3001 /*
3002 * Determine a reasonable value for maxseg size.
3003 * If the route is known, check route for mtu.
3004 * If none, use an mss that can be handled on the outgoing
3005 * interface without forcing IP to fragment; if bigger than
3006 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
3007 * to utilize large mbufs. If no route is found, route has no mtu,
3008 * or the destination isn't local, use a default, hopefully conservative
3009 * size (usually 512 or the default IP max size, but no more than the mtu
3010 * of the interface), as we can't discover anything about intervening
3011 * gateways or networks. We also initialize the congestion/slow start
3012 * window to be a single segment if the destination isn't local.
3013 * While looking at the routing entry, we also initialize other path-dependent
3014 * parameters from pre-set or cached values in the routing entry.
3015 *
3016 * Also take into account the space needed for options that we
3017 * send regularly. Make maxseg shorter by that amount to assure
3018 * that we can send maxseg amount of data even when the options
3019 * are present. Store the upper limit of the length of options plus
3020 * data in maxopd.
3021 *
3022 * NOTE that this routine is only called when we process an incoming
3023 * segment, for outgoing segments only tcp_mssopt is called.
3024 *
3025 * In case of T/TCP, we call this routine during implicit connection
3026 * setup as well (offer = -1), to initialize maxseg from the cached
3027 * MSS of our peer.
3028 */
3029 void
3030 tcp_mss(tp, offer)
3031 struct tcpcb *tp;
3032 int offer;
3033 {
3034 register struct rtentry *rt;
3035 struct ifnet *ifp;
3036 register int rtt, mss;
3037 u_long bufsize;
3038 struct inpcb *inp;
3039 struct socket *so;
3040 struct rmxp_tao *taop;
3041 int origoffer = offer;
3042 #if INET6
3043 int isipv6;
3044 int min_protoh;
3045 #endif
3046
3047 inp = tp->t_inpcb;
3048 #if INET6
3049 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3050 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
3051 : sizeof (struct tcpiphdr);
3052 #else
3053 #define min_protoh (sizeof (struct tcpiphdr))
3054 #endif
3055 #if INET6
3056 if (isipv6)
3057 rt = tcp_rtlookup6(inp);
3058 else
3059 #endif /* INET6 */
3060 rt = tcp_rtlookup(inp);
3061 if (rt == NULL) {
3062 tp->t_maxopd = tp->t_maxseg =
3063 #if INET6
3064 isipv6 ? tcp_v6mssdflt :
3065 #endif /* INET6 */
3066 tcp_mssdflt;
3067 return;
3068 }
3069 ifp = rt->rt_ifp;
3070 /*
3071 * Slower link window correction:
	 * If a value is specified for slowlink_wsize, use it for PPP links
	 * believed to be on a serial modem (speed < 128Kbps).  Excludes
	 * 9600bps, as it is the default value advertised by pseudo-devices
	 * over PPP.
3075 */
3076 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
3077 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
3078 tp->t_flags |= TF_SLOWLINK;
3079 }
3080 so = inp->inp_socket;
3081
3082 taop = rmx_taop(rt->rt_rmx);
3083 /*
3084 * Offer == -1 means that we didn't receive SYN yet,
3085 * use cached value in that case;
3086 */
3087 if (offer == -1)
3088 offer = taop->tao_mssopt;
3089 /*
3090 * Offer == 0 means that there was no MSS on the SYN segment,
3091 * in this case we use tcp_mssdflt.
3092 */
3093 if (offer == 0)
3094 offer =
3095 #if INET6
3096 isipv6 ? tcp_v6mssdflt :
3097 #endif /* INET6 */
3098 tcp_mssdflt;
3099 else {
3100 /*
3101 * Prevent DoS attack with too small MSS. Round up
3102 * to at least minmss.
3103 */
3104 offer = max(offer, tcp_minmss);
3105 /*
3106 * Sanity check: make sure that maxopd will be large
		 * enough to allow some data on segments even if all
		 * the option space is used (40 bytes).  Otherwise
3109 * funny things may happen in tcp_output.
3110 */
3111 offer = max(offer, 64);
3112 }
3113 taop->tao_mssopt = offer;
3114
3115 /*
3116 * While we're here, check if there's an initial rtt
3117 * or rttvar. Convert from the route-table units
3118 * to scaled multiples of the slow timeout timer.
3119 */
3120 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
3121 /*
3122 * XXX the lock bit for RTT indicates that the value
3123 * is also a minimum value; this is subject to time.
3124 */
3125 if (rt->rt_rmx.rmx_locks & RTV_RTT)
3126 tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ);
3127 tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
3128 tcpstat.tcps_usedrtt++;
3129 if (rt->rt_rmx.rmx_rttvar) {
3130 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
3131 (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
3132 tcpstat.tcps_usedrttvar++;
3133 } else {
3134 /* default variation is +- 1 rtt */
3135 tp->t_rttvar =
3136 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
3137 }
3138 TCPT_RANGESET(tp->t_rxtcur,
3139 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
3140 tp->t_rttmin, TCPTV_REXMTMAX);
3141 }
3142 /*
3143 * if there's an mtu associated with the route, use it
3144 * else, use the link mtu.
3145 */
3146 if (rt->rt_rmx.rmx_mtu)
3147 mss = rt->rt_rmx.rmx_mtu - min_protoh;
3148 else
3149 {
3150 mss =
3151 #if INET6
3152 (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
3153 #endif
3154 ifp->if_mtu
3155 #if INET6
3156 )
3157 #endif
3158 - min_protoh;
3159 #if INET6
3160 if (isipv6) {
3161 if (!in6_localaddr(&inp->in6p_faddr))
3162 mss = min(mss, tcp_v6mssdflt);
3163 } else
3164 #endif /* INET6 */
3165 if (!in_localaddr(inp->inp_faddr))
3166 mss = min(mss, tcp_mssdflt);
3167 }
3168 mss = min(mss, offer);
3169 /*
3170 * maxopd stores the maximum length of data AND options
3171 * in a segment; maxseg is the amount of data in a normal
3172 * segment. We need to store this value (maxopd) apart
3173 * from maxseg, because now every segment carries options
3174 * and thus we normally have somewhat less data in segments.
3175 */
3176 tp->t_maxopd = mss;
3177
3178 /*
	 * origoffer == -1 indicates that no segments were received yet.
	 * In this case we just guess.  (TCPOLEN_TSTAMP_APPA is the
	 * 10-byte timestamp option padded to 12 bytes for alignment.)
3181 */
3182 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3183 (origoffer == -1 ||
3184 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
3185 mss -= TCPOLEN_TSTAMP_APPA;
3186 tp->t_maxseg = mss;
3187
3188 /*
3189 * If there's a pipesize (ie loopback), change the socket
3190 * buffer to that size only if it's bigger than the current
3191 * sockbuf size. Make the socket buffers an integral
3192 * number of mss units; if the mss is larger than
3193 * the socket buffer, decrease the mss.
3194 */
3195 #if RTV_SPIPE
3196 bufsize = rt->rt_rmx.rmx_sendpipe;
3197 if (bufsize < so->so_snd.sb_hiwat)
3198 #endif
3199 bufsize = so->so_snd.sb_hiwat;
3200 if (bufsize < mss)
3201 mss = bufsize;
3202 else {
3203 bufsize = roundup(bufsize, mss);
3204 if (bufsize > sb_max)
3205 bufsize = sb_max;
3206 (void)sbreserve(&so->so_snd, bufsize);
3207 }
3208 tp->t_maxseg = mss;
3209
3210 #if RTV_RPIPE
3211 bufsize = rt->rt_rmx.rmx_recvpipe;
3212 if (bufsize < so->so_rcv.sb_hiwat)
3213 #endif
3214 bufsize = so->so_rcv.sb_hiwat;
3215 if (bufsize > mss) {
3216 bufsize = roundup(bufsize, mss);
3217 if (bufsize > sb_max)
3218 bufsize = sb_max;
3219 (void)sbreserve(&so->so_rcv, bufsize);
3220 }
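	/*
	 * Example of the rounding above: with mss = 1460 and a
	 * 32768-byte socket buffer, roundup(32768, 1460) = 33580
	 * (23 full segments), so the buffer is grown to hold an
	 * integral number of segments, subject to the sb_max cap.
	 */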
3221
3222 /*
3223 * Set the slow-start flight size depending on whether this
3224 * is a local network or not.
3225 */
3226 if (
3227 #if INET6
3228 (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
3229 (!isipv6 &&
3230 #endif
3231 in_localaddr(inp->inp_faddr)
3232 #if INET6
3233 )
3234 #endif
3235 )
3236 tp->snd_cwnd = mss * ss_fltsz_local;
3237 else
3238 tp->snd_cwnd = mss * ss_fltsz;
3239
3240 if (rt->rt_rmx.rmx_ssthresh) {
3241 /*
3242 * There's some sort of gateway or interface
3243 * buffer limit on the path. Use this to set
3244 * the slow start threshhold, but set the
3245 * threshold to no less than 2*mss.
3246 */
3247 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
3248 tcpstat.tcps_usedssthresh++;
3249 }
3250 }
3251
3252 /*
3253 * Determine the MSS option to send on an outgoing SYN.
3254 */
3255 int
3256 tcp_mssopt(tp)
3257 struct tcpcb *tp;
3258 {
3259 struct rtentry *rt;
3260 int mss;
3261 #if INET6
3262 int isipv6;
3263 int min_protoh;
3264 #endif
3265
3266 #if INET6
3267 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3268 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
3269 : sizeof (struct tcpiphdr);
3270 #else
3271 #define min_protoh (sizeof (struct tcpiphdr))
3272 #endif
3273 #if INET6
3274 if (isipv6)
3275 rt = tcp_rtlookup6(tp->t_inpcb);
3276 else
3277 #endif /* INET6 */
3278 rt = tcp_rtlookup(tp->t_inpcb);
3279 if (rt == NULL)
3280 return
3281 #if INET6
3282 isipv6 ? tcp_v6mssdflt :
3283 #endif /* INET6 */
3284 tcp_mssdflt;
3285 /*
3286 * Slower link window correction:
 * If a value is specified for slowlink_wsize, use it for PPP links
 * believed to be on a serial modem (speed < 128Kbps).  Excludes
 * 9600bps, as it is the default value advertised by pseudo-devices
 * over PPP.
3290 */
3291 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
3292 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
3293 tp->t_flags |= TF_SLOWLINK;
3294 }
3295
3296 if (rt->rt_rmx.rmx_mtu)
3297 mss = rt->rt_rmx.rmx_mtu;
3298 else {
3299 mss =
3300 #if INET6
3301 (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
3302 #endif
3303 rt->rt_ifp->if_mtu
3304 #if INET6
3305 );
3306 #endif
3307 }
3308 return (mss - min_protoh);
3309 }
3310
3311 /*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces the retransmission timer
 * to be started again.
3316 */
3317 static void
3318 tcp_newreno_partial_ack(tp, th)
3319 struct tcpcb *tp;
3320 struct tcphdr *th;
3321 {
3322 tcp_seq onxt = tp->snd_nxt;
3323 u_long ocwnd = tp->snd_cwnd;
3324 tp->t_timer[TCPT_REXMT] = 0;
3325 tp->t_rtttime = 0;
3326 tp->snd_nxt = th->th_ack;
3327 /*
3328 * Set snd_cwnd to one segment beyond acknowledged offset
3329 * (tp->snd_una has not yet been updated when this function
3330 * is called)
3331 */
3332 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3333 tp->t_flags |= TF_ACKNOW;
3334 (void) tcp_output(tp);
3335 tp->snd_cwnd = ocwnd;
3336 if (SEQ_GT(onxt, tp->snd_nxt))
3337 tp->snd_nxt = onxt;
3338 /*
3339 * Partial window deflation. Relies on fact that tp->snd_una
3340 * not updated yet.
3341 */
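	/*
	 * Example: if the partial ACK covers 4 * t_maxseg bytes of
	 * outstanding data, cwnd is reduced by 4 * t_maxseg - t_maxseg
	 * = 3 * t_maxseg, leaving roughly one extra segment's worth of
	 * headroom to send new data.
	 */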
3342 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
3343 }
3344
3345 /*
3346 * Drop a random TCP connection that hasn't been serviced yet and
3347 * is eligible for discard. There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * The listening TCP socket "head" must be locked.
3353 */
3354 static int
3355 tcpdropdropablreq(struct socket *head)
3356 {
3357 struct socket *so;
3358 unsigned int i, j, qlen;
3359 static int rnd;
3360 static struct timeval old_runtime;
3361 static unsigned int cur_cnt, old_cnt;
3362 struct timeval tv;
3363 struct inpcb *inp = NULL;
3364 struct tcpcb *tp;
3365
3366 microtime(&tv);
3367 if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
3368 old_runtime = tv;
3369 old_cnt = cur_cnt / i;
3370 cur_cnt = 0;
3371 }
3372
3373 so = TAILQ_FIRST(&head->so_incomp);
3374 if (!so)
3375 return 0;
3376
3377 qlen = head->so_incqlen;
3378 if (++cur_cnt > qlen || old_cnt > qlen) {
3379 rnd = (314159 * rnd + 66329) & 0xffff;
3380 j = ((qlen + 1) * rnd) >> 16;
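		/*
		 * rnd is a 16-bit linear congruential generator; scaling
		 * by (qlen + 1) and shifting right by 16 maps it to a
		 * roughly uniform index in [0, qlen], the number of
		 * entries to skip from the head of the incomplete queue.
		 */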
3381
3382 while (j-- && so)
3383 so = TAILQ_NEXT(so, so_list);
3384 }
3385 /* Find a connection that is not already closing */
3386 while (so) {
3387 inp = (struct inpcb *)so->so_pcb;
3388
3389 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING)
3390 break;
3391
3392 so = TAILQ_NEXT(so, so_list);
3393 }
3394 if (!so)
3395 return 0;
3396
3397 head->so_incqlen--;
3398 head->so_qlen--;
3399 TAILQ_REMOVE(&head->so_incomp, so, so_list);
3400 tcp_unlock(head, 0, 0);
3401
3402 /* Let's remove this connection from the incomplete list */
3403 tcp_lock(so, 1, 0);
3404
3405 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
3406 tcp_unlock(so, 1, 0);
3407 return 0;
3408 }
3409
3410 so->so_head = NULL;
3411 so->so_usecount--; /* No more held by so_head */
3412
3413 /*
3414 * We do not want to lose track of the PCB right away in case we receive
3415 * more segments from the peer
3416 */
3417 tp = sototcpcb(so);
3418 tp->t_flags |= TF_LQ_OVERFLOW;
3419 tp->t_state = TCPS_CLOSED;
3420 (void) tcp_output(tp);
3421 tcpstat.tcps_drops++;
3422 soisdisconnected(so);
3423 tcp_canceltimers(tp);
3424 add_to_time_wait(tp);
3425
3426 tcp_unlock(so, 1, 0);
3427 tcp_lock(head, 0, 0);
3428
3429 return 1;
3430
3431 }
3432
3433 static int
3434 tcp_getstat SYSCTL_HANDLER_ARGS
3435 {
3436
3437 int error;
3438
3439 if (req->oldptr == 0) {
3440 req->oldlen= (size_t)sizeof(struct tcpstat);
3441 }
3442
3443 error = SYSCTL_OUT(req, &tcpstat, (size_t) req->oldlen);
3444
3445 return (error);
3446
3447 }
3448
3449 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, 0, 0,
3450 tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
3451
3452