]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/tcp_input.c
459ef641c6febbba817f99df406e18bb6da01702
[apple/xnu.git] / bsd / netinet / tcp_input.c
1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
56 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
57 */
58
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/kernel.h>
63 #include <sys/sysctl.h>
64 #include <sys/malloc.h>
65 #include <sys/mbuf.h>
66 #include <sys/proc.h> /* for proc0 declaration */
67 #include <sys/protosw.h>
68 #include <sys/socket.h>
69 #include <sys/socketvar.h>
70 #include <sys/syslog.h>
71
72 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
73
74 #include <net/if.h>
75 #include <net/if_types.h>
76 #include <net/route.h>
77
78 #include <netinet/in.h>
79 #include <netinet/in_systm.h>
80 #include <netinet/ip.h>
81 #include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
82 #include <netinet/in_var.h>
83 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
84 #include <netinet/in_pcb.h>
85 #include <netinet/ip_var.h>
86 #if INET6
87 #include <netinet/ip6.h>
88 #include <netinet/icmp6.h>
89 #include <netinet6/nd6.h>
90 #include <netinet6/ip6_var.h>
91 #include <netinet6/in6_pcb.h>
92 #endif
93 #include <netinet/tcp.h>
94 #include <netinet/tcp_fsm.h>
95 #include <netinet/tcp_seq.h>
96 #include <netinet/tcp_timer.h>
97 #include <netinet/tcp_var.h>
98 #if INET6
99 #include <netinet6/tcp6_var.h>
100 #endif
101 #include <netinet/tcpip.h>
102 #if TCPDEBUG
103 #include <netinet/tcp_debug.h>
104 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
105 struct tcphdr tcp_savetcp;
106 #endif /* TCPDEBUG */
107
108 #if IPSEC
109 #include <netinet6/ipsec.h>
110 #if INET6
111 #include <netinet6/ipsec6.h>
112 #endif
113 #include <netkey/key.h>
114 #endif /*IPSEC*/
115
116 #include <sys/kdebug.h>
117
118 #ifndef __APPLE__
119 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
120 #endif
121
122 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
123 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
124 #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
125 #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
126
127 static int tcprexmtthresh = 3;
128 tcp_cc tcp_ccgen;
129 extern int apple_hwcksum_rx;
130
131 #if IPSEC
132 extern int ipsec_bypass;
133 extern lck_mtx_t *sadb_mutex;
134 #endif
135
136 struct tcpstat tcpstat;
137 SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD,
138 &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
139
140 static int log_in_vain = 0;
141 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
142 &log_in_vain, 0, "Log all incoming TCP connections");
143
144 static int blackhole = 0;
145 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
146 &blackhole, 0, "Do not send RST when dropping refused connections");
147
148 int tcp_delack_enabled = 1;
149 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
150 &tcp_delack_enabled, 0,
151 "Delay ACK to try and piggyback it onto a data packet");
152
153 int tcp_lq_overflow = 1;
154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW,
155 &tcp_lq_overflow, 0,
156 "Listen Queue Overflow");
157
158 #if TCP_DROP_SYNFIN
159 static int drop_synfin = 1;
160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
161 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
162 #endif
163
164 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
165 "TCP Segment Reassembly Queue");
166
167 __private_extern__ int tcp_reass_maxseg = 0;
168 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW,
169 &tcp_reass_maxseg, 0,
170 "Global maximum number of TCP Segments in Reassembly Queue");
171
172 __private_extern__ int tcp_reass_qsize = 0;
173 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
174 &tcp_reass_qsize, 0,
175 "Global number of TCP Segments currently in Reassembly Queue");
176
177 static int tcp_reass_overflows = 0;
178 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
179 &tcp_reass_overflows, 0,
180 "Global number of TCP Segment Reassembly Queue Overflows");
181
182
183 __private_extern__ int slowlink_wsize = 8192;
184 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW,
185 &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
186
187
188 u_long tcp_now;
189 struct inpcbhead tcb;
190 #define tcb6 tcb /* for KAME src sync over BSD*'s */
191 struct inpcbinfo tcbinfo;
192
193 static void tcp_dooptions(struct tcpcb *,
194 u_char *, int, struct tcphdr *, struct tcpopt *);
195 static void tcp_pulloutofband(struct socket *,
196 struct tcphdr *, struct mbuf *, int);
197 static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
198 struct mbuf *);
199 static void tcp_xmit_timer(struct tcpcb *, int);
200 static int tcp_newreno __P((struct tcpcb *, struct tcphdr *));
201
202 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
203 #if INET6
204 #define ND6_HINT(tp) \
205 do { \
206 if ((tp) && (tp)->t_inpcb && \
207 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
208 (tp)->t_inpcb->in6p_route.ro_rt) \
209 nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
210 } while (0)
211 #else
212 #define ND6_HINT(tp)
213 #endif
214
215 extern u_long *delack_bitmask;
216
217 extern void ipfwsyslog( int level, char *format,...);
218 extern int ChkAddressOK( __uint32_t dstaddr, __uint32_t srcaddr );
219 extern int fw_verbose;
220
221 #define log_in_vain_log( a ) { \
222 if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \
223 ipfwsyslog a ; \
224 } \
225 else log a ; \
226 }
227
228 /*
229 * Indicate whether this ack should be delayed.
230 * We can delay the ack if:
231 * - delayed acks are enabled (set to 1) and
232 * - our last ack wasn't a 0-sized window. We never want to delay
233 * the ack that opens up a 0-sized window.
234 * - delayed acks are enabled (set to 2, "more compatible") and
235 * - our last ack wasn't a 0-sized window.
236 * - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245)
237 * - the peer hasn't sent us a TH_PUSH data packet, if he did, take this as a clue that we
238 * need to ACK with no delay. This helps higher level protocols who won't send
239 * us more data even if the window is open because their last "segment" hasn't been ACKed
240 *
241 *
242 */
243 #define DELAY_ACK(tp) \
244 (((tcp_delack_enabled == 1) && ((tp->t_flags & TF_RXWIN0SENT) == 0)) || \
245 (((tcp_delack_enabled == 2) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \
246 ((thflags & TH_PUSH) == 0) && ((tp->t_flags & TF_DELACK) == 0)))
247
248
249 static int tcpdropdropablreq(struct socket *head);
250
251
252 static int
253 tcp_reass(tp, th, tlenp, m)
254 register struct tcpcb *tp;
255 register struct tcphdr *th;
256 int *tlenp;
257 struct mbuf *m;
258 {
259 struct tseg_qent *q;
260 struct tseg_qent *p = NULL;
261 struct tseg_qent *nq;
262 struct tseg_qent *te;
263 struct socket *so = tp->t_inpcb->inp_socket;
264 int flags;
265 int dowakeup = 0;
266
267 /*
268 * Call with th==0 after become established to
269 * force pre-ESTABLISHED data up to user socket.
270 */
271 if (th == 0)
272 goto present;
273
274 /*
275 * Limit the number of segments in the reassembly queue to prevent
276 * holding on to too many segments (and thus running out of mbufs).
277 * Make sure to let the missing segment through which caused this
278 * queue. Always keep one global queue entry spare to be able to
279 * process the missing segment.
280 */
281 if (th->th_seq != tp->rcv_nxt &&
282 tcp_reass_qsize + 1 >= tcp_reass_maxseg) {
283 tcp_reass_overflows++;
284 tcpstat.tcps_rcvmemdrop++;
285 m_freem(m);
286 return (0);
287 }
288
289 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
290 MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
291 M_NOWAIT);
292 if (te == NULL) {
293 tcpstat.tcps_rcvmemdrop++;
294 m_freem(m);
295 return (0);
296 }
297 tcp_reass_qsize++;
298
299 /*
300 * Find a segment which begins after this one does.
301 */
302 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
303 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
304 break;
305 p = q;
306 }
307
308 /*
309 * If there is a preceding segment, it may provide some of
310 * our data already. If so, drop the data from the incoming
311 * segment. If it provides all of our data, drop us.
312 */
313 if (p != NULL) {
314 register int i;
315 /* conversion to int (in i) handles seq wraparound */
316 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
317 if (i > 0) {
318 if (i >= *tlenp) {
319 tcpstat.tcps_rcvduppack++;
320 tcpstat.tcps_rcvdupbyte += *tlenp;
321 m_freem(m);
322 FREE(te, M_TSEGQ);
323 tcp_reass_qsize--;
324 /*
325 * Try to present any queued data
326 * at the left window edge to the user.
327 * This is needed after the 3-WHS
328 * completes.
329 */
330 goto present; /* ??? */
331 }
332 m_adj(m, i);
333 *tlenp -= i;
334 th->th_seq += i;
335 }
336 }
337 tcpstat.tcps_rcvoopack++;
338 tcpstat.tcps_rcvoobyte += *tlenp;
339
340 /*
341 * While we overlap succeeding segments trim them or,
342 * if they are completely covered, dequeue them.
343 */
344 while (q) {
345 register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
346 if (i <= 0)
347 break;
348 if (i < q->tqe_len) {
349 q->tqe_th->th_seq += i;
350 q->tqe_len -= i;
351 m_adj(q->tqe_m, i);
352 break;
353 }
354
355 nq = LIST_NEXT(q, tqe_q);
356 LIST_REMOVE(q, tqe_q);
357 m_freem(q->tqe_m);
358 FREE(q, M_TSEGQ);
359 tcp_reass_qsize--;
360 q = nq;
361 }
362
363 /* Insert the new segment queue entry into place. */
364 te->tqe_m = m;
365 te->tqe_th = th;
366 te->tqe_len = *tlenp;
367
368 if (p == NULL) {
369 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
370 } else {
371 LIST_INSERT_AFTER(p, te, tqe_q);
372 }
373
374 present:
375 /*
376 * Present data to user, advancing rcv_nxt through
377 * completed sequence space.
378 */
379 if (!TCPS_HAVEESTABLISHED(tp->t_state))
380 return (0);
381 q = LIST_FIRST(&tp->t_segq);
382 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
383 return (0);
384 do {
385 tp->rcv_nxt += q->tqe_len;
386 flags = q->tqe_th->th_flags & TH_FIN;
387 nq = LIST_NEXT(q, tqe_q);
388 LIST_REMOVE(q, tqe_q);
389 if (so->so_state & SS_CANTRCVMORE)
390 m_freem(q->tqe_m);
391 else {
392 if (sbappend(&so->so_rcv, q->tqe_m))
393 dowakeup = 1;
394 }
395 FREE(q, M_TSEGQ);
396 tcp_reass_qsize--;
397 q = nq;
398 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
399 ND6_HINT(tp);
400
401 #if INET6
402 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
403
404 KERNEL_DEBUG(DBG_LAYER_BEG,
405 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
406 (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
407 (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
408 0,0,0);
409 }
410 else
411 #endif
412 {
413 KERNEL_DEBUG(DBG_LAYER_BEG,
414 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
415 (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
416 (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
417 0,0,0);
418 }
419 if (dowakeup)
420 sorwakeup(so); /* done with socket lock held */
421 return (flags);
422
423 }
424
425
426 /*
427 * TCP input routine, follows pages 65-76 of the
428 * protocol specification dated September, 1981 very closely.
429 */
430 #if INET6
431 int
432 tcp6_input(mp, offp)
433 struct mbuf **mp;
434 int *offp;
435 {
436 register struct mbuf *m = *mp;
437 struct in6_ifaddr *ia6;
438
439 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
440
441 /*
442 * draft-itojun-ipv6-tcp-to-anycast
443 * better place to put this in?
444 */
445 ia6 = ip6_getdstifaddr(m);
446 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
447 struct ip6_hdr *ip6;
448
449 ip6 = mtod(m, struct ip6_hdr *);
450 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
451 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
452 return IPPROTO_DONE;
453 }
454
455 tcp_input(m, *offp);
456 return IPPROTO_DONE;
457 }
458 #endif
459
460 void
461 tcp_input(m, off0)
462 struct mbuf *m;
463 int off0;
464 {
465 register struct tcphdr *th;
466 register struct ip *ip = NULL;
467 register struct ipovly *ipov;
468 register struct inpcb *inp;
469 u_char *optp = NULL;
470 int optlen = 0;
471 int len, tlen, off;
472 int drop_hdrlen;
473 register struct tcpcb *tp = 0;
474 register int thflags;
475 struct socket *so = 0;
476 int todrop, acked, ourfinisacked, needoutput = 0;
477 struct in_addr laddr;
478 #if INET6
479 struct in6_addr laddr6;
480 #endif
481 int dropsocket = 0;
482 int iss = 0;
483 int nosock = 0;
484 u_long tiwin;
485 struct tcpopt to; /* options in this segment */
486 struct rmxp_tao *taop; /* pointer to our TAO cache entry */
487 struct rmxp_tao tao_noncached; /* in case there's no cached entry */
488 struct sockaddr_in *next_hop = NULL;
489 #if TCPDEBUG
490 short ostate = 0;
491 #endif
492 struct m_tag *fwd_tag;
493
494 /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
495 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL);
496 if (fwd_tag != NULL) {
497 struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
498
499 next_hop = ipfwd_tag->next_hop;
500 m_tag_delete(m, fwd_tag);
501 }
502
503 #if INET6
504 struct ip6_hdr *ip6 = NULL;
505 int isipv6;
506 #endif /* INET6 */
507 int rstreason; /* For badport_bandlim accounting purposes */
508 struct proc *proc0=current_proc();
509
510 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
511
512 #if INET6
513 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
514 #endif
515 bzero((char *)&to, sizeof(to));
516
517 tcpstat.tcps_rcvtotal++;
518
519
520
521 #if INET6
522 if (isipv6) {
523 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
524 ip6 = mtod(m, struct ip6_hdr *);
525 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
526 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
527 tcpstat.tcps_rcvbadsum++;
528 goto dropnosock;
529 }
530 th = (struct tcphdr *)((caddr_t)ip6 + off0);
531
532 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
533 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
534 th->th_seq, th->th_ack, th->th_win);
535 /*
536 * Be proactive about unspecified IPv6 address in source.
537 * As we use all-zero to indicate unbounded/unconnected pcb,
538 * unspecified IPv6 address can be used to confuse us.
539 *
540 * Note that packets with unspecified IPv6 destination is
541 * already dropped in ip6_input.
542 */
543 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
544 /* XXX stat */
545 goto dropnosock;
546 }
547 } else
548 #endif /* INET6 */
549 {
550 /*
551 * Get IP and TCP header together in first mbuf.
552 * Note: IP leaves IP header in first mbuf.
553 */
554 if (off0 > sizeof (struct ip)) {
555 ip_stripoptions(m, (struct mbuf *)0);
556 off0 = sizeof(struct ip);
557 if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)
558 m->m_pkthdr.csum_flags = 0; /* invalidate hwcksuming */
559
560 }
561 if (m->m_len < sizeof (struct tcpiphdr)) {
562 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
563 tcpstat.tcps_rcvshort++;
564 return;
565 }
566 }
567 ip = mtod(m, struct ip *);
568 ipov = (struct ipovly *)ip;
569 th = (struct tcphdr *)((caddr_t)ip + off0);
570 tlen = ip->ip_len;
571
572 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
573 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
574 th->th_seq, th->th_ack, th->th_win);
575
576 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
577 if (apple_hwcksum_rx && (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)) {
578 u_short pseudo;
579 char b[9];
580 *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
581 *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
582 *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];
583
584 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
585 ipov->ih_len = (u_short)tlen;
586 HTONS(ipov->ih_len);
587 pseudo = in_cksum(m, sizeof (struct ip));
588
589 *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
590 *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
591 *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];
592
593 th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
594 } else {
595 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
596 th->th_sum = m->m_pkthdr.csum_data;
597 else
598 th->th_sum = in_pseudo(ip->ip_src.s_addr,
599 ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
600 ip->ip_len + IPPROTO_TCP));
601 }
602 th->th_sum ^= 0xffff;
603 } else {
604 char b[9];
605 /*
606 * Checksum extended TCP header and data.
607 */
608 *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
609 *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
610 *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];
611
612 len = sizeof (struct ip) + tlen;
613 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
614 ipov->ih_len = (u_short)tlen;
615 HTONS(ipov->ih_len);
616 th->th_sum = in_cksum(m, len);
617
618 *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
619 *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
620 *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];
621 }
622 if (th->th_sum) {
623 tcpstat.tcps_rcvbadsum++;
624 goto dropnosock;
625 }
626 #if INET6
627 /* Re-initialization for later version check */
628 ip->ip_v = IPVERSION;
629 #endif
630 }
631
632 /*
633 * Check that TCP offset makes sense,
634 * pull out TCP options and adjust length. XXX
635 */
636 off = th->th_off << 2;
637 if (off < sizeof (struct tcphdr) || off > tlen) {
638 tcpstat.tcps_rcvbadoff++;
639 goto dropnosock;
640 }
641 tlen -= off; /* tlen is used instead of ti->ti_len */
642 if (off > sizeof (struct tcphdr)) {
643 #if INET6
644 if (isipv6) {
645 IP6_EXTHDR_CHECK(m, off0, off, return);
646 ip6 = mtod(m, struct ip6_hdr *);
647 th = (struct tcphdr *)((caddr_t)ip6 + off0);
648 } else
649 #endif /* INET6 */
650 {
651 if (m->m_len < sizeof(struct ip) + off) {
652 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
653 tcpstat.tcps_rcvshort++;
654 return;
655 }
656 ip = mtod(m, struct ip *);
657 ipov = (struct ipovly *)ip;
658 th = (struct tcphdr *)((caddr_t)ip + off0);
659 }
660 }
661 optlen = off - sizeof (struct tcphdr);
662 optp = (u_char *)(th + 1);
663 /*
664 * Do quick retrieval of timestamp options ("options
665 * prediction?"). If timestamp is the only option and it's
666 * formatted as recommended in RFC 1323 appendix A, we
667 * quickly get the values now and not bother calling
668 * tcp_dooptions(), etc.
669 */
670 if ((optlen == TCPOLEN_TSTAMP_APPA ||
671 (optlen > TCPOLEN_TSTAMP_APPA &&
672 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
673 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
674 (th->th_flags & TH_SYN) == 0) {
675 to.to_flag |= TOF_TS;
676 to.to_tsval = ntohl(*(u_int32_t *)(optp + 4));
677 to.to_tsecr = ntohl(*(u_int32_t *)(optp + 8));
678 optp = NULL; /* we've parsed the options */
679 }
680 }
681 thflags = th->th_flags;
682
683 #if TCP_DROP_SYNFIN
684 /*
685 * If the drop_synfin option is enabled, drop all packets with
686 * both the SYN and FIN bits set. This prevents e.g. nmap from
687 * identifying the TCP/IP stack.
688 *
689 * This is incompatible with RFC1644 extensions (T/TCP).
690 */
691 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
692 goto dropnosock;
693 #endif
694
695 /*
696 * Convert TCP protocol specific fields to host format.
697 */
698 NTOHL(th->th_seq);
699 NTOHL(th->th_ack);
700 NTOHS(th->th_win);
701 NTOHS(th->th_urp);
702
703 /*
704 * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options,
705 * until after ip6_savecontrol() is called and before other functions
706 * which don't want those proto headers.
707 * Because ip6_savecontrol() is going to parse the mbuf to
708 * search for data to be passed up to user-land, it wants mbuf
709 * parameters to be unchanged.
710 */
711 drop_hdrlen = off0 + off;
712
713 /*
714 * Locate pcb for segment.
715 */
716 findpcb:
717 #if IPFIREWALL_FORWARD
718 if (next_hop != NULL
719 #if INET6
720 && isipv6 == NULL /* IPv6 support is not yet */
721 #endif /* INET6 */
722 ) {
723 /*
724 * Diverted. Pretend to be the destination.
725 * already got one like this?
726 */
727 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
728 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
729 if (!inp) {
730 /*
731 * No, then it's new. Try find the ambushing socket
732 */
733 if (!next_hop->sin_port) {
734 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
735 th->th_sport, next_hop->sin_addr,
736 th->th_dport, 1, m->m_pkthdr.rcvif);
737 } else {
738 inp = in_pcblookup_hash(&tcbinfo,
739 ip->ip_src, th->th_sport,
740 next_hop->sin_addr,
741 ntohs(next_hop->sin_port), 1,
742 m->m_pkthdr.rcvif);
743 }
744 }
745 } else
746 #endif /* IPFIREWALL_FORWARD */
747 {
748 #if INET6
749 if (isipv6)
750 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
751 &ip6->ip6_dst, th->th_dport, 1,
752 m->m_pkthdr.rcvif);
753 else
754 #endif /* INET6 */
755 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
756 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
757 }
758
759 #if IPSEC
760 if (ipsec_bypass == 0) {
761 lck_mtx_lock(sadb_mutex);
762 #if INET6
763 if (isipv6) {
764 if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
765 ipsec6stat.in_polvio++;
766 lck_mtx_unlock(sadb_mutex);
767 goto dropnosock;
768 }
769 } else
770 #endif /* INET6 */
771 if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
772 ipsecstat.in_polvio++;
773 lck_mtx_unlock(sadb_mutex);
774 goto dropnosock;
775 }
776 lck_mtx_unlock(sadb_mutex);
777 }
778 #endif /*IPSEC*/
779
780 /*
781 * If the state is CLOSED (i.e., TCB does not exist) then
782 * all data in the incoming segment is discarded.
783 * If the TCB exists but is in CLOSED state, it is embryonic,
784 * but should either do a listen or a connect soon.
785 */
786 if (inp == NULL) {
787 if (log_in_vain) {
788 #if INET6
789 char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
790 #else /* INET6 */
791 char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
792 #endif /* INET6 */
793
794 #if INET6
795 if (isipv6) {
796 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
797 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
798 } else
799 #endif
800 {
801 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
802 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
803 }
804 switch (log_in_vain) {
805 case 1:
806 if(thflags & TH_SYN)
807 log(LOG_INFO,
808 "Connection attempt to TCP %s:%d from %s:%d\n",
809 dbuf, ntohs(th->th_dport),
810 sbuf,
811 ntohs(th->th_sport));
812 break;
813 case 2:
814 log(LOG_INFO,
815 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
816 dbuf, ntohs(th->th_dport), sbuf,
817 ntohs(th->th_sport), thflags);
818 break;
819 case 3:
820 if ((thflags & TH_SYN) &&
821 !(m->m_flags & (M_BCAST | M_MCAST)) &&
822 #if INET6
823 ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
824 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
825 #else
826 ip->ip_dst.s_addr != ip->ip_src.s_addr
827 #endif
828 )
829 log_in_vain_log((LOG_INFO,
830 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
831 dbuf, ntohs(th->th_dport),
832 sbuf,
833 ntohs(th->th_sport)));
834 break;
835 default:
836 break;
837 }
838 }
839 if (blackhole) {
840 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
841 switch (blackhole) {
842 case 1:
843 if (thflags & TH_SYN)
844 goto dropnosock;
845 break;
846 case 2:
847 goto dropnosock;
848 default:
849 goto dropnosock;
850 }
851 }
852 rstreason = BANDLIM_RST_CLOSEDPORT;
853 goto dropwithresetnosock;
854 }
855 so = inp->inp_socket;
856 if (so == NULL) {
857 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
858 inp = NULL; // pretend we didn't find it
859 #if TEMPDEBUG
860 printf("tcp_input: no more socket for inp=%x\n", inp);
861 #endif
862 goto dropnosock;
863 }
864 tcp_lock(so, 1, 2);
865 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
866 tcp_unlock(so, 1, 2);
867 inp = NULL; // pretend we didn't find it
868 goto dropnosock;
869 }
870
871 tp = intotcpcb(inp);
872 if (tp == 0) {
873 rstreason = BANDLIM_RST_CLOSEDPORT;
874 goto dropwithreset;
875 }
876 if (tp->t_state == TCPS_CLOSED)
877 goto drop;
878
879 #ifdef __APPLE__
880 /*
881 * Bogus state when listening port owned by SharedIP with loopback as the
882 * only configured interface: BlueBox does not filters loopback
883 */
884 if (tp->t_state == TCP_NSTATES)
885 goto drop;
886 #endif
887
888 /* Unscale the window into a 32-bit value. */
889 if ((thflags & TH_SYN) == 0)
890 tiwin = th->th_win << tp->snd_scale;
891 else
892 tiwin = th->th_win;
893
894 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
895 #if TCPDEBUG
896 if (so->so_options & SO_DEBUG) {
897 ostate = tp->t_state;
898 #if INET6
899 if (isipv6)
900 bcopy((char *)ip6, (char *)tcp_saveipgen,
901 sizeof(*ip6));
902 else
903 #endif /* INET6 */
904 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
905 tcp_savetcp = *th;
906 }
907 #endif
908 if (so->so_options & SO_ACCEPTCONN) {
909 register struct tcpcb *tp0 = tp;
910 struct socket *so2;
911 struct socket *oso;
912 struct sockaddr_storage from;
913 #if INET6
914 struct inpcb *oinp = sotoinpcb(so);
915 #endif /* INET6 */
916 int ogencnt = so->so_gencnt;
917
918 #if !IPSEC
919 /*
920 * Current IPsec implementation makes incorrect IPsec
921 * cache if this check is done here.
922 * So delay this until duplicated socket is created.
923 */
924 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
925 /*
926 * Note: dropwithreset makes sure we don't
927 * send a RST in response to a RST.
928 */
929 if (thflags & TH_ACK) {
930 tcpstat.tcps_badsyn++;
931 rstreason = BANDLIM_RST_OPENPORT;
932 goto dropwithreset;
933 }
934 goto drop;
935 }
936 #endif
937 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
938
939 #if INET6
940 /*
941 * If deprecated address is forbidden,
942 * we do not accept SYN to deprecated interface
943 * address to prevent any new inbound connection from
944 * getting established.
945 * When we do not accept SYN, we send a TCP RST,
946 * with deprecated source address (instead of dropping
947 * it). We compromise it as it is much better for peer
948 * to send a RST, and RST will be the final packet
949 * for the exchange.
950 *
951 * If we do not forbid deprecated addresses, we accept
952 * the SYN packet. RFC2462 does not suggest dropping
953 * SYN in this case.
954 * If we decipher RFC2462 5.5.4, it says like this:
955 * 1. use of deprecated addr with existing
956 * communication is okay - "SHOULD continue to be
957 * used"
958 * 2. use of it with new communication:
959 * (2a) "SHOULD NOT be used if alternate address
960 * with sufficient scope is available"
961 * (2b) nothing mentioned otherwise.
962 * Here we fall into (2b) case as we have no choice in
963 * our source address selection - we must obey the peer.
964 *
965 * The wording in RFC2462 is confusing, and there are
966 * multiple description text for deprecated address
967 * handling - worse, they are not exactly the same.
968 * I believe 5.5.4 is the best one, so we follow 5.5.4.
969 */
970 if (isipv6 && !ip6_use_deprecated) {
971 struct in6_ifaddr *ia6;
972
973 if ((ia6 = ip6_getdstifaddr(m)) &&
974 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
975 tp = NULL;
976 rstreason = BANDLIM_RST_OPENPORT;
977 goto dropwithreset;
978 }
979 }
980 #endif
981 if (so->so_filt) {
982 if (isipv6) {
983 struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from;
984
985 sin6->sin6_len = sizeof(*sin6);
986 sin6->sin6_family = AF_INET6;
987 sin6->sin6_port = th->th_sport;
988 sin6->sin6_flowinfo = 0;
989 sin6->sin6_addr = ip6->ip6_src;
990 sin6->sin6_scope_id = 0;
991 } else {
992 struct sockaddr_in *sin = (struct sockaddr_in*)&from;
993
994 sin->sin_len = sizeof(*sin);
995 sin->sin_family = AF_INET;
996 sin->sin_port = th->th_sport;
997 sin->sin_addr = ip->ip_src;
998 }
999 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
1000 } else {
1001 so2 = sonewconn(so, 0, NULL);
1002 }
1003 if (so2 == 0) {
1004 tcpstat.tcps_listendrop++;
1005 if (tcpdropdropablreq(so)) {
1006 if (so->so_filt)
1007 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
1008 else
1009 so2 = sonewconn(so, 0, NULL);
1010 }
1011 if (!so2)
1012 goto drop;
1013 }
1014 /*
1015 * Make sure listening socket did not get closed during socket allocation,
1016 * not only this is incorrect but it is know to cause panic
1017 */
1018 if (so->so_gencnt != ogencnt)
1019 goto drop;
1020
1021 oso = so;
1022 tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
1023
1024 so = so2;
1025 tcp_lock(so, 1, 0);
1026 /*
1027 * This is ugly, but ....
1028 *
1029 * Mark socket as temporary until we're
1030 * committed to keeping it. The code at
1031 * ``drop'' and ``dropwithreset'' check the
1032 * flag dropsocket to see if the temporary
1033 * socket created here should be discarded.
1034 * We mark the socket as discardable until
1035 * we're committed to it below in TCPS_LISTEN.
1036 */
1037 dropsocket++;
1038 inp = (struct inpcb *)so->so_pcb;
1039 #if INET6
1040 if (isipv6)
1041 inp->in6p_laddr = ip6->ip6_dst;
1042 else {
1043 inp->inp_vflag &= ~INP_IPV6;
1044 inp->inp_vflag |= INP_IPV4;
1045 #endif /* INET6 */
1046 inp->inp_laddr = ip->ip_dst;
1047 #if INET6
1048 }
1049 #endif /* INET6 */
1050 inp->inp_lport = th->th_dport;
1051 if (in_pcbinshash(inp, 0) != 0) {
1052 /*
1053 * Undo the assignments above if we failed to
1054 * put the PCB on the hash lists.
1055 */
1056 #if INET6
1057 if (isipv6)
1058 inp->in6p_laddr = in6addr_any;
1059 else
1060 #endif /* INET6 */
1061 inp->inp_laddr.s_addr = INADDR_ANY;
1062 inp->inp_lport = 0;
1063 tcp_lock(oso, 0, 0); /* release ref on parent */
1064 tcp_unlock(oso, 1, 0);
1065 goto drop;
1066 }
1067 #if IPSEC
1068 /*
1069 * To avoid creating incorrectly cached IPsec
1070 * association, this needs to be done here.
1071 *
1072 * Subject: (KAME-snap 748)
1073 * From: Wayne Knowles <w.knowles@niwa.cri.nz>
1074 * ftp://ftp.kame.net/pub/mail-list/snap-users/748
1075 */
1076 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1077 /*
1078 * Note: dropwithreset makes sure we don't
1079 * send a RST in response to a RST.
1080 */
1081 tcp_lock(oso, 0, 0); /* release ref on parent */
1082 tcp_unlock(oso, 1, 0);
1083 if (thflags & TH_ACK) {
1084 tcpstat.tcps_badsyn++;
1085 rstreason = BANDLIM_RST_OPENPORT;
1086 goto dropwithreset;
1087 }
1088 goto drop;
1089 }
1090 #endif
1091 #if INET6
1092 if (isipv6) {
1093 /*
1094 * Inherit socket options from the listening
1095 * socket.
1096 * Note that in6p_inputopts are not (even
1097 * should not be) copied, since it stores
1098 * previously received options and is used to
1099 * detect if each new option is different than
1100 * the previous one and hence should be passed
1101 * to a user.
1102 * If we copied in6p_inputopts, a user would
1103 * not be able to receive options just after
1104 * calling the accept system call.
1105 */
1106 inp->inp_flags |=
1107 oinp->inp_flags & INP_CONTROLOPTS;
1108 if (oinp->in6p_outputopts)
1109 inp->in6p_outputopts =
1110 ip6_copypktopts(oinp->in6p_outputopts,
1111 M_NOWAIT);
1112 } else
1113 #endif /* INET6 */
1114 inp->inp_options = ip_srcroute();
1115 tcp_lock(oso, 0, 0);
1116 #if IPSEC
1117 /* copy old policy into new socket's */
1118 if (sotoinpcb(oso)->inp_sp)
1119 {
1120 int error = 0;
1121 lck_mtx_lock(sadb_mutex);
1122 /* Is it a security hole here to silently fail to copy the policy? */
1123 if (inp->inp_sp != NULL)
1124 error = ipsec_init_policy(so, &inp->inp_sp);
1125 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
1126 printf("tcp_input: could not copy policy\n");
1127 lck_mtx_unlock(sadb_mutex);
1128 }
1129 #endif
1130 tcp_unlock(oso, 1, 0); /* now drop the reference on the listener */
1131 tp = intotcpcb(inp);
1132 tp->t_state = TCPS_LISTEN;
1133 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
1134 tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
1135 /* Compute proper scaling value from buffer space */
1136 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1137 TCP_MAXWIN << tp->request_r_scale <
1138 so->so_rcv.sb_hiwat)
1139 tp->request_r_scale++;
1140
1141 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
1142 }
1143 }
1144
1145 #if 1
1146 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1147 #endif
1148 /*
1149 * Radar 3529618
1150 * This is the second part of the MSS DoS prevention code (after
1151 * minmss on the sending side) and it deals with too many too small
1152 * tcp packets in a too short timeframe (1 second).
1153 *
1154 * For every full second we count the number of received packets
1155 * and bytes. If we get a lot of packets per second for this connection
1156 * (tcp_minmssoverload) we take a closer look at it and compute the
1157 * average packet size for the past second. If that is less than
1158 * tcp_minmss we get too many packets with very small payload which
1159 * is not good and burdens our system (and every packet generates
1160 * a wakeup to the process connected to our socket). We can reasonably
1161 * expect this to be a small-packet DoS attack to exhaust our CPU
1162 * cycles.
1163 *
1164 * Care has to be taken for the minimum packet overload value. This
1165 * value defines the minimum number of packets per second before we
1166 * start to worry. This must not be too low to avoid killing for
1167 * example interactive connections with many small packets like
1168 * telnet or SSH.
1169 *
1170 * Setting either tcp_minmssoverload or tcp_minmss to "0" disables
1171 * this check.
1172 *
1173 * Account for packet if payload packet, skip over ACK, etc.
1174 */
1175 if (tcp_minmss && tcp_minmssoverload &&
1176 tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
1177 if (tp->rcv_reset > tcp_now) {
1178 tp->rcv_pps++;
1179 tp->rcv_byps += tlen + off;
1180 if (tp->rcv_pps > tcp_minmssoverload) {
1181 if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
1182 char ipstrbuf[MAX_IPv6_STR_LEN];
1183 printf("too many small tcp packets from "
1184 "%s:%u, av. %lubyte/packet, "
1185 "dropping connection\n",
1186 #ifdef INET6
1187 isipv6 ?
1188 inet_ntop(AF_INET6, &inp->in6p_faddr, ipstrbuf,
1189 sizeof(ipstrbuf)) :
1190 #endif
1191 inet_ntop(AF_INET, &inp->inp_faddr, ipstrbuf,
1192 sizeof(ipstrbuf)),
1193 inp->inp_fport,
1194 tp->rcv_byps / tp->rcv_pps);
1195 tp = tcp_drop(tp, ECONNRESET);
1196 /* tcpstat.tcps_minmssdrops++; */
1197 goto drop;
1198 }
1199 }
1200 } else {
1201 tp->rcv_reset = tcp_now + PR_SLOWHZ;
1202 tp->rcv_pps = 1;
1203 tp->rcv_byps = tlen + off;
1204 }
1205 }
1206
1207 /*
1208 * Segment received on connection.
1209 * Reset idle time and keep-alive timer.
1210 */
1211 tp->t_rcvtime = 0;
1212 if (TCPS_HAVEESTABLISHED(tp->t_state))
1213 tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
1214
1215 /*
1216 * Process options if not in LISTEN state,
1217 * else do it below (after getting remote address).
1218 */
1219 if (tp->t_state != TCPS_LISTEN && optp)
1220 tcp_dooptions(tp, optp, optlen, th, &to);
1221
1222 /*
1223 * Header prediction: check for the two common cases
1224 * of a uni-directional data xfer. If the packet has
1225 * no control flags, is in-sequence, the window didn't
1226 * change and we're not retransmitting, it's a
1227 * candidate. If the length is zero and the ack moved
1228 * forward, we're the sender side of the xfer. Just
1229 * free the data acked & wake any higher level process
1230 * that was blocked waiting for space. If the length
1231 * is non-zero and the ack didn't move, we're the
1232 * receiver side. If we're getting packets in-order
1233 * (the reassembly queue is empty), add the data to
1234 * the socket buffer and note that we need a delayed ack.
1235 * Make sure that the hidden state-flags are also off.
1236 * Since we check for TCPS_ESTABLISHED above, it can only
1237 * be TH_NEEDSYN.
1238 */
1239 if (tp->t_state == TCPS_ESTABLISHED &&
1240 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1241 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1242 ((to.to_flag & TOF_TS) == 0 ||
1243 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1244 /*
1245 * Using the CC option is compulsory if once started:
1246 * the segment is OK if no T/TCP was negotiated or
1247 * if the segment has a CC option equal to CCrecv
1248 */
1249 ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
1250 ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
1251 th->th_seq == tp->rcv_nxt &&
1252 tiwin && tiwin == tp->snd_wnd &&
1253 tp->snd_nxt == tp->snd_max) {
1254
1255 /*
1256 * If last ACK falls within this segment's sequence numbers,
1257 * record the timestamp.
1258 * NOTE that the test is modified according to the latest
1259 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1260 */
1261 if ((to.to_flag & TOF_TS) != 0 &&
1262 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1263 tp->ts_recent_age = tcp_now;
1264 tp->ts_recent = to.to_tsval;
1265 }
1266
1267 if (tlen == 0) {
1268 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1269 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1270 tp->snd_cwnd >= tp->snd_wnd &&
1271 tp->t_dupacks < tcprexmtthresh) {
1272 /*
1273 * this is a pure ack for outstanding data.
1274 */
1275 ++tcpstat.tcps_predack;
1276 /*
1277 * "bad retransmit" recovery
1278 */
1279 if (tp->t_rxtshift == 1 &&
1280 tcp_now < tp->t_badrxtwin) {
1281 tp->snd_cwnd = tp->snd_cwnd_prev;
1282 tp->snd_ssthresh =
1283 tp->snd_ssthresh_prev;
1284 tp->snd_nxt = tp->snd_max;
1285 tp->t_badrxtwin = 0;
1286 }
1287 if (((to.to_flag & TOF_TS) != 0) && (to.to_tsecr != 0)) /* Makes sure we already have a TS */
1288 tcp_xmit_timer(tp,
1289 tcp_now - to.to_tsecr + 1);
1290 else if (tp->t_rtttime &&
1291 SEQ_GT(th->th_ack, tp->t_rtseq))
1292 tcp_xmit_timer(tp, tp->t_rtttime);
1293 acked = th->th_ack - tp->snd_una;
1294 tcpstat.tcps_rcvackpack++;
1295 tcpstat.tcps_rcvackbyte += acked;
1296 sbdrop(&so->so_snd, acked);
1297 tp->snd_una = th->th_ack;
1298 m_freem(m);
1299 ND6_HINT(tp); /* some progress has been done */
1300
1301 /*
1302 * If all outstanding data are acked, stop
1303 * retransmit timer, otherwise restart timer
1304 * using current (possibly backed-off) value.
1305 * If process is waiting for space,
1306 * wakeup/selwakeup/signal. If data
1307 * are ready to send, let tcp_output
1308 * decide between more output or persist.
1309 */
1310 if (tp->snd_una == tp->snd_max)
1311 tp->t_timer[TCPT_REXMT] = 0;
1312 else if (tp->t_timer[TCPT_PERSIST] == 0)
1313 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1314
1315 sowwakeup(so); /* has to be done with socket lock held */
1316 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW))
1317 (void) tcp_output(tp);
1318 tcp_unlock(so, 1, 0);
1319 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1320 return;
1321 }
1322 } else if (th->th_ack == tp->snd_una &&
1323 LIST_EMPTY(&tp->t_segq) &&
1324 tlen <= sbspace(&so->so_rcv)) {
1325 /*
1326 * this is a pure, in-sequence data packet
1327 * with nothing on the reassembly queue and
1328 * we have enough buffer space to take it.
1329 */
1330 ++tcpstat.tcps_preddat;
1331 tp->rcv_nxt += tlen;
1332 tcpstat.tcps_rcvpack++;
1333 tcpstat.tcps_rcvbyte += tlen;
1334 ND6_HINT(tp); /* some progress has been done */
1335 /*
1336 * Add data to socket buffer.
1337 */
1338 m_adj(m, drop_hdrlen); /* delayed header drop */
1339 if (sbappend(&so->so_rcv, m))
1340 sorwakeup(so);
1341 #if INET6
1342 if (isipv6) {
1343 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1344 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1345 th->th_seq, th->th_ack, th->th_win);
1346 }
1347 else
1348 #endif
1349 {
1350 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1351 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1352 th->th_seq, th->th_ack, th->th_win);
1353 }
1354 if (DELAY_ACK(tp)) {
1355 tp->t_flags |= TF_DELACK;
1356 } else {
1357 tp->t_flags |= TF_ACKNOW;
1358 tcp_output(tp);
1359 }
1360 tcp_unlock(so, 1, 0);
1361 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1362 return;
1363 }
1364 }
1365
1366 /*
1367 * Calculate amount of space in receive window,
1368 * and then do TCP input processing.
1369 * Receive window is amount of space in rcv queue,
1370 * but not less than advertised window.
1371 */
1372 #if 1
1373 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1374 #endif
1375 { int win;
1376
1377 win = sbspace(&so->so_rcv);
1378 if (win < 0)
1379 win = 0;
1380 else { /* clip rcv window to 4K for modems */
1381 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
1382 win = min(win, slowlink_wsize);
1383 }
1384 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1385 }
1386
1387 switch (tp->t_state) {
1388
1389 /*
1390 * If the state is LISTEN then ignore segment if it contains an RST.
1391 * If the segment contains an ACK then it is bad and send a RST.
1392 * If it does not contain a SYN then it is not interesting; drop it.
1393 * If it is from this socket, drop it, it must be forged.
1394 * Don't bother responding if the destination was a broadcast.
1395 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
1396 * tp->iss, and send a segment:
1397 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
1398 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
1399 * Fill in remote peer address fields if not previously specified.
1400 * Enter SYN_RECEIVED state, and process any other fields of this
1401 * segment in this state.
1402 */
1403 case TCPS_LISTEN: {
1404 register struct sockaddr_in *sin;
1405 #if INET6
1406 register struct sockaddr_in6 *sin6;
1407 #endif
1408
1409 #if 1
1410 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1411 #endif
1412 if (thflags & TH_RST)
1413 goto drop;
1414 if (thflags & TH_ACK) {
1415 rstreason = BANDLIM_RST_OPENPORT;
1416 goto dropwithreset;
1417 }
1418 if ((thflags & TH_SYN) == 0)
1419 goto drop;
1420 if (th->th_dport == th->th_sport) {
1421 #if INET6
1422 if (isipv6) {
1423 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1424 &ip6->ip6_src))
1425 goto drop;
1426 } else
1427 #endif /* INET6 */
1428 if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1429 goto drop;
1430 }
1431 /*
1432 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1433 * in_broadcast() should never return true on a received
1434 * packet with M_BCAST not set.
1435 *
1436 * Packets with a multicast source address should also
1437 * be discarded.
1438 */
1439 if (m->m_flags & (M_BCAST|M_MCAST))
1440 goto drop;
1441 #if INET6
1442 if (isipv6) {
1443 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1444 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1445 goto drop;
1446 } else
1447 #endif
1448 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1449 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1450 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
1451 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1452 goto drop;
1453 #if INET6
1454 if (isipv6) {
1455 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
1456 M_SONAME, M_NOWAIT);
1457 if (sin6 == NULL)
1458 goto drop;
1459 bzero(sin6, sizeof(*sin6));
1460 sin6->sin6_family = AF_INET6;
1461 sin6->sin6_len = sizeof(*sin6);
1462 sin6->sin6_addr = ip6->ip6_src;
1463 sin6->sin6_port = th->th_sport;
1464 laddr6 = inp->in6p_laddr;
1465 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1466 inp->in6p_laddr = ip6->ip6_dst;
1467 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
1468 proc0)) {
1469 inp->in6p_laddr = laddr6;
1470 FREE(sin6, M_SONAME);
1471 goto drop;
1472 }
1473 FREE(sin6, M_SONAME);
1474 } else
1475 #endif
1476 {
1477 #if 1
1478 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1479 #endif
1480 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
1481 M_NOWAIT);
1482 if (sin == NULL)
1483 goto drop;
1484 sin->sin_family = AF_INET;
1485 sin->sin_len = sizeof(*sin);
1486 sin->sin_addr = ip->ip_src;
1487 sin->sin_port = th->th_sport;
1488 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
1489 laddr = inp->inp_laddr;
1490 if (inp->inp_laddr.s_addr == INADDR_ANY)
1491 inp->inp_laddr = ip->ip_dst;
1492 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0)) {
1493 inp->inp_laddr = laddr;
1494 FREE(sin, M_SONAME);
1495 goto drop;
1496 }
1497 FREE(sin, M_SONAME);
1498 }
1499 if ((taop = tcp_gettaocache(inp)) == NULL) {
1500 taop = &tao_noncached;
1501 bzero(taop, sizeof(*taop));
1502 }
1503 tcp_dooptions(tp, optp, optlen, th, &to);
1504 if (iss)
1505 tp->iss = iss;
1506 else {
1507 tp->iss = tcp_new_isn(tp);
1508 }
1509 tp->irs = th->th_seq;
1510 tcp_sendseqinit(tp);
1511 tcp_rcvseqinit(tp);
1512 tp->snd_recover = tp->snd_una;
1513 /*
1514 * Initialization of the tcpcb for transaction;
1515 * set SND.WND = SEG.WND,
1516 * initialize CCsend and CCrecv.
1517 */
1518 tp->snd_wnd = tiwin; /* initial send-window */
1519 tp->cc_send = CC_INC(tcp_ccgen);
1520 tp->cc_recv = to.to_cc;
1521 /*
1522 * Perform TAO test on incoming CC (SEG.CC) option, if any.
1523 * - compare SEG.CC against cached CC from the same host,
1524 * if any.
1525 * - if SEG.CC > cached value, SYN must be new and is accepted
1526 * immediately: save new CC in the cache, mark the socket
1527 * connected, enter ESTABLISHED state, turn on flag to
1528 * send a SYN in the next segment.
1529 * A virtual advertised window is set in rcv_adv to
1530 * initialize SWS prevention. Then enter normal segment
1531 * processing: drop SYN, process data and FIN.
1532 * - otherwise do a normal 3-way handshake.
1533 */
1534 if ((to.to_flag & TOF_CC) != 0) {
1535 if (((tp->t_flags & TF_NOPUSH) != 0) &&
1536 taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) {
1537
1538 taop->tao_cc = to.to_cc;
1539
1540 tp->t_state = TCPS_ESTABLISHED;
1541
1542 /*
1543 * If there is a FIN, or if there is data and the
1544 * connection is local, then delay SYN,ACK(SYN) in
1545 * the hope of piggy-backing it on a response
1546 * segment. Otherwise must send ACK now in case
1547 * the other side is slow starting.
1548 */
1549 if (DELAY_ACK(tp) && ((thflags & TH_FIN) ||
1550 (tlen != 0 &&
1551 #if INET6
1552 (isipv6 && in6_localaddr(&inp->in6p_faddr))
1553 ||
1554 (!isipv6 &&
1555 #endif /* INET6 */
1556 in_localaddr(inp->inp_faddr)
1557 #if INET6
1558 )
1559 #endif /* INET6 */
1560 ))) {
1561 tp->t_flags |= (TF_DELACK | TF_NEEDSYN);
1562 }
1563 else {
1564 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1565 }
1566
1567 /*
1568 * Limit the `virtual advertised window' to TCP_MAXWIN
1569 * here. Even if we requested window scaling, it will
1570 * become effective only later when our SYN is acked.
1571 */
1572 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) /* clip window size for for slow link */
1573 tp->rcv_adv += min(tp->rcv_wnd, slowlink_wsize);
1574 else
1575 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN);
1576 tcpstat.tcps_connects++;
1577 soisconnected(so);
1578 tp->t_timer[TCPT_KEEP] = tcp_keepinit;
1579 dropsocket = 0; /* committed to socket */
1580 tcpstat.tcps_accepts++;
1581 goto trimthenstep6;
1582 }
1583 /* else do standard 3-way handshake */
1584 } else {
1585 /*
1586 * No CC option, but maybe CC.NEW:
1587 * invalidate cached value.
1588 */
1589 taop->tao_cc = 0;
1590 }
1591 /*
1592 * TAO test failed or there was no CC option,
1593 * do a standard 3-way handshake.
1594 */
1595 tp->t_flags |= TF_ACKNOW;
1596 tp->t_state = TCPS_SYN_RECEIVED;
1597 tp->t_timer[TCPT_KEEP] = tcp_keepinit;
1598 dropsocket = 0; /* committed to socket */
1599 tcpstat.tcps_accepts++;
1600 goto trimthenstep6;
1601 }
1602
1603 /*
1604 * If the state is SYN_RECEIVED:
1605 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
1606 */
1607 case TCPS_SYN_RECEIVED:
1608 if ((thflags & TH_ACK) &&
1609 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1610 SEQ_GT(th->th_ack, tp->snd_max))) {
1611 rstreason = BANDLIM_RST_OPENPORT;
1612 goto dropwithreset;
1613 }
1614 break;
1615
1616 /*
1617 * If the state is SYN_SENT:
1618 * if seg contains an ACK, but not for our SYN, drop the input.
1619 * if seg contains a RST, then drop the connection.
1620 * if seg does not contain SYN, then drop it.
1621 * Otherwise this is an acceptable SYN segment
1622 * initialize tp->rcv_nxt and tp->irs
1623 * if seg contains ack then advance tp->snd_una
1624 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1625 * arrange for segment to be acked (eventually)
1626 * continue processing rest of data/controls, beginning with URG
1627 */
1628 case TCPS_SYN_SENT:
1629 if ((taop = tcp_gettaocache(inp)) == NULL) {
1630 taop = &tao_noncached;
1631 bzero(taop, sizeof(*taop));
1632 }
1633
1634 if ((thflags & TH_ACK) &&
1635 (SEQ_LEQ(th->th_ack, tp->iss) ||
1636 SEQ_GT(th->th_ack, tp->snd_max))) {
1637 /*
1638 * If we have a cached CCsent for the remote host,
1639 * hence we haven't just crashed and restarted,
1640 * do not send a RST. This may be a retransmission
1641 * from the other side after our earlier ACK was lost.
1642 * Our new SYN, when it arrives, will serve as the
1643 * needed ACK.
1644 */
1645 if (taop->tao_ccsent != 0)
1646 goto drop;
1647 else {
1648 rstreason = BANDLIM_UNLIMITED;
1649 goto dropwithreset;
1650 }
1651 }
1652 if (thflags & TH_RST) {
1653 if (thflags & TH_ACK) {
1654 tp = tcp_drop(tp, ECONNREFUSED);
1655 postevent(so, 0, EV_RESET);
1656 }
1657 goto drop;
1658 }
1659 if ((thflags & TH_SYN) == 0)
1660 goto drop;
1661 tp->snd_wnd = th->th_win; /* initial send window */
1662 tp->cc_recv = to.to_cc; /* foreign CC */
1663
1664 tp->irs = th->th_seq;
1665 tcp_rcvseqinit(tp);
1666 if (thflags & TH_ACK) {
1667 /*
1668 * Our SYN was acked. If segment contains CC.ECHO
1669 * option, check it to make sure this segment really
1670 * matches our SYN. If not, just drop it as old
1671 * duplicate, but send an RST if we're still playing
1672 * by the old rules. If no CC.ECHO option, make sure
1673 * we don't get fooled into using T/TCP.
1674 */
1675 if (to.to_flag & TOF_CCECHO) {
1676 if (tp->cc_send != to.to_ccecho) {
1677 if (taop->tao_ccsent != 0)
1678 goto drop;
1679 else {
1680 rstreason = BANDLIM_UNLIMITED;
1681 goto dropwithreset;
1682 }
1683 }
1684 } else
1685 tp->t_flags &= ~TF_RCVD_CC;
1686 tcpstat.tcps_connects++;
1687 soisconnected(so);
1688 /* Do window scaling on this connection? */
1689 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1690 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1691 tp->snd_scale = tp->requested_s_scale;
1692 tp->rcv_scale = tp->request_r_scale;
1693 }
1694 /* Segment is acceptable, update cache if undefined. */
1695 if (taop->tao_ccsent == 0)
1696 taop->tao_ccsent = to.to_ccecho;
1697
1698 tp->rcv_adv += tp->rcv_wnd;
1699 tp->snd_una++; /* SYN is acked */
1700 /*
1701 * If there's data, delay ACK; if there's also a FIN
1702 * ACKNOW will be turned on later.
1703 */
1704 if (DELAY_ACK(tp) && tlen != 0) {
1705 tp->t_flags |= TF_DELACK;
1706 }
1707 else {
1708 tp->t_flags |= TF_ACKNOW;
1709 }
1710 /*
1711 * Received <SYN,ACK> in SYN_SENT[*] state.
1712 * Transitions:
1713 * SYN_SENT --> ESTABLISHED
1714 * SYN_SENT* --> FIN_WAIT_1
1715 */
1716 if (tp->t_flags & TF_NEEDFIN) {
1717 tp->t_state = TCPS_FIN_WAIT_1;
1718 tp->t_flags &= ~TF_NEEDFIN;
1719 thflags &= ~TH_SYN;
1720 } else {
1721 tp->t_state = TCPS_ESTABLISHED;
1722 tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
1723 }
1724 } else {
1725 /*
1726 * Received initial SYN in SYN-SENT[*] state => simul-
1727 * taneous open. If segment contains CC option and there is
1728 * a cached CC, apply TAO test; if it succeeds, connection is
1729 * half-synchronized. Otherwise, do 3-way handshake:
1730 * SYN-SENT -> SYN-RECEIVED
1731 * SYN-SENT* -> SYN-RECEIVED*
1732 * If there was no CC option, clear cached CC value.
1733 */
1734 tp->t_flags |= TF_ACKNOW;
1735 tp->t_timer[TCPT_REXMT] = 0;
1736 if (to.to_flag & TOF_CC) {
1737 if (taop->tao_cc != 0 &&
1738 CC_GT(to.to_cc, taop->tao_cc)) {
1739 /*
1740 * update cache and make transition:
1741 * SYN-SENT -> ESTABLISHED*
1742 * SYN-SENT* -> FIN-WAIT-1*
1743 */
1744 taop->tao_cc = to.to_cc;
1745 if (tp->t_flags & TF_NEEDFIN) {
1746 tp->t_state = TCPS_FIN_WAIT_1;
1747 tp->t_flags &= ~TF_NEEDFIN;
1748 } else {
1749 tp->t_state = TCPS_ESTABLISHED;
1750 tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
1751 }
1752 tp->t_flags |= TF_NEEDSYN;
1753 } else
1754 tp->t_state = TCPS_SYN_RECEIVED;
1755 } else {
1756 /* CC.NEW or no option => invalidate cache */
1757 taop->tao_cc = 0;
1758 tp->t_state = TCPS_SYN_RECEIVED;
1759 }
1760 }
1761
1762 trimthenstep6:
1763 /*
1764 * Advance th->th_seq to correspond to first data byte.
1765 * If data, trim to stay within window,
1766 * dropping FIN if necessary.
1767 */
1768 th->th_seq++;
1769 if (tlen > tp->rcv_wnd) {
1770 todrop = tlen - tp->rcv_wnd;
1771 m_adj(m, -todrop);
1772 tlen = tp->rcv_wnd;
1773 thflags &= ~TH_FIN;
1774 tcpstat.tcps_rcvpackafterwin++;
1775 tcpstat.tcps_rcvbyteafterwin += todrop;
1776 }
1777 tp->snd_wl1 = th->th_seq - 1;
1778 tp->rcv_up = th->th_seq;
1779 /*
1780 * Client side of transaction: already sent SYN and data.
1781 * If the remote host used T/TCP to validate the SYN,
1782 * our data will be ACK'd; if so, enter normal data segment
1783 * processing in the middle of step 5, ack processing.
1784 * Otherwise, goto step 6.
1785 */
1786 if (thflags & TH_ACK)
1787 goto process_ACK;
1788 goto step6;
1789 /*
1790 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1791 * if segment contains a SYN and CC [not CC.NEW] option:
1792 * if state == TIME_WAIT and connection duration > MSL,
1793 * drop packet and send RST;
1794 *
1795 * if SEG.CC > CCrecv then is new SYN, and can implicitly
1796 * ack the FIN (and data) in retransmission queue.
1797 * Complete close and delete TCPCB. Then reprocess
1798 * segment, hoping to find new TCPCB in LISTEN state;
1799 *
1800 * else must be old SYN; drop it.
1801 * else do normal processing.
1802 */
1803 case TCPS_LAST_ACK:
1804 case TCPS_CLOSING:
1805 case TCPS_TIME_WAIT:
1806 if ((thflags & TH_SYN) &&
1807 (to.to_flag & TOF_CC) && tp->cc_recv != 0) {
1808 if (tp->t_state == TCPS_TIME_WAIT &&
1809 tp->t_starttime > tcp_msl) {
1810 rstreason = BANDLIM_UNLIMITED;
1811 goto dropwithreset;
1812 }
1813 if (CC_GT(to.to_cc, tp->cc_recv)) {
1814 tp = tcp_close(tp);
1815 tcp_unlock(so, 1, 50);
1816 goto findpcb;
1817 }
1818 else
1819 goto drop;
1820 }
1821 break; /* continue normal processing */
1822
1823 /* Received a SYN while connection is already established.
1824 * This is a "half open connection and other anomalies" described
1825 * in RFC793 page 34; send an ACK so the remote end resets the connection
1826 * or recovers by adjusting its sequence numbering
1827 */
1828 case TCPS_ESTABLISHED:
1829 if (thflags & TH_SYN)
1830 goto dropafterack;
1831 break;
1832 }
1833
1834 /*
1835 * States other than LISTEN or SYN_SENT.
1836 * First check the RST flag and sequence number since reset segments
1837 * are exempt from the timestamp and connection count tests. This
1838 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1839 * below which allowed reset segments in half the sequence space
1840 * to fall through and be processed (which gives forged reset
1841 * segments with a random sequence number a 50 percent chance of
1842 * killing a connection).
1843 * Then check timestamp, if present.
1844 * Then check the connection count, if present.
1845 * Then check that at least some bytes of segment are within
1846 * receive window. If segment begins before rcv_nxt,
1847 * drop leading data (and SYN); if nothing left, just ack.
1848 *
1849 *
1850 * If the RST bit is set, check the sequence number to see
1851 * if this is a valid reset segment.
1852 * RFC 793 page 37:
1853 * In all states except SYN-SENT, all reset (RST) segments
1854 * are validated by checking their SEQ-fields. A reset is
1855 * valid if its sequence number is in the window.
1856 * Note: this does not take into account delayed ACKs, so
1857 * we should test against last_ack_sent instead of rcv_nxt.
1858 * The sequence number in the reset segment is normally an
1859 * echo of our outgoing acknowledgement numbers, but some hosts
1860 * send a reset with the sequence number at the rightmost edge
1861 * of our receive window, and we have to handle this case.
1862 * If we have multiple segments in flight, the initial reset
1863 * segment sequence numbers will be to the left of last_ack_sent,
1864 * but they will eventually catch up.
1865 * In any case, it never made sense to trim reset segments to
1866 * fit the receive window since RFC 1122 says:
1867 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
1868 *
1869 * A TCP SHOULD allow a received RST segment to include data.
1870 *
1871 * DISCUSSION
1872 * It has been suggested that a RST segment could contain
1873 * ASCII text that encoded and explained the cause of the
1874 * RST. No standard has yet been established for such
1875 * data.
1876 *
1877 * If the reset segment passes the sequence number test examine
1878 * the state:
1879 * SYN_RECEIVED STATE:
1880 * If passive open, return to LISTEN state.
1881 * If active open, inform user that connection was refused.
1882 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1883 * Inform user that connection was reset, and close tcb.
1884 * CLOSING, LAST_ACK STATES:
1885 * Close the tcb.
1886 * TIME_WAIT STATE:
1887 * Drop the segment - see Stevens, vol. 2, p. 964 and
1888 * RFC 1337.
1889 */
1890 if (thflags & TH_RST) {
1891 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1892 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1893 switch (tp->t_state) {
1894
1895 case TCPS_SYN_RECEIVED:
1896 so->so_error = ECONNREFUSED;
1897 goto close;
1898
1899 case TCPS_ESTABLISHED:
1900 case TCPS_FIN_WAIT_1:
1901 case TCPS_CLOSE_WAIT:
1902 /*
1903 Drop through ...
1904 */
1905 case TCPS_FIN_WAIT_2:
1906 so->so_error = ECONNRESET;
1907 close:
1908 postevent(so, 0, EV_RESET);
1909 tp->t_state = TCPS_CLOSED;
1910 tcpstat.tcps_drops++;
1911 tp = tcp_close(tp);
1912 break;
1913
1914 case TCPS_CLOSING:
1915 case TCPS_LAST_ACK:
1916 tp = tcp_close(tp);
1917 break;
1918
1919 case TCPS_TIME_WAIT:
1920 break;
1921 }
1922 }
1923 goto drop;
1924 }
1925
1926 #if 1
1927 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1928 #endif
1929 /*
1930 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1931 * and it's less than ts_recent, drop it.
1932 */
1933 if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent &&
1934 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1935
1936 /* Check to see if ts_recent is over 24 days old. */
1937 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1938 /*
1939 * Invalidate ts_recent. If this segment updates
1940 * ts_recent, the age will be reset later and ts_recent
1941 * will get a valid value. If it does not, setting
1942 * ts_recent to zero will at least satisfy the
1943 * requirement that zero be placed in the timestamp
1944 * echo reply when ts_recent isn't valid. The
1945 * age isn't reset until we get a valid ts_recent
1946 * because we don't want out-of-order segments to be
1947 * dropped when ts_recent is old.
1948 */
1949 tp->ts_recent = 0;
1950 } else {
1951 tcpstat.tcps_rcvduppack++;
1952 tcpstat.tcps_rcvdupbyte += tlen;
1953 tcpstat.tcps_pawsdrop++;
1954 goto dropafterack;
1955 }
1956 }
1957
1958 /*
1959 * T/TCP mechanism
1960 * If T/TCP was negotiated and the segment doesn't have CC,
1961 * or if its CC is wrong then drop the segment.
1962 * RST segments do not have to comply with this.
1963 */
1964 if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
1965 ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
1966 goto dropafterack;
1967
1968 /*
1969 * In the SYN-RECEIVED state, validate that the packet belongs to
1970 * this connection before trimming the data to fit the receive
1971 * window. Check the sequence number versus IRS since we know
1972 * the sequence numbers haven't wrapped. This is a partial fix
1973 * for the "LAND" DoS attack.
1974 */
1975 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1976 rstreason = BANDLIM_RST_OPENPORT;
1977 goto dropwithreset;
1978 }
1979
1980 todrop = tp->rcv_nxt - th->th_seq;
1981 if (todrop > 0) {
1982 if (thflags & TH_SYN) {
1983 thflags &= ~TH_SYN;
1984 th->th_seq++;
1985 if (th->th_urp > 1)
1986 th->th_urp--;
1987 else
1988 thflags &= ~TH_URG;
1989 todrop--;
1990 }
1991 /*
1992 * Following if statement from Stevens, vol. 2, p. 960.
1993 */
1994 if (todrop > tlen
1995 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1996 /*
1997 * Any valid FIN must be to the left of the window.
1998 * At this point the FIN must be a duplicate or out
1999 * of sequence; drop it.
2000 */
2001 thflags &= ~TH_FIN;
2002
2003 /*
2004 * Send an ACK to resynchronize and drop any data.
2005 * But keep on processing for RST or ACK.
2006 */
2007 tp->t_flags |= TF_ACKNOW;
2008 todrop = tlen;
2009 tcpstat.tcps_rcvduppack++;
2010 tcpstat.tcps_rcvdupbyte += todrop;
2011 } else {
2012 tcpstat.tcps_rcvpartduppack++;
2013 tcpstat.tcps_rcvpartdupbyte += todrop;
2014 }
2015 drop_hdrlen += todrop; /* drop from the top afterwards */
2016 th->th_seq += todrop;
2017 tlen -= todrop;
2018 if (th->th_urp > todrop)
2019 th->th_urp -= todrop;
2020 else {
2021 thflags &= ~TH_URG;
2022 th->th_urp = 0;
2023 }
2024 }
2025
2026 /*
2027 * If new data are received on a connection after the
2028 * user processes are gone, then RST the other end.
2029 */
2030 if ((so->so_state & SS_NOFDREF) &&
2031 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
2032 tp = tcp_close(tp);
2033 tcpstat.tcps_rcvafterclose++;
2034 rstreason = BANDLIM_UNLIMITED;
2035 goto dropwithreset;
2036 }
2037
2038 /*
2039 * If segment ends after window, drop trailing data
2040 * (and PUSH and FIN); if nothing left, just ACK.
2041 */
2042 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
2043 if (todrop > 0) {
2044 tcpstat.tcps_rcvpackafterwin++;
2045 if (todrop >= tlen) {
2046 tcpstat.tcps_rcvbyteafterwin += tlen;
2047 /*
2048 * If a new connection request is received
2049 * while in TIME_WAIT, drop the old connection
2050 * and start over if the sequence numbers
2051 * are above the previous ones.
2052 */
2053 if (thflags & TH_SYN &&
2054 tp->t_state == TCPS_TIME_WAIT &&
2055 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2056 iss = tcp_new_isn(tp);
2057 tp = tcp_close(tp);
2058 tcp_unlock(so, 1, 0);
2059 goto findpcb;
2060 }
2061 /*
2062 * If window is closed can only take segments at
2063 * window edge, and have to drop data and PUSH from
2064 * incoming segments. Continue processing, but
2065 * remember to ack. Otherwise, drop segment
2066 * and ack.
2067 */
2068 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2069 tp->t_flags |= TF_ACKNOW;
2070 tcpstat.tcps_rcvwinprobe++;
2071 } else
2072 goto dropafterack;
2073 } else
2074 tcpstat.tcps_rcvbyteafterwin += todrop;
2075 m_adj(m, -todrop);
2076 tlen -= todrop;
2077 thflags &= ~(TH_PUSH|TH_FIN);
2078 }
2079
2080 /*
2081 * If last ACK falls within this segment's sequence numbers,
2082 * record its timestamp.
2083 * NOTE that the test is modified according to the latest
2084 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2085 */
2086 if ((to.to_flag & TOF_TS) != 0 &&
2087 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2088 tp->ts_recent_age = tcp_now;
2089 tp->ts_recent = to.to_tsval;
2090 }
2091
2092 /*
2093 * If a SYN is in the window, then this is an
2094 * error and we send an RST and drop the connection.
2095 */
2096 if (thflags & TH_SYN) {
2097 tp = tcp_drop(tp, ECONNRESET);
2098 rstreason = BANDLIM_UNLIMITED;
2099 postevent(so, 0, EV_RESET);
2100 goto dropwithreset;
2101 }
2102
2103 /*
2104 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
2105 * flag is on (half-synchronized state), then queue data for
2106 * later processing; else drop segment and return.
2107 */
2108 if ((thflags & TH_ACK) == 0) {
2109 if (tp->t_state == TCPS_SYN_RECEIVED ||
2110 (tp->t_flags & TF_NEEDSYN))
2111 goto step6;
2112 else
2113 goto drop;
2114 }
2115
2116 /*
2117 * Ack processing.
2118 */
2119 switch (tp->t_state) {
2120
2121 /*
2122 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
2123 * ESTABLISHED state and continue processing.
2124 * The ACK was checked above.
2125 */
2126 case TCPS_SYN_RECEIVED:
2127
2128 tcpstat.tcps_connects++;
2129 soisconnected(so);
2130
2131 /* Do window scaling? */
2132 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2133 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2134 tp->snd_scale = tp->requested_s_scale;
2135 tp->rcv_scale = tp->request_r_scale;
2136 }
2137 /*
2138 * Upon successful completion of 3-way handshake,
2139 * update cache.CC if it was undefined, pass any queued
2140 * data to the user, and advance state appropriately.
2141 */
2142 if ((taop = tcp_gettaocache(inp)) != NULL &&
2143 taop->tao_cc == 0)
2144 taop->tao_cc = tp->cc_recv;
2145
2146 /*
2147 * Make transitions:
2148 * SYN-RECEIVED -> ESTABLISHED
2149 * SYN-RECEIVED* -> FIN-WAIT-1
2150 */
2151 if (tp->t_flags & TF_NEEDFIN) {
2152 tp->t_state = TCPS_FIN_WAIT_1;
2153 tp->t_flags &= ~TF_NEEDFIN;
2154 } else {
2155 tp->t_state = TCPS_ESTABLISHED;
2156 tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp);
2157 }
2158 /*
2159 * If segment contains data or ACK, will call tcp_reass()
2160 * later; if not, do so now to pass queued data to user.
2161 */
2162 if (tlen == 0 && (thflags & TH_FIN) == 0)
2163 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
2164 (struct mbuf *)0);
2165 tp->snd_wl1 = th->th_seq - 1;
2166 /* fall into ... */
2167
2168 /*
2169 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2170 * ACKs. If the ack is in the range
2171 * tp->snd_una < th->th_ack <= tp->snd_max
2172 * then advance tp->snd_una to th->th_ack and drop
2173 * data from the retransmission queue. If this ACK reflects
2174 * more up to date window information we update our window information.
2175 */
2176 case TCPS_ESTABLISHED:
2177 case TCPS_FIN_WAIT_1:
2178 case TCPS_FIN_WAIT_2:
2179 case TCPS_CLOSE_WAIT:
2180 case TCPS_CLOSING:
2181 case TCPS_LAST_ACK:
2182 case TCPS_TIME_WAIT:
2183
2184 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2185 if (tlen == 0 && tiwin == tp->snd_wnd) {
2186 tcpstat.tcps_rcvdupack++;
2187 /*
2188 * If we have outstanding data (other than
2189 * a window probe), this is a completely
2190 * duplicate ack (ie, window info didn't
2191 * change), the ack is the biggest we've
2192 * seen and we've seen exactly our rexmt
2193 					 * threshold of them, assume a packet
2194 * has been dropped and retransmit it.
2195 * Kludge snd_nxt & the congestion
2196 * window so we send only this one
2197 * packet.
2198 *
2199 * We know we're losing at the current
2200 * window size so do congestion avoidance
2201 * (set ssthresh to half the current window
2202 * and pull our congestion window back to
2203 * the new ssthresh).
2204 *
2205 * Dup acks mean that packets have left the
2206 * network (they're now cached at the receiver)
2207 * so bump cwnd by the amount in the receiver
2208 * to keep a constant cwnd packets in the
2209 * network.
2210 */
2211 if (tp->t_timer[TCPT_REXMT] == 0 ||
2212 th->th_ack != tp->snd_una)
2213 tp->t_dupacks = 0;
2214 else if (++tp->t_dupacks == tcprexmtthresh) {
2215 tcp_seq onxt = tp->snd_nxt;
2216 u_int win =
2217 min(tp->snd_wnd, tp->snd_cwnd) / 2 /
2218 tp->t_maxseg;
2219 if (tcp_do_newreno && SEQ_LT(th->th_ack,
2220 tp->snd_recover)) {
2221 /* False retransmit, should not
2222 * cut window
2223 */
2224 tp->snd_cwnd += tp->t_maxseg;
2225 tp->t_dupacks = 0;
2226 (void) tcp_output(tp);
2227 goto drop;
2228 }
2229 if (win < 2)
2230 win = 2;
2231 tp->snd_ssthresh = win * tp->t_maxseg;
2232 tp->snd_recover = tp->snd_max;
2233 tp->t_timer[TCPT_REXMT] = 0;
2234 tp->t_rtttime = 0;
2235 tp->snd_nxt = th->th_ack;
2236 tp->snd_cwnd = tp->t_maxseg;
2237 (void) tcp_output(tp);
2238 tp->snd_cwnd = tp->snd_ssthresh +
2239 tp->t_maxseg * tp->t_dupacks;
2240 if (SEQ_GT(onxt, tp->snd_nxt))
2241 tp->snd_nxt = onxt;
2242 goto drop;
2243 } else if (tp->t_dupacks > tcprexmtthresh) {
2244 tp->snd_cwnd += tp->t_maxseg;
2245 (void) tcp_output(tp);
2246 goto drop;
2247 }
2248 } else
2249 tp->t_dupacks = 0;
2250 break;
2251 }
2252 /*
2253 * If the congestion window was inflated to account
2254 * for the other side's cached packets, retract it.
2255 */
2256 if (tcp_do_newreno == 0) {
2257 if (tp->t_dupacks >= tcprexmtthresh &&
2258 tp->snd_cwnd > tp->snd_ssthresh)
2259 tp->snd_cwnd = tp->snd_ssthresh;
2260 tp->t_dupacks = 0;
2261 } else if (tp->t_dupacks >= tcprexmtthresh &&
2262 !tcp_newreno(tp, th)) {
2263 /*
2264 * Window inflation should have left us with approx.
2265 * snd_ssthresh outstanding data. But in case we
2266 * would be inclined to send a burst, better to do
2267 * it via the slow start mechanism.
2268 */
2269 if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
2270 tp->snd_cwnd =
2271 tp->snd_max - th->th_ack + tp->t_maxseg;
2272 else
2273 tp->snd_cwnd = tp->snd_ssthresh;
2274 tp->t_dupacks = 0;
2275 }
2276
2277 if (tp->t_dupacks < tcprexmtthresh)
2278 tp->t_dupacks = 0;
2279
2280 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2281 tcpstat.tcps_rcvacktoomuch++;
2282 goto dropafterack;
2283 }
2284 /*
2285 * If we reach this point, ACK is not a duplicate,
2286 * i.e., it ACKs something we sent.
2287 */
2288 if (tp->t_flags & TF_NEEDSYN) {
2289 /*
2290 * T/TCP: Connection was half-synchronized, and our
2291 * SYN has been ACK'd (so connection is now fully
2292 * synchronized). Go to non-starred state,
2293 * increment snd_una for ACK of SYN, and check if
2294 * we can do window scaling.
2295 */
2296 tp->t_flags &= ~TF_NEEDSYN;
2297 tp->snd_una++;
2298 /* Do window scaling? */
2299 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2300 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2301 tp->snd_scale = tp->requested_s_scale;
2302 tp->rcv_scale = tp->request_r_scale;
2303 }
2304 }
2305
2306 process_ACK:
2307 acked = th->th_ack - tp->snd_una;
2308 tcpstat.tcps_rcvackpack++;
2309 tcpstat.tcps_rcvackbyte += acked;
2310
2311 /*
2312 * If we just performed our first retransmit, and the ACK
2313 * arrives within our recovery window, then it was a mistake
2314 * to do the retransmit in the first place. Recover our
2315 * original cwnd and ssthresh, and proceed to transmit where
2316 * we left off.
2317 */
2318 if (tp->t_rxtshift == 1 && tcp_now < tp->t_badrxtwin) {
2319 tp->snd_cwnd = tp->snd_cwnd_prev;
2320 tp->snd_ssthresh = tp->snd_ssthresh_prev;
2321 tp->snd_nxt = tp->snd_max;
2322 tp->t_badrxtwin = 0; /* XXX probably not required */
2323 }
2324
2325 /*
2326 * If we have a timestamp reply, update smoothed
2327 * round trip time. If no timestamp is present but
2328 * transmit timer is running and timed sequence
2329 * number was acked, update smoothed round trip time.
2330 * Since we now have an rtt measurement, cancel the
2331 * timer backoff (cf., Phil Karn's retransmit alg.).
2332 * Recompute the initial retransmit timer.
2333 * Also makes sure we have a valid time stamp in hand
2334 */
2335 if (((to.to_flag & TOF_TS) != 0) && (to.to_tsecr != 0))
2336 tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1);
2337 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2338 tcp_xmit_timer(tp, tp->t_rtttime);
2339
2340 /*
2341 * If all outstanding data is acked, stop retransmit
2342 * timer and remember to restart (more output or persist).
2343 * If there is more data to be acked, restart retransmit
2344 * timer, using current (possibly backed-off) value.
2345 */
2346 if (th->th_ack == tp->snd_max) {
2347 tp->t_timer[TCPT_REXMT] = 0;
2348 needoutput = 1;
2349 } else if (tp->t_timer[TCPT_PERSIST] == 0)
2350 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
2351
2352 /*
2353 * If no data (only SYN) was ACK'd,
2354 * skip rest of ACK processing.
2355 */
2356 if (acked == 0)
2357 goto step6;
2358
2359 /*
2360 * When new data is acked, open the congestion window.
2361 * If the window gives us less than ssthresh packets
2362 * in flight, open exponentially (maxseg per packet).
2363 * Otherwise open linearly: maxseg per window
2364 * (maxseg^2 / cwnd per packet).
2365 */
2366 {
2367 register u_int cw = tp->snd_cwnd;
2368 register u_int incr = tp->t_maxseg;
2369
2370 if (cw > tp->snd_ssthresh)
2371 incr = incr * incr / cw;
2372 /*
2373 * If t_dupacks != 0 here, it indicates that we are still
2374 * in NewReno fast recovery mode, so we leave the congestion
2375 * window alone.
2376 */
2377 if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
2378 tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
2379 }
2380 if (acked > so->so_snd.sb_cc) {
2381 tp->snd_wnd -= so->so_snd.sb_cc;
2382 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2383 ourfinisacked = 1;
2384 } else {
2385 sbdrop(&so->so_snd, acked);
2386 tp->snd_wnd -= acked;
2387 ourfinisacked = 0;
2388 }
2389 sowwakeup(so);
2390 /* detect una wraparound */
2391 if (SEQ_GEQ(tp->snd_una, tp->snd_recover) &&
2392 SEQ_LT(th->th_ack, tp->snd_recover))
2393 tp->snd_recover = th->th_ack;
2394 if (SEQ_GT(tp->snd_una, tp->snd_high) &&
2395 SEQ_LEQ(th->th_ack, tp->snd_high))
2396 tp->snd_high = th->th_ack - 1;
2397 tp->snd_una = th->th_ack;
2398 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2399 tp->snd_nxt = tp->snd_una;
2400
2401 switch (tp->t_state) {
2402
2403 /*
2404 * In FIN_WAIT_1 STATE in addition to the processing
2405 * for the ESTABLISHED state if our FIN is now acknowledged
2406 * then enter FIN_WAIT_2.
2407 */
2408 case TCPS_FIN_WAIT_1:
2409 if (ourfinisacked) {
2410 /*
2411 * If we can't receive any more
2412 * data, then closing user can proceed.
2413 * Starting the timer is contrary to the
2414 * specification, but if we don't get a FIN
2415 * we'll hang forever.
2416 */
2417 if (so->so_state & SS_CANTRCVMORE) {
2418 soisdisconnected(so);
2419 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
2420 }
2421 add_to_time_wait(tp);
2422 tp->t_state = TCPS_FIN_WAIT_2;
2423 goto drop;
2424 }
2425 break;
2426
2427 /*
2428 * In CLOSING STATE in addition to the processing for
2429 * the ESTABLISHED state if the ACK acknowledges our FIN
2430 * then enter the TIME-WAIT state, otherwise ignore
2431 * the segment.
2432 */
2433 case TCPS_CLOSING:
2434 if (ourfinisacked) {
2435 tp->t_state = TCPS_TIME_WAIT;
2436 tcp_canceltimers(tp);
2437 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2438 if (tp->cc_recv != 0 &&
2439 tp->t_starttime < tcp_msl)
2440 tp->t_timer[TCPT_2MSL] =
2441 tp->t_rxtcur * TCPTV_TWTRUNC;
2442 else
2443 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2444 add_to_time_wait(tp);
2445 soisdisconnected(so);
2446 }
2447 break;
2448
2449 /*
2450 * In LAST_ACK, we may still be waiting for data to drain
2451 * and/or to be acked, as well as for the ack of our FIN.
2452 * If our FIN is now acknowledged, delete the TCB,
2453 * enter the closed state and return.
2454 */
2455 case TCPS_LAST_ACK:
2456 if (ourfinisacked) {
2457 tp = tcp_close(tp);
2458 goto drop;
2459 }
2460 break;
2461
2462 /*
2463 * In TIME_WAIT state the only thing that should arrive
2464 * is a retransmission of the remote FIN. Acknowledge
2465 * it and restart the finack timer.
2466 */
2467 case TCPS_TIME_WAIT:
2468 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2469 add_to_time_wait(tp);
2470 goto dropafterack;
2471 }
2472 }
2473
2474 step6:
2475 /*
2476 * Update window information.
2477 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2478 */
2479 if ((thflags & TH_ACK) &&
2480 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2481 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2482 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2483 /* keep track of pure window updates */
2484 if (tlen == 0 &&
2485 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2486 tcpstat.tcps_rcvwinupd++;
2487 tp->snd_wnd = tiwin;
2488 tp->snd_wl1 = th->th_seq;
2489 tp->snd_wl2 = th->th_ack;
2490 if (tp->snd_wnd > tp->max_sndwnd)
2491 tp->max_sndwnd = tp->snd_wnd;
2492 needoutput = 1;
2493 }
2494
2495 /*
2496 * Process segments with URG.
2497 */
2498 if ((thflags & TH_URG) && th->th_urp &&
2499 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2500 /*
2501 * This is a kludge, but if we receive and accept
2502 * random urgent pointers, we'll crash in
2503 * soreceive. It's hard to imagine someone
2504 * actually wanting to send this much urgent data.
2505 */
2506 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2507 th->th_urp = 0; /* XXX */
2508 thflags &= ~TH_URG; /* XXX */
2509 goto dodata; /* XXX */
2510 }
2511 /*
2512 * If this segment advances the known urgent pointer,
2513 * then mark the data stream. This should not happen
2514 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2515 * a FIN has been received from the remote side.
2516 * In these states we ignore the URG.
2517 *
2518 * According to RFC961 (Assigned Protocols),
2519 * the urgent pointer points to the last octet
2520 * of urgent data. We continue, however,
2521 * to consider it to indicate the first octet
2522 * of data past the urgent section as the original
2523 * spec states (in one of two places).
2524 */
2525 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2526 tp->rcv_up = th->th_seq + th->th_urp;
2527 so->so_oobmark = so->so_rcv.sb_cc +
2528 (tp->rcv_up - tp->rcv_nxt) - 1;
2529 if (so->so_oobmark == 0) {
2530 so->so_state |= SS_RCVATMARK;
2531 postevent(so, 0, EV_OOB);
2532 }
2533 sohasoutofband(so);
2534 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2535 }
2536 /*
2537 * Remove out of band data so doesn't get presented to user.
2538 * This can happen independent of advancing the URG pointer,
2539 * but if two URG's are pending at once, some out-of-band
2540 * data may creep in... ick.
2541 */
2542 if (th->th_urp <= (u_long)tlen
2543 #if SO_OOBINLINE
2544 && (so->so_options & SO_OOBINLINE) == 0
2545 #endif
2546 )
2547 tcp_pulloutofband(so, th, m,
2548 drop_hdrlen); /* hdr drop is delayed */
2549 } else
2550 /*
2551 * If no out of band data is expected,
2552 * pull receive urgent pointer along
2553 * with the receive window.
2554 */
2555 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2556 tp->rcv_up = tp->rcv_nxt;
2557 dodata: /* XXX */
2558
2559 /*
2560 * Process the segment text, merging it into the TCP sequencing queue,
2561 * and arranging for acknowledgment of receipt if necessary.
2562 * This process logically involves adjusting tp->rcv_wnd as data
2563 * is presented to the user (this happens in tcp_usrreq.c,
2564 * case PRU_RCVD). If a FIN has already been received on this
2565 * connection then we just ignore the text.
2566 */
2567 if ((tlen || (thflags&TH_FIN)) &&
2568 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2569 m_adj(m, drop_hdrlen); /* delayed header drop */
2570 /*
2571 		 * Insert segment which includes th into reassembly queue of tcp with
2572 		 * control block tp.  Return TH_FIN if reassembly now includes
2573 		 * a segment with FIN.  This handles the common case inline (segment
2574 * is the next to be received on an established connection, and the
2575 * queue is empty), avoiding linkage into and removal from the queue
2576 * and repetition of various conversions.
2577 * Set DELACK for segments received in order, but ack immediately
2578 * when segments are out of order (so fast retransmit can work).
2579 */
2580 if (th->th_seq == tp->rcv_nxt &&
2581 LIST_EMPTY(&tp->t_segq) &&
2582 TCPS_HAVEESTABLISHED(tp->t_state)) {
2583 if (DELAY_ACK(tp) && ((tp->t_flags & TF_ACKNOW) == 0)) {
2584 tp->t_flags |= TF_DELACK;
2585 }
2586 else {
2587 tp->t_flags |= TF_ACKNOW;
2588 }
2589 tp->rcv_nxt += tlen;
2590 thflags = th->th_flags & TH_FIN;
2591 tcpstat.tcps_rcvpack++;
2592 tcpstat.tcps_rcvbyte += tlen;
2593 ND6_HINT(tp);
2594 if (sbappend(&so->so_rcv, m))
2595 sorwakeup(so);
2596 } else {
2597 thflags = tcp_reass(tp, th, &tlen, m);
2598 tp->t_flags |= TF_ACKNOW;
2599 }
2600
2601 if (tp->t_flags & TF_DELACK)
2602 {
2603 #if INET6
2604 if (isipv6) {
2605 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2606 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2607 th->th_seq, th->th_ack, th->th_win);
2608 }
2609 else
2610 #endif
2611 {
2612 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2613 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2614 th->th_seq, th->th_ack, th->th_win);
2615 }
2616
2617 }
2618 /*
2619 * Note the amount of data that peer has sent into
2620 * our window, in order to estimate the sender's
2621 * buffer size.
2622 */
2623 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2624 } else {
2625 m_freem(m);
2626 thflags &= ~TH_FIN;
2627 }
2628
2629 /*
2630 * If FIN is received ACK the FIN and let the user know
2631 * that the connection is closing.
2632 */
2633 if (thflags & TH_FIN) {
2634 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2635 socantrcvmore(so);
2636 postevent(so, 0, EV_FIN);
2637 /*
2638 * If connection is half-synchronized
2639 * (ie NEEDSYN flag on) then delay ACK,
2640 * so it may be piggybacked when SYN is sent.
2641 * Otherwise, since we received a FIN then no
2642 * more input can be expected, send ACK now.
2643 */
2644 if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) {
2645 tp->t_flags |= TF_DELACK;
2646 }
2647 else {
2648 tp->t_flags |= TF_ACKNOW;
2649 }
2650 tp->rcv_nxt++;
2651 }
2652 switch (tp->t_state) {
2653
2654 /*
2655 * In SYN_RECEIVED and ESTABLISHED STATES
2656 * enter the CLOSE_WAIT state.
2657 */
2658 case TCPS_SYN_RECEIVED:
2659 /*FALLTHROUGH*/
2660 case TCPS_ESTABLISHED:
2661 tp->t_state = TCPS_CLOSE_WAIT;
2662 break;
2663
2664 /*
2665 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2666 * enter the CLOSING state.
2667 */
2668 case TCPS_FIN_WAIT_1:
2669 tp->t_state = TCPS_CLOSING;
2670 break;
2671
2672 /*
2673 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2674 * starting the time-wait timer, turning off the other
2675 * standard timers.
2676 */
2677 case TCPS_FIN_WAIT_2:
2678 tp->t_state = TCPS_TIME_WAIT;
2679 tcp_canceltimers(tp);
2680 /* Shorten TIME_WAIT [RFC-1644, p.28] */
2681 if (tp->cc_recv != 0 &&
2682 tp->t_starttime < tcp_msl) {
2683 tp->t_timer[TCPT_2MSL] =
2684 tp->t_rxtcur * TCPTV_TWTRUNC;
2685 /* For transaction client, force ACK now. */
2686 tp->t_flags |= TF_ACKNOW;
2687 }
2688 else
2689 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2690
2691 add_to_time_wait(tp);
2692 soisdisconnected(so);
2693 break;
2694
2695 /*
2696 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2697 */
2698 case TCPS_TIME_WAIT:
2699 tp->t_timer[TCPT_2MSL] = 2 * tcp_msl;
2700 add_to_time_wait(tp);
2701 break;
2702 }
2703 }
2704 #if TCPDEBUG
2705 if (so->so_options & SO_DEBUG)
2706 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2707 &tcp_savetcp, 0);
2708 #endif
2709
2710 /*
2711 * Return any desired output.
2712 */
2713 if (needoutput || (tp->t_flags & TF_ACKNOW))
2714 (void) tcp_output(tp);
2715 tcp_unlock(so, 1, 0);
2716 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2717 return;
2718
2719 dropafterack:
2720 /*
2721 * Generate an ACK dropping incoming segment if it occupies
2722 * sequence space, where the ACK reflects our state.
2723 *
2724 * We can now skip the test for the RST flag since all
2725 * paths to this code happen after packets containing
2726 * RST have been dropped.
2727 *
2728 * In the SYN-RECEIVED state, don't send an ACK unless the
2729 * segment we received passes the SYN-RECEIVED ACK test.
2730 * If it fails send a RST. This breaks the loop in the
2731 * "LAND" DoS attack, and also prevents an ACK storm
2732 * between two listening ports that have been sent forged
2733 * SYN segments, each with the source address of the other.
2734 */
2735 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2736 (SEQ_GT(tp->snd_una, th->th_ack) ||
2737 SEQ_GT(th->th_ack, tp->snd_max)) ) {
2738 rstreason = BANDLIM_RST_OPENPORT;
2739 goto dropwithreset;
2740 }
2741 #if TCPDEBUG
2742 if (so->so_options & SO_DEBUG)
2743 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2744 &tcp_savetcp, 0);
2745 #endif
2746 m_freem(m);
2747 tp->t_flags |= TF_ACKNOW;
2748 (void) tcp_output(tp);
2749 tcp_unlock(so, 1, 0);
2750 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2751 return;
2752 dropwithresetnosock:
2753 nosock = 1;
2754 dropwithreset:
2755 /*
2756 * Generate a RST, dropping incoming segment.
2757 * Make ACK acceptable to originator of segment.
2758 * Don't bother to respond if destination was broadcast/multicast.
2759 */
2760 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2761 goto drop;
2762 #if INET6
2763 if (isipv6) {
2764 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2765 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2766 goto drop;
2767 } else
2768 #endif /* INET6 */
2769 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2770 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2771 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2772 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2773 goto drop;
2774 /* IPv6 anycast check is done at tcp6_input() */
2775
2776 /*
2777 * Perform bandwidth limiting.
2778 */
2779 #if ICMP_BANDLIM
2780 if (badport_bandlim(rstreason) < 0)
2781 goto drop;
2782 #endif
2783
2784 #if TCPDEBUG
2785 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2786 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2787 &tcp_savetcp, 0);
2788 #endif
2789 if (thflags & TH_ACK)
2790 /* mtod() below is safe as long as hdr dropping is delayed */
2791 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
2792 TH_RST);
2793 else {
2794 if (thflags & TH_SYN)
2795 tlen++;
2796 /* mtod() below is safe as long as hdr dropping is delayed */
2797 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2798 (tcp_seq)0, TH_RST|TH_ACK);
2799 }
2800 /* destroy temporarily created socket */
2801 if (dropsocket) {
2802 (void) soabort(so);
2803 tcp_unlock(so, 1, 0);
2804 }
2805 else
2806 if ((inp != NULL) && (nosock == 0))
2807 tcp_unlock(so, 1, 0);
2808 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2809 return;
2810 dropnosock:
2811 nosock = 1;
2812 drop:
2813 /*
2814 * Drop space held by incoming segment and return.
2815 */
2816 #if TCPDEBUG
2817 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2818 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2819 &tcp_savetcp, 0);
2820 #endif
2821 m_freem(m);
2822 /* destroy temporarily created socket */
2823 if (dropsocket) {
2824 (void) soabort(so);
2825 tcp_unlock(so, 1, 0);
2826 }
2827 else
2828 if (nosock == 0)
2829 tcp_unlock(so, 1, 0);
2830 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2831 return;
2832 }
2833
2834 static void
2835 tcp_dooptions(tp, cp, cnt, th, to)
2836 struct tcpcb *tp;
2837 u_char *cp;
2838 int cnt;
2839 struct tcphdr *th;
2840 struct tcpopt *to;
2841 {
2842 u_short mss = 0;
2843 int opt, optlen;
2844
2845 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2846 opt = cp[0];
2847 if (opt == TCPOPT_EOL)
2848 break;
2849 if (opt == TCPOPT_NOP)
2850 optlen = 1;
2851 else {
2852 if (cnt < 2)
2853 break;
2854 optlen = cp[1];
2855 if (optlen < 2 || optlen > cnt)
2856 break;
2857 }
2858 switch (opt) {
2859
2860 default:
2861 continue;
2862
2863 case TCPOPT_MAXSEG:
2864 if (optlen != TCPOLEN_MAXSEG)
2865 continue;
2866 if (!(th->th_flags & TH_SYN))
2867 continue;
2868 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
2869 NTOHS(mss);
2870 break;
2871
2872 case TCPOPT_WINDOW:
2873 if (optlen != TCPOLEN_WINDOW)
2874 continue;
2875 if (!(th->th_flags & TH_SYN))
2876 continue;
2877 tp->t_flags |= TF_RCVD_SCALE;
2878 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2879 break;
2880
2881 case TCPOPT_TIMESTAMP:
2882 if (optlen != TCPOLEN_TIMESTAMP)
2883 continue;
2884 to->to_flag |= TOF_TS;
2885 bcopy((char *)cp + 2,
2886 (char *)&to->to_tsval, sizeof(to->to_tsval));
2887 NTOHL(to->to_tsval);
2888 bcopy((char *)cp + 6,
2889 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2890 NTOHL(to->to_tsecr);
2891
2892 /*
2893 * A timestamp received in a SYN makes
2894 * it ok to send timestamp requests and replies.
2895 */
2896 if (th->th_flags & TH_SYN) {
2897 tp->t_flags |= TF_RCVD_TSTMP;
2898 tp->ts_recent = to->to_tsval;
2899 tp->ts_recent_age = tcp_now;
2900 }
2901 break;
2902 case TCPOPT_CC:
2903 if (optlen != TCPOLEN_CC)
2904 continue;
2905 to->to_flag |= TOF_CC;
2906 bcopy((char *)cp + 2,
2907 (char *)&to->to_cc, sizeof(to->to_cc));
2908 NTOHL(to->to_cc);
2909 /*
2910 * A CC or CC.new option received in a SYN makes
2911 * it ok to send CC in subsequent segments.
2912 */
2913 if (th->th_flags & TH_SYN)
2914 tp->t_flags |= TF_RCVD_CC;
2915 break;
2916 case TCPOPT_CCNEW:
2917 if (optlen != TCPOLEN_CC)
2918 continue;
2919 if (!(th->th_flags & TH_SYN))
2920 continue;
2921 to->to_flag |= TOF_CCNEW;
2922 bcopy((char *)cp + 2,
2923 (char *)&to->to_cc, sizeof(to->to_cc));
2924 NTOHL(to->to_cc);
2925 /*
2926 * A CC or CC.new option received in a SYN makes
2927 * it ok to send CC in subsequent segments.
2928 */
2929 tp->t_flags |= TF_RCVD_CC;
2930 break;
2931 case TCPOPT_CCECHO:
2932 if (optlen != TCPOLEN_CC)
2933 continue;
2934 if (!(th->th_flags & TH_SYN))
2935 continue;
2936 to->to_flag |= TOF_CCECHO;
2937 bcopy((char *)cp + 2,
2938 (char *)&to->to_ccecho, sizeof(to->to_ccecho));
2939 NTOHL(to->to_ccecho);
2940 break;
2941 }
2942 }
2943 if (th->th_flags & TH_SYN)
2944 tcp_mss(tp, mss); /* sets t_maxseg */
2945 }
2946
2947 /*
2948 * Pull out of band byte out of a segment so
2949 * it doesn't appear in the user's data queue.
2950 * It is still reflected in the segment length for
2951 * sequencing purposes.
2952 */
2953 static void
2954 tcp_pulloutofband(so, th, m, off)
2955 struct socket *so;
2956 struct tcphdr *th;
2957 register struct mbuf *m;
2958 int off; /* delayed to be droped hdrlen */
2959 {
2960 int cnt = off + th->th_urp - 1;
2961
2962 while (cnt >= 0) {
2963 if (m->m_len > cnt) {
2964 char *cp = mtod(m, caddr_t) + cnt;
2965 struct tcpcb *tp = sototcpcb(so);
2966
2967 tp->t_iobc = *cp;
2968 tp->t_oobflags |= TCPOOB_HAVEDATA;
2969 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2970 m->m_len--;
2971 if (m->m_flags & M_PKTHDR)
2972 m->m_pkthdr.len--;
2973 return;
2974 }
2975 cnt -= m->m_len;
2976 m = m->m_next;
2977 if (m == 0)
2978 break;
2979 }
2980 panic("tcp_pulloutofband");
2981 }
2982
2983 /*
2984 * Collect new round-trip time estimate
2985 * and update averages and current timeout.
2986 */
2987 static void
2988 tcp_xmit_timer(tp, rtt)
2989 register struct tcpcb *tp;
2990 int rtt;
2991 {
2992 register int delta;
2993
2994 tcpstat.tcps_rttupdated++;
2995 tp->t_rttupdated++;
2996 if (tp->t_srtt != 0) {
2997 /*
2998 * srtt is stored as fixed point with 5 bits after the
2999 * binary point (i.e., scaled by 8). The following magic
3000 * is equivalent to the smoothing algorithm in rfc793 with
3001 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
3002 * point). Adjust rtt to origin 0.
3003 */
3004 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3005 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3006
3007 if ((tp->t_srtt += delta) <= 0)
3008 tp->t_srtt = 1;
3009
3010 /*
3011 * We accumulate a smoothed rtt variance (actually, a
3012 * smoothed mean difference), then set the retransmit
3013 * timer to smoothed rtt + 4 times the smoothed variance.
3014 * rttvar is stored as fixed point with 4 bits after the
3015 * binary point (scaled by 16). The following is
3016 * equivalent to rfc793 smoothing with an alpha of .75
3017 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
3018 * rfc793's wired-in beta.
3019 */
3020 if (delta < 0)
3021 delta = -delta;
3022 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3023 if ((tp->t_rttvar += delta) <= 0)
3024 tp->t_rttvar = 1;
3025 } else {
3026 /*
3027 * No rtt measurement yet - use the unsmoothed rtt.
3028 * Set the variance to half the rtt (so our first
3029 * retransmit happens at 3*rtt).
3030 */
3031 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3032 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3033 }
3034 tp->t_rtttime = 0;
3035 tp->t_rxtshift = 0;
3036
3037 /*
3038 * the retransmit should happen at rtt + 4 * rttvar.
3039 * Because of the way we do the smoothing, srtt and rttvar
3040 * will each average +1/2 tick of bias. When we compute
3041 * the retransmit timer, we want 1/2 tick of rounding and
3042 * 1 extra tick because of +-1/2 tick uncertainty in the
3043 * firing of the timer. The bias will give us exactly the
3044 * 1.5 tick we need. But, because the bias is
3045 * statistical, we have to test that we don't drop below
3046 * the minimum feasible timer (which is 2 ticks).
3047 */
3048 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3049 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
3050
3051 /*
3052 * We received an ack for a packet that wasn't retransmitted;
3053 * it is probably safe to discard any error indications we've
3054 * received recently. This isn't quite right, but close enough
3055 * for now (a route might have failed after we sent a segment,
3056 * and the return path might not be symmetrical).
3057 */
3058 tp->t_softerror = 0;
3059 }
3060
/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE that this routine is only called when we process an incoming
 * segment, for outgoing segments only tcp_mssopt is called.
 *
 * In case of T/TCP, we call this routine during implicit connection
 * setup as well (offer = -1), to initialize maxseg from the cached
 * MSS of our peer.
 */
void
tcp_mss(tp, offer)
	struct tcpcb *tp;
	int offer;	/* peer's MSS option: 0 = none on SYN, -1 = no SYN yet */
{
	register struct rtentry *rt;
	struct ifnet *ifp;
	register int rtt, mss;
	u_long bufsize;
	struct inpcb *inp;
	struct socket *so;
	struct rmxp_tao *taop;
	int origoffer = offer;	/* raw offer, for the T/TCP option checks below */
#if INET6
	int isipv6;
	int min_protoh;		/* fixed IP + TCP header overhead, no options */
#endif

	inp = tp->t_inpcb;
#if INET6
	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
			    : sizeof (struct tcpiphdr);
#else
#define min_protoh	(sizeof (struct tcpiphdr))
#endif
#if INET6
	if (isipv6)
		rt = tcp_rtlookup6(inp);
	else
#endif /* INET6 */
	rt = tcp_rtlookup(inp);
	if (rt == NULL) {
		/* No route: fall back to the conservative protocol default. */
		tp->t_maxopd = tp->t_maxseg =
#if INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;
		return;
	}
	ifp = rt->rt_ifp;
	/*
	 * Slower link window correction:
	 * If a value is specified for slowlink_wsize use it for PPP links
	 * believed to be on a serial modem (speed <128Kbps).  Excludes 9600bps
	 * as it is the default value advertised by pseudo-devices over ppp.
	 */
	if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
	    ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
		tp->t_flags |= TF_SLOWLINK;
	}
	so = inp->inp_socket;

	taop = rmx_taop(rt->rt_rmx);
	/*
	 * Offer == -1 means that we didn't receive SYN yet,
	 * use cached value in that case;
	 */
	if (offer == -1)
		offer = taop->tao_mssopt;
	/*
	 * Offer == 0 means that there was no MSS on the SYN segment,
	 * in this case we use tcp_mssdflt.
	 */
	if (offer == 0)
		offer =
#if INET6
			isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
			tcp_mssdflt;
	else {
		/*
		 * Prevent DoS attack with too small MSS.  Round up
		 * to at least minmss.
		 */
		offer = max(offer, tcp_minmss);
		/*
		 * Sanity check: make sure that maxopd will be large
		 * enough to allow some data on segments even if all
		 * the option space is used (40 bytes).  Otherwise
		 * funny things may happen in tcp_output.
		 */
		offer = max(offer, 64);
	}
	/* Cache the sanitized offer for future connections to this peer. */
	taop->tao_mssopt = offer;

	/*
	 * While we're here, check if there's an initial rtt
	 * or rttvar.  Convert from the route-table units
	 * to scaled multiples of the slow timeout timer.
	 */
	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
		/*
		 * XXX the lock bit for RTT indicates that the value
		 * is also a minimum value; this is subject to time.
		 */
		if (rt->rt_rmx.rmx_locks & RTV_RTT)
			tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ);
		tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
		tcpstat.tcps_usedrtt++;
		if (rt->rt_rmx.rmx_rttvar) {
			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
			tcpstat.tcps_usedrttvar++;
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		/* Seed the retransmit timer from the cached estimates. */
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}
	/*
	 * if there's an mtu associated with the route, use it
	 * else, use the link mtu.
	 */
	if (rt->rt_rmx.rmx_mtu)
		mss = rt->rt_rmx.rmx_mtu - min_protoh;
	else
	{
		mss =
#if INET6
			(isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
#endif
			 ifp->if_mtu
#if INET6
			 )
#endif
			- min_protoh;
		/*
		 * For a non-local peer the link MTU says nothing about
		 * the path, so cap the guess at the protocol default.
		 */
#if INET6
		if (isipv6) {
			if (!in6_localaddr(&inp->in6p_faddr))
				mss = min(mss, tcp_v6mssdflt);
		} else
#endif /* INET6 */
		if (!in_localaddr(inp->inp_faddr))
			mss = min(mss, tcp_mssdflt);
	}
	/* Never exceed what the peer offered to receive. */
	mss = min(mss, offer);
	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/*
	 * In case of T/TCP, origoffer==-1 indicates, that no segments
	 * were received yet.  In this case we just guess, otherwise
	 * we do the same as before T/TCP.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (origoffer == -1 ||
	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
		mss -= TCPOLEN_TSTAMP_APPA;
	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
	    (origoffer == -1 ||
	     (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
		mss -= TCPOLEN_CC_APPA;

	/*
	 * If there's a pipesize (ie loopback), change the socket
	 * buffer to that size only if it's bigger than the current
	 * sockbuf size.  Make the socket buffers an integral
	 * number of mss units; if the mss is larger than
	 * the socket buffer, decrease the mss.
	 */
#if RTV_SPIPE
	bufsize = rt->rt_rmx.rmx_sendpipe;
	if (bufsize < so->so_snd.sb_hiwat)
#endif
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_snd, bufsize);
	}
	tp->t_maxseg = mss;

#if RTV_RPIPE
	bufsize = rt->rt_rmx.rmx_recvpipe;
	if (bufsize < so->so_rcv.sb_hiwat)
#endif
		bufsize = so->so_rcv.sb_hiwat;
	/* Receive side: only grow; an mss larger than the rcv buffer is kept. */
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_rcv, bufsize);
	}

	/*
	 * Set the slow-start flight size depending on whether this
	 * is a local network or not.
	 */
	if (
#if INET6
	    (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
	    (!isipv6 &&
#endif
	     in_localaddr(inp->inp_faddr)
#if INET6
	    )
#endif
	    )
		tp->snd_cwnd = mss * ss_fltsz_local;
	else
		tp->snd_cwnd = mss * ss_fltsz;

	if (rt->rt_rmx.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
		tcpstat.tcps_usedssthresh++;
	}
}
3315
3316 /*
3317 * Determine the MSS option to send on an outgoing SYN.
3318 */
3319 int
3320 tcp_mssopt(tp)
3321 struct tcpcb *tp;
3322 {
3323 struct rtentry *rt;
3324 #if INET6
3325 int isipv6;
3326 int min_protoh;
3327 #endif
3328
3329 #if INET6
3330 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3331 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
3332 : sizeof (struct tcpiphdr);
3333 #else
3334 #define min_protoh (sizeof (struct tcpiphdr))
3335 #endif
3336 #if INET6
3337 if (isipv6)
3338 rt = tcp_rtlookup6(tp->t_inpcb);
3339 else
3340 #endif /* INET6 */
3341 rt = tcp_rtlookup(tp->t_inpcb);
3342 if (rt == NULL)
3343 return
3344 #if INET6
3345 isipv6 ? tcp_v6mssdflt :
3346 #endif /* INET6 */
3347 tcp_mssdflt;
3348 /*
3349 * Slower link window correction:
3350 * If a value is specificied for slowlink_wsize use it for PPP links
3351 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
3352 * it is the default value adversized by pseudo-devices over ppp.
3353 */
3354 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
3355 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
3356 tp->t_flags |= TF_SLOWLINK;
3357 }
3358
3359 return rt->rt_ifp->if_mtu - min_protoh;
3360 }
3361
3362
3363 /*
3364 * Checks for partial ack. If partial ack arrives, force the retransmission
3365 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3366 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to
3367 * be started again. If the ack advances at least to tp->snd_recover, return 0.
3368 */
3369 static int
3370 tcp_newreno(tp, th)
3371 struct tcpcb *tp;
3372 struct tcphdr *th;
3373 {
3374 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
3375 tcp_seq onxt = tp->snd_nxt;
3376 u_long ocwnd = tp->snd_cwnd;
3377 #ifdef __APPLE__
3378 tp->t_timer[TCPT_REXMT] = 0;
3379 #else
3380 callout_stop(tp->tt_rexmt);
3381 #endif
3382 tp->t_rtttime = 0;
3383 tp->snd_nxt = th->th_ack;
3384 /*
3385 * Set snd_cwnd to one segment beyond acknowledged offset
3386 * (tp->snd_una has not yet been updated when this function
3387 * is called)
3388 */
3389 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3390 tp->t_flags |= TF_ACKNOW;
3391 (void) tcp_output(tp);
3392 tp->snd_cwnd = ocwnd;
3393 if (SEQ_GT(onxt, tp->snd_nxt))
3394 tp->snd_nxt = onxt;
3395 /*
3396 * Partial window deflation. Relies on fact that tp->snd_una
3397 * not updated yet.
3398 */
3399 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
3400 return (1);
3401 }
3402 return (0);
3403 }
3404
/*
 * Drop a random TCP connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * The listening TCP socket "head" must be locked on entry and is
 * locked again before returning (it is temporarily released while
 * the victim socket's own lock is held).
 */
static int
tcpdropdropablreq(struct socket *head)
{
	struct socket *so;
	unsigned int i, j, qlen;
	static int rnd;				/* state of the in-line LCG below */
	static struct timeval old_runtime;	/* last time the rate was folded */
	static unsigned int cur_cnt, old_cnt;	/* drop-rate bookkeeping */
	struct timeval tv;
	struct inpcb *inp = NULL;
	struct tcpcb *tp;

	/* Once per second, convert the running count into a per-second rate. */
	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return 0;

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		/*
		 * Under sustained pressure, choose the victim at random
		 * instead of always taking the queue head.  The linear
		 * congruential generator yields 16 bits; scaling by
		 * (qlen + 1) gives an index in [0, qlen], so with
		 * probability 1/(qlen+1) we walk off the end and drop
		 * nothing (the caller then drops the new request).
		 */
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so)
			so = TAILQ_NEXT(so, so_list);
	}
	/* Find a connection that is not already closing */
	while (so) {
		inp = (struct inpcb *)so->so_pcb;

		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING)
			break;

		so = TAILQ_NEXT(so, so_list);
	}
	if (!so)
		return 0;

	/* Unhook the victim from the incomplete queue while head is locked. */
	head->so_incqlen--;
	head->so_qlen--;
	TAILQ_REMOVE(&head->so_incomp, so, so_list);
	tcp_unlock(head, 0, 0);

	/* Now take the victim socket's own lock (head is unlocked). */
	tcp_lock(so, 1, 0);

	/* Re-check: the PCB may have started dying while head was unlocked. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		tcp_unlock(so, 1, 0);
		return 0;
	}

	so->so_head = NULL;
	so->so_usecount--;	/* No more held by so_head */

	/*
	 * We do not want to lose track of the PCB right away in case we receive
	 * more segments from the peer
	 */
	tp = sototcpcb(so);
	tp->t_flags |= TF_LQ_OVERFLOW;	/* mark: dropped due to queue overflow */
	tp->t_state = TCPS_CLOSED;
	(void) tcp_output(tp);
	tcpstat.tcps_drops++;
	soisdisconnected(so);
	tcp_canceltimers(tp);
	/* Keep the PCB around briefly (time-wait list) for late segments. */
	add_to_time_wait(tp);

	tcp_unlock(so, 1, 0);
	tcp_lock(head, 0, 0);	/* caller expects head locked on return */

	return 1;

}
3492
3493