]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/tcp_input.c
xnu-1699.22.81.tar.gz
[apple/xnu.git] / bsd / netinet / tcp_input.c
1 /*
2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/sysctl.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/proc.h> /* for proc0 declaration */
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <sys/syslog.h>
81
82 #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
83
84 #include <machine/endian.h>
85
86 #include <net/if.h>
87 #include <net/if_types.h>
88 #include <net/route.h>
89 #include <net/ntstat.h>
90
91 #include <netinet/in.h>
92 #include <netinet/in_systm.h>
93 #include <netinet/ip.h>
94 #include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
95 #include <netinet/in_var.h>
96 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
97 #include <netinet/in_pcb.h>
98 #include <netinet/ip_var.h>
99 #include <mach/sdt.h>
100 #if INET6
101 #include <netinet/ip6.h>
102 #include <netinet/icmp6.h>
103 #include <netinet6/nd6.h>
104 #include <netinet6/ip6_var.h>
105 #include <netinet6/in6_pcb.h>
106 #endif
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_fsm.h>
109 #include <netinet/tcp_seq.h>
110 #include <netinet/tcp_timer.h>
111 #include <netinet/tcp_var.h>
112 #include <netinet/tcp_cc.h>
113 #include <kern/zalloc.h>
114 #if INET6
115 #include <netinet6/tcp6_var.h>
116 #endif
117 #include <netinet/tcpip.h>
118 #if TCPDEBUG
119 #include <netinet/tcp_debug.h>
120 u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
121 struct tcphdr tcp_savetcp;
122 #endif /* TCPDEBUG */
123
124 #if IPSEC
125 #include <netinet6/ipsec.h>
126 #if INET6
127 #include <netinet6/ipsec6.h>
128 #endif
129 #include <netkey/key.h>
130 #endif /*IPSEC*/
131
132 #if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
133 #include <security/mac_framework.h>
134 #endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
135
136 #include <sys/kdebug.h>
137
138 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
139 #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
140 #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
141 #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
142
143 static int tcprexmtthresh = 2;
144 tcp_cc tcp_ccgen;
145
146 #if IPSEC
147 extern int ipsec_bypass;
148 #endif
149
150 struct tcpstat tcpstat;
151
152 static int log_in_vain = 0;
153 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED,
154 &log_in_vain, 0, "Log all incoming TCP connections");
155
156 static int blackhole = 0;
157 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED,
158 &blackhole, 0, "Do not send RST when dropping refused connections");
159
160 int tcp_delack_enabled = 3;
161 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED,
162 &tcp_delack_enabled, 0,
163 "Delay ACK to try and piggyback it onto a data packet");
164
165 int tcp_lq_overflow = 1;
166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED,
167 &tcp_lq_overflow, 0,
168 "Listen Queue Overflow");
169
170 int tcp_recv_bg = 0;
171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
172 &tcp_recv_bg, 0,
173 "Receive background");
174
175 #if TCP_DROP_SYNFIN
176 static int drop_synfin = 1;
177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED,
178 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
179 #endif
180
181 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
182 "TCP Segment Reassembly Queue");
183
184 __private_extern__ int tcp_reass_maxseg = 0;
185 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW | CTLFLAG_LOCKED,
186 &tcp_reass_maxseg, 0,
187 "Global maximum number of TCP Segments in Reassembly Queue");
188
189 __private_extern__ int tcp_reass_qsize = 0;
190 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD | CTLFLAG_LOCKED,
191 &tcp_reass_qsize, 0,
192 "Global number of TCP Segments currently in Reassembly Queue");
193
194 static int tcp_reass_overflows = 0;
195 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED,
196 &tcp_reass_overflows, 0,
197 "Global number of TCP Segment Reassembly Queue Overflows");
198
199
200 __private_extern__ int slowlink_wsize = 8192;
201 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
202 &slowlink_wsize, 0, "Maximum advertised window size for slowlink");
203
204 int maxseg_unacked = 8;
205 SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED,
206 &maxseg_unacked, 0, "Maximum number of outstanding segments left unacked");
207
208 int tcp_do_rfc3465 = 1;
209 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
210 &tcp_do_rfc3465, 0, "");
211
212 int tcp_do_rfc3465_lim2 = 1;
213 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED,
214 &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS");
215
216 int rtt_samples_per_slot = 20;
217 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED,
218 &rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history");
219
220 int tcp_allowed_iaj = ALLOWED_IAJ;
221 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED,
222 &tcp_allowed_iaj, 0, "Allowed inter-packet arrival jiter");
223
224 int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
225 SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
226 &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ");
227
228 #if CONFIG_IFEF_NOWINDOWSCALE
229 int tcp_obey_ifef_nowindowscale = 0;
230 SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED,
231 &tcp_obey_ifef_nowindowscale, 0, "");
232 #endif
233
234 extern int tcp_TCPTV_MIN;
235 extern int tcp_acc_iaj_high;
236 extern int tcp_acc_iaj_react_limit;
237 extern struct zone *tcp_reass_zone;
238
239
240 u_int32_t tcp_now;
241 struct timeval tcp_uptime; /* uptime when tcp_now was last updated */
242 lck_spin_t *tcp_uptime_lock; /* Used to sychronize updates to tcp_now */
243
244 struct inpcbhead tcb;
245 #define tcb6 tcb /* for KAME src sync over BSD*'s */
246 struct inpcbinfo tcbinfo;
247
248 static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
249 struct tcpopt *, unsigned int);
250 static void tcp_pulloutofband(struct socket *,
251 struct tcphdr *, struct mbuf *, int);
252 static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
253 struct mbuf *);
254 static void tcp_xmit_timer(struct tcpcb *, int);
255 static inline unsigned int tcp_maxmtu(struct rtentry *);
256 static inline int tcp_stretch_ack_enable(struct tcpcb *tp);
257
258 #if TRAFFIC_MGT
259 static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen, int reset_size);
260 void compute_iaj(struct tcpcb *tp);
261 static inline void clear_iaj_state(struct tcpcb *tp);
262 #endif /* TRAFFIC_MGT */
263
264 #if INET6
265 static inline unsigned int tcp_maxmtu6(struct rtentry *);
266 #endif
267
268 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
269 #if INET6
270 #define ND6_HINT(tp) \
271 do { \
272 if ((tp) && (tp)->t_inpcb && \
273 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
274 (tp)->t_inpcb->in6p_route.ro_rt) \
275 nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
276 } while (0)
277 #else
278 #define ND6_HINT(tp)
279 #endif
280
281 extern void add_to_time_wait(struct tcpcb *, uint32_t delay);
282 extern void postevent(struct socket *, struct sockbuf *, int);
283
284 extern void ipfwsyslog( int level, const char *format,...);
285 extern int ChkAddressOK( __uint32_t dstaddr, __uint32_t srcaddr );
286 extern int fw_verbose;
287 __private_extern__ int tcp_sockthreshold;
288 __private_extern__ int tcp_win_scale;
289
290 #if IPFIREWALL
291 #define log_in_vain_log( a ) { \
292 if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \
293 ipfwsyslog a ; \
294 } \
295 else log a ; \
296 }
297 #else
298 #define log_in_vain_log( a ) { log a; }
299 #endif
300
301 int tcp_rcvunackwin = TCPTV_UNACKWIN;
302 int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
303 int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
304
305 #define DELAY_ACK(tp, th) (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
306
307 static int tcp_dropdropablreq(struct socket *head);
308 static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
309
310 static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
311 uint32_t get_base_rtt(struct tcpcb *tp);
312 void tcp_set_background_cc(struct socket *so);
313 void tcp_set_foreground_cc(struct socket *so);
314 static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
315
316 #if TRAFFIC_MGT
317 void
318 reset_acc_iaj(struct tcpcb *tp)
319 {
320 tp->acc_iaj = 0;
321 tp->iaj_rwintop = 0;
322 clear_iaj_state(tp);
323 }
324
325 static inline void
326 update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
327 {
328 if (rst_size > 0)
329 tp->iaj_size = 0;
330 if (tp->iaj_size == 0 || size >= tp->iaj_size) {
331 tp->iaj_size = size;
332 tp->iaj_rcv_ts = tcp_now;
333 tp->iaj_small_pkt = 0;
334 }
335 }
336
337 static inline void
338 clear_iaj_state(struct tcpcb *tp)
339 {
340 tp->iaj_rcv_ts = 0;
341 }
342
343 /* For every 32 bit unsigned integer(v), this function will find the
344 * largest integer n such that (n*n <= v). This takes at most 16 iterations
345 * irrespective of the value of v and does not involve multiplications.
346 */
347 static inline int
348 isqrt(unsigned int val) {
349 unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
350 unsigned int temp, g=0, b=0x8000, bshft=15;
351 if ( val <= 100) {
352 for (g = 0; g <= 10; ++g) {
353 if (sqrt_cache[g] > val) {
354 g--;
355 break;
356 } else if (sqrt_cache[g] == val) {
357 break;
358 }
359 }
360 } else {
361 do {
362 temp = (((g << 1) + b) << (bshft--));
363 if (val >= temp) {
364 g += b;
365 val -= temp;
366 }
367 b >>= 1;
368 } while ( b > 0 && val > 0);
369 }
370 return(g);
371 }
372
373 void
374 compute_iaj(struct tcpcb *tp)
375 {
376 /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, throttle the
377 * receive window to a minimum of MIN_IAJ_WIN packets
378 */
379 #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
380
381 uint32_t allowed_iaj, acc_iaj = 0;
382 uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
383
384 uint32_t mean, temp;
385 int32_t cur_iaj_dev;
386 cur_iaj_dev = (cur_iaj - tp->avg_iaj);
387
388 /* Allow a jitter of "allowed_iaj" milliseconds. Some connections may have a
389 * constant jitter more than that. We detect this by using
390 * standard deviation.
391 */
392 allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
393 if (allowed_iaj < tcp_allowed_iaj)
394 allowed_iaj = tcp_allowed_iaj;
395
396 /* Initially when the connection starts, the senders congestion window
397 * is small. During this period we avoid throttling a connection because
398 * we do not have a good starting point for allowed_iaj. IAJ_IGNORE_PKTCNT
399 * is used to quietly gloss over the first few packets.
400 */
401 if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
402 if ( cur_iaj <= allowed_iaj ) {
403 if (tp->acc_iaj >= 2)
404 acc_iaj = tp->acc_iaj - 2;
405 else
406 acc_iaj = 0;
407 } else {
408 acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
409 }
410
411 if (acc_iaj > MAX_ACC_IAJ)
412 acc_iaj = MAX_ACC_IAJ;
413 tp->acc_iaj = acc_iaj;
414 }
415
416 /* Compute weighted average where the history has a weight of
417 * 15 out of 16 and the current value has a weight of 1 out of 16.
418 * This will make the short-term measurements have more weight.
419 */
420 tp->avg_iaj = (((tp->avg_iaj << 4) - tp->avg_iaj) + cur_iaj) >> 4;
421
422 /* Compute Root-mean-square of deviation where mean is a weighted
423 * average as described above
424 */
425 temp = tp->std_dev_iaj * tp->std_dev_iaj;
426 mean = (((temp << 4) - temp) + (cur_iaj_dev * cur_iaj_dev)) >> 4;
427
428 tp->std_dev_iaj = isqrt(mean);
429
430 DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, uint32_t, allowed_iaj);
431
432 return;
433 }
434 #endif /* TRAFFIC_MGT */
435
436 static int
437 tcp_reass(tp, th, tlenp, m)
438 register struct tcpcb *tp;
439 register struct tcphdr *th;
440 int *tlenp;
441 struct mbuf *m;
442 {
443 struct tseg_qent *q;
444 struct tseg_qent *p = NULL;
445 struct tseg_qent *nq;
446 struct tseg_qent *te = NULL;
447 struct socket *so = tp->t_inpcb->inp_socket;
448 int flags;
449 int dowakeup = 0;
450
451 /*
452 * Call with th==0 after become established to
453 * force pre-ESTABLISHED data up to user socket.
454 */
455 if (th == NULL)
456 goto present;
457
458 /* If the reassembly queue already has entries or if we are going to add
459 * a new one, then the connection has reached a loss state.
460 * Reset the stretch-ack algorithm at this point.
461 */
462 if ((tp->t_flags & TF_STRETCHACK) != 0)
463 tcp_reset_stretch_ack(tp);
464
465 /* When the connection reaches a loss state, we need to send more acks
466 * for a period of time so that the sender's congestion window will
467 * open. Wait until we see some packets on the connection before
468 * stretching acks again.
469 */
470 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
471 tp->rcv_waitforss = 0;
472
473
474 #if TRAFFIC_MGT
475 if (tp->acc_iaj > 0)
476 reset_acc_iaj(tp);
477 #endif /* TRAFFIC_MGT */
478
479 /*
480 * Limit the number of segments in the reassembly queue to prevent
481 * holding on to too many segments (and thus running out of mbufs).
482 * Make sure to let the missing segment through which caused this
483 * queue. Always keep one global queue entry spare to be able to
484 * process the missing segment.
485 */
486 if (th->th_seq != tp->rcv_nxt &&
487 tcp_reass_qsize + 1 >= tcp_reass_maxseg) {
488 tcp_reass_overflows++;
489 tcpstat.tcps_rcvmemdrop++;
490 m_freem(m);
491 *tlenp = 0;
492 return (0);
493 }
494
495 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
496 te = (struct tseg_qent *) zalloc_noblock(tcp_reass_zone);
497 if (te == NULL) {
498 tcpstat.tcps_rcvmemdrop++;
499 m_freem(m);
500 return (0);
501 }
502 tcp_reass_qsize++;
503
504 /*
505 * Find a segment which begins after this one does.
506 */
507 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
508 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
509 break;
510 p = q;
511 }
512
513 /*
514 * If there is a preceding segment, it may provide some of
515 * our data already. If so, drop the data from the incoming
516 * segment. If it provides all of our data, drop us.
517 */
518 if (p != NULL) {
519 register int i;
520 /* conversion to int (in i) handles seq wraparound */
521 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
522 if (i > 0) {
523 if (i >= *tlenp) {
524 tcpstat.tcps_rcvduppack++;
525 tcpstat.tcps_rcvdupbyte += *tlenp;
526 if (nstat_collect) {
527 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_DUPLICATE);
528 locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1);
529 locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp);
530 tp->t_stat.rxduplicatebytes += *tlenp;
531 }
532 m_freem(m);
533 zfree(tcp_reass_zone, te);
534 tcp_reass_qsize--;
535 /*
536 * Try to present any queued data
537 * at the left window edge to the user.
538 * This is needed after the 3-WHS
539 * completes.
540 */
541 goto present; /* ??? */
542 }
543 m_adj(m, i);
544 *tlenp -= i;
545 th->th_seq += i;
546 }
547 }
548 tcpstat.tcps_rcvoopack++;
549 tcpstat.tcps_rcvoobyte += *tlenp;
550 if (nstat_collect) {
551 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_OUT_OF_ORDER);
552 locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1);
553 locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp);
554 tp->t_stat.rxoutoforderbytes += *tlenp;
555 }
556
557 /*
558 * While we overlap succeeding segments trim them or,
559 * if they are completely covered, dequeue them.
560 */
561 while (q) {
562 register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
563 if (i <= 0)
564 break;
565 if (i < q->tqe_len) {
566 q->tqe_th->th_seq += i;
567 q->tqe_len -= i;
568 m_adj(q->tqe_m, i);
569 break;
570 }
571
572 nq = LIST_NEXT(q, tqe_q);
573 LIST_REMOVE(q, tqe_q);
574 m_freem(q->tqe_m);
575 zfree(tcp_reass_zone, q);
576 tcp_reass_qsize--;
577 q = nq;
578 }
579
580 /* Insert the new segment queue entry into place. */
581 te->tqe_m = m;
582 te->tqe_th = th;
583 te->tqe_len = *tlenp;
584
585 if (p == NULL) {
586 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
587 } else {
588 LIST_INSERT_AFTER(p, te, tqe_q);
589 }
590
591 present:
592 /*
593 * Present data to user, advancing rcv_nxt through
594 * completed sequence space.
595 */
596 if (!TCPS_HAVEESTABLISHED(tp->t_state))
597 return (0);
598 q = LIST_FIRST(&tp->t_segq);
599 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
600 return (0);
601 do {
602 tp->rcv_nxt += q->tqe_len;
603 flags = q->tqe_th->th_flags & TH_FIN;
604 nq = LIST_NEXT(q, tqe_q);
605 LIST_REMOVE(q, tqe_q);
606 if (so->so_state & SS_CANTRCVMORE)
607 m_freem(q->tqe_m);
608 else {
609 so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
610 if (sbappendstream(&so->so_rcv, q->tqe_m))
611 dowakeup = 1;
612 }
613 zfree(tcp_reass_zone, q);
614 tcp_reass_qsize--;
615 q = nq;
616 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
617 ND6_HINT(tp);
618
619 #if INET6
620 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
621
622 KERNEL_DEBUG(DBG_LAYER_BEG,
623 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
624 (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
625 (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)),
626 0,0,0);
627 }
628 else
629 #endif
630 {
631 KERNEL_DEBUG(DBG_LAYER_BEG,
632 ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport),
633 (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) |
634 (tp->t_inpcb->inp_faddr.s_addr & 0xffff)),
635 0,0,0);
636 }
637 if (dowakeup)
638 sorwakeup(so); /* done with socket lock held */
639 return (flags);
640
641 }
642
643 /*
644 * Reduce congestion window.
645 */
646 static void
647 tcp_reduce_congestion_window(
648 struct tcpcb *tp, struct tcphdr *th)
649 {
650 /*
651 * If the current tcp cc module has
652 * defined a hook for tasks to run
653 * before entering FR, call it
654 */
655 if (CC_ALGO(tp)->pre_fr != NULL)
656 CC_ALGO(tp)->pre_fr(tp, th);
657 ENTER_FASTRECOVERY(tp);
658 tp->snd_recover = tp->snd_max;
659 tp->t_timer[TCPT_REXMT] = 0;
660 tp->t_rtttime = 0;
661 tp->ecn_flags |= TE_SENDCWR;
662 tp->snd_cwnd = tp->snd_ssthresh +
663 tp->t_maxseg * tcprexmtthresh;
664 }
665
666
667 /*
668 * TCP input routine, follows pages 65-76 of the
669 * protocol specification dated September, 1981 very closely.
670 */
671 #if INET6
672 int
673 tcp6_input(struct mbuf **mp, int *offp, int proto)
674 {
675 #pragma unused(proto)
676 register struct mbuf *m = *mp;
677 struct in6_ifaddr *ia6;
678
679 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
680
681 /*
682 * draft-itojun-ipv6-tcp-to-anycast
683 * better place to put this in?
684 */
685 ia6 = ip6_getdstifaddr(m);
686 if (ia6 != NULL) {
687 IFA_LOCK_SPIN(&ia6->ia_ifa);
688 if (ia6->ia6_flags & IN6_IFF_ANYCAST) {
689 struct ip6_hdr *ip6;
690
691 IFA_UNLOCK(&ia6->ia_ifa);
692 IFA_REMREF(&ia6->ia_ifa);
693 ip6 = mtod(m, struct ip6_hdr *);
694 icmp6_error(m, ICMP6_DST_UNREACH,
695 ICMP6_DST_UNREACH_ADDR,
696 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
697 return (IPPROTO_DONE);
698 }
699 IFA_UNLOCK(&ia6->ia_ifa);
700 IFA_REMREF(&ia6->ia_ifa);
701 }
702
703 tcp_input(m, *offp);
704 return (IPPROTO_DONE);
705 }
706 #endif
707
708 /* A receiver will evaluate the flow of packets on a connection
709 * to see if it can reduce ack traffic. The receiver will start
710 * stretching acks if all of the following conditions are met:
711 * 1. tcp_delack_enabled is set to 3
712 * 2. If the bytes received in the last 100ms is greater than a threshold
713 * defined by maxseg_unacked
714 * 3. If the connection has not been idle for tcp_maxrcvidle period.
715 * 4. If the connection has seen enough packets to let the slow-start
716 * finish after connection establishment or after some packet loss.
717 *
718 * The receiver will stop stretching acks if there is congestion/reordering
719 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
720 * timer fires while stretching acks, it means that the packet flow has gone
721 * below the threshold defined by maxseg_unacked and the receiver will stop
722 * stretching acks. The receiver gets no indication when slow-start is completed
723 * or when the connection reaches an idle state. That is why we use
724 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
725 * state.
726 */
727 static inline int
728 tcp_stretch_ack_enable(struct tcpcb *tp) {
729 if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
730 TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
731 (((tp->t_flagsext & TF_RCVUNACK_WAITSS) == 0) ||
732 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
733 return(1);
734 }
735 return(0);
736 }
737
738 /* Reset the state related to stretch-ack algorithm. This will make
739 * the receiver generate an ack every other packet. The receiver
740 * will start re-evaluating the rate at which packets come to decide
741 * if it can benefit by lowering the ack traffic.
742 */
743 void
744 tcp_reset_stretch_ack(struct tcpcb *tp)
745 {
746 tp->t_flags &= ~(TF_STRETCHACK);
747 tp->rcv_by_unackwin = 0;
748 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
749 }
750
751 void
752 tcp_input(m, off0)
753 struct mbuf *m;
754 int off0;
755 {
756 register struct tcphdr *th;
757 register struct ip *ip = NULL;
758 register struct ipovly *ipov;
759 register struct inpcb *inp;
760 u_char *optp = NULL;
761 int optlen = 0;
762 int len, tlen, off;
763 int drop_hdrlen;
764 register struct tcpcb *tp = 0;
765 register int thflags;
766 struct socket *so = 0;
767 int todrop, acked, ourfinisacked, needoutput = 0;
768 struct in_addr laddr;
769 #if INET6
770 struct in6_addr laddr6;
771 #endif
772 int dropsocket = 0;
773 int iss = 0;
774 int nosock = 0;
775 u_int32_t tiwin;
776 struct tcpopt to; /* options in this segment */
777 struct sockaddr_in *next_hop = NULL;
778 #if TCPDEBUG
779 short ostate = 0;
780 #endif
781 struct m_tag *fwd_tag;
782 u_char ip_ecn = IPTOS_ECN_NOTECT;
783 unsigned int ifscope, nocell = 0;
784 uint8_t isconnected, isdisconnected;
785
786 /*
787 * Record the interface where this segment arrived on; this does not
788 * affect normal data output (for non-detached TCP) as it provides a
789 * hint about which route and interface to use for sending in the
790 * absence of a PCB, when scoped routing (and thus source interface
791 * selection) are enabled.
792 */
793 if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL)
794 ifscope = m->m_pkthdr.rcvif->if_index;
795 else
796 ifscope = IFSCOPE_NONE;
797
798 /* Since this is an entry point for input processing of tcp packets, we
799 * can update the tcp clock here.
800 */
801 calculate_tcp_clock();
802
803 /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
804 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
805 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
806 KERNEL_TAG_TYPE_IPFORWARD, NULL);
807 } else {
808 fwd_tag = NULL;
809 }
810 if (fwd_tag != NULL) {
811 struct ip_fwd_tag *ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
812
813 next_hop = ipfwd_tag->next_hop;
814 m_tag_delete(m, fwd_tag);
815 }
816
817 #if INET6
818 struct ip6_hdr *ip6 = NULL;
819 int isipv6;
820 #endif /* INET6 */
821 int rstreason; /* For badport_bandlim accounting purposes */
822 struct proc *proc0=current_proc();
823
824 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
825
826 #if INET6
827 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
828 #endif
829 bzero((char *)&to, sizeof(to));
830
831 tcpstat.tcps_rcvtotal++;
832
833
834
835 #if INET6
836 if (isipv6) {
837 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
838 ip6 = mtod(m, struct ip6_hdr *);
839 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
840 th = (struct tcphdr *)((caddr_t)ip6 + off0);
841
842 if ((apple_hwcksum_rx != 0) && (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
843 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
844 th->th_sum = m->m_pkthdr.csum_data;
845 else
846 th->th_sum = in6_cksum_phdr(&ip6->ip6_src,
847 &ip6->ip6_dst, htonl(sizeof(struct tcphdr)),
848 htonl(IPPROTO_TCP));
849
850 th->th_sum ^= 0xffff;
851 if (th->th_sum) {
852 tcpstat.tcps_rcvbadsum++;
853 goto dropnosock;
854 }
855 }
856 else {
857 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
858 tcpstat.tcps_rcvbadsum++;
859 goto dropnosock;
860 }
861 }
862
863 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
864 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
865 th->th_seq, th->th_ack, th->th_win);
866 /*
867 * Be proactive about unspecified IPv6 address in source.
868 * As we use all-zero to indicate unbounded/unconnected pcb,
869 * unspecified IPv6 address can be used to confuse us.
870 *
871 * Note that packets with unspecified IPv6 destination is
872 * already dropped in ip6_input.
873 */
874 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
875 /* XXX stat */
876 goto dropnosock;
877 }
878 DTRACE_TCP5(receive, sruct mbuf *, m, struct inpcb *, NULL,
879 struct ip6_hdr *, ip6, struct tcpcb *, NULL,
880 struct tcphdr *, th);
881
882 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
883 } else
884 #endif /* INET6 */
885 {
886 /*
887 * Get IP and TCP header together in first mbuf.
888 * Note: IP leaves IP header in first mbuf.
889 */
890 if (off0 > sizeof (struct ip)) {
891 ip_stripoptions(m, (struct mbuf *)0);
892 off0 = sizeof(struct ip);
893 if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16)
894 m->m_pkthdr.csum_flags = 0; /* invalidate hwcksuming */
895
896 }
897 if (m->m_len < sizeof (struct tcpiphdr)) {
898 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
899 tcpstat.tcps_rcvshort++;
900 return;
901 }
902 }
903 ip = mtod(m, struct ip *);
904 ipov = (struct ipovly *)ip;
905 th = (struct tcphdr *)((caddr_t)ip + off0);
906 tlen = ip->ip_len;
907
908 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
909 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
910
911 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
912 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
913 th->th_seq, th->th_ack, th->th_win);
914
915 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
916 if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) {
917 u_short pseudo;
918 char b[9];
919 *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
920 *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
921 *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];
922
923 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
924 ipov->ih_len = (u_short)tlen;
925
926 #if BYTE_ORDER != BIG_ENDIAN
927 HTONS(ipov->ih_len);
928 #endif
929
930 pseudo = in_cksum(m, sizeof (struct ip));
931
932 *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
933 *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
934 *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];
935
936 th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
937 } else {
938 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
939 th->th_sum = m->m_pkthdr.csum_data;
940 else
941 th->th_sum = in_pseudo(ip->ip_src.s_addr,
942 ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
943 ip->ip_len + IPPROTO_TCP));
944 }
945 th->th_sum ^= 0xffff;
946 } else {
947 char b[9];
948 /*
949 * Checksum extended TCP header and data.
950 */
951 *(uint32_t*)&b[0] = *(uint32_t*)&ipov->ih_x1[0];
952 *(uint32_t*)&b[4] = *(uint32_t*)&ipov->ih_x1[4];
953 *(uint8_t*)&b[8] = *(uint8_t*)&ipov->ih_x1[8];
954
955 len = sizeof (struct ip) + tlen;
956 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
957 ipov->ih_len = (u_short)tlen;
958
959 #if BYTE_ORDER != BIG_ENDIAN
960 HTONS(ipov->ih_len);
961 #endif
962
963 th->th_sum = in_cksum(m, len);
964
965 *(uint32_t*)&ipov->ih_x1[0] = *(uint32_t*)&b[0];
966 *(uint32_t*)&ipov->ih_x1[4] = *(uint32_t*)&b[4];
967 *(uint8_t*)&ipov->ih_x1[8] = *(uint8_t*)&b[8];
968
969 tcp_in_cksum_stats(len);
970 }
971 if (th->th_sum) {
972 tcpstat.tcps_rcvbadsum++;
973 goto dropnosock;
974 }
975 #if INET6
976 /* Re-initialization for later version check */
977 ip->ip_v = IPVERSION;
978 #endif
979 ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
980 }
981
982 /*
983 * Check that TCP offset makes sense,
984 * pull out TCP options and adjust length. XXX
985 */
986 off = th->th_off << 2;
987 if (off < sizeof (struct tcphdr) || off > tlen) {
988 tcpstat.tcps_rcvbadoff++;
989 goto dropnosock;
990 }
991 tlen -= off; /* tlen is used instead of ti->ti_len */
992 if (off > sizeof (struct tcphdr)) {
993 #if INET6
994 if (isipv6) {
995 IP6_EXTHDR_CHECK(m, off0, off, return);
996 ip6 = mtod(m, struct ip6_hdr *);
997 th = (struct tcphdr *)((caddr_t)ip6 + off0);
998 } else
999 #endif /* INET6 */
1000 {
1001 if (m->m_len < sizeof(struct ip) + off) {
1002 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1003 tcpstat.tcps_rcvshort++;
1004 return;
1005 }
1006 ip = mtod(m, struct ip *);
1007 ipov = (struct ipovly *)ip;
1008 th = (struct tcphdr *)((caddr_t)ip + off0);
1009 }
1010 }
1011 optlen = off - sizeof (struct tcphdr);
1012 optp = (u_char *)(th + 1);
1013 /*
1014 * Do quick retrieval of timestamp options ("options
1015 * prediction?"). If timestamp is the only option and it's
1016 * formatted as recommended in RFC 1323 appendix A, we
1017 * quickly get the values now and not bother calling
1018 * tcp_dooptions(), etc.
1019 */
1020 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1021 (optlen > TCPOLEN_TSTAMP_APPA &&
1022 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1023 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1024 (th->th_flags & TH_SYN) == 0) {
1025 to.to_flags |= TOF_TS;
1026 to.to_tsval = ntohl(*(u_int32_t *)(optp + 4));
1027 to.to_tsecr = ntohl(*(u_int32_t *)(optp + 8));
1028 optp = NULL; /* we've parsed the options */
1029 }
1030 }
1031 thflags = th->th_flags;
1032
1033 #if TCP_DROP_SYNFIN
1034 /*
1035 * If the drop_synfin option is enabled, drop all packets with
1036 * both the SYN and FIN bits set. This prevents e.g. nmap from
1037 * identifying the TCP/IP stack.
1038 *
1039 * This is a violation of the TCP specification.
1040 */
1041 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
1042 goto dropnosock;
1043 #endif
1044
1045 /*
1046 * Convert TCP protocol specific fields to host format.
1047 */
1048
1049 #if BYTE_ORDER != BIG_ENDIAN
1050 NTOHL(th->th_seq);
1051 NTOHL(th->th_ack);
1052 NTOHS(th->th_win);
1053 NTOHS(th->th_urp);
1054 #endif
1055
1056 /*
1057 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
1058 * until after ip6_savecontrol() is called and before other functions
1059 * which don't want those proto headers.
1060 * Because ip6_savecontrol() is going to parse the mbuf to
1061 * search for data to be passed up to user-land, it wants mbuf
1062 * parameters to be unchanged.
1063 */
1064 drop_hdrlen = off0 + off;
1065
1066 /*
1067 * Locate pcb for segment.
1068 */
1069 findpcb:
1070
1071 isconnected = FALSE;
1072 isdisconnected = FALSE;
1073
1074 #if IPFIREWALL_FORWARD
1075 if (next_hop != NULL
1076 #if INET6
1077 && isipv6 == 0 /* IPv6 support is not yet */
1078 #endif /* INET6 */
1079 ) {
1080 /*
1081 * Diverted. Pretend to be the destination.
1082 * already got one like this?
1083 */
1084 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1085 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
1086 if (!inp) {
1087 /*
1088 * No, then it's new. Try find the ambushing socket
1089 */
1090 if (!next_hop->sin_port) {
1091 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
1092 th->th_sport, next_hop->sin_addr,
1093 th->th_dport, 1, m->m_pkthdr.rcvif);
1094 } else {
1095 inp = in_pcblookup_hash(&tcbinfo,
1096 ip->ip_src, th->th_sport,
1097 next_hop->sin_addr,
1098 ntohs(next_hop->sin_port), 1,
1099 m->m_pkthdr.rcvif);
1100 }
1101 }
1102 } else
1103 #endif /* IPFIREWALL_FORWARD */
1104 {
1105 #if INET6
1106 if (isipv6)
1107 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
1108 &ip6->ip6_dst, th->th_dport, 1,
1109 m->m_pkthdr.rcvif);
1110 else
1111 #endif /* INET6 */
1112 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1113 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
1114 }
1115
1116 /*
1117 * Use the interface scope information from the PCB for outbound
1118 * segments. If the PCB isn't present and if scoped routing is
1119 * enabled, tcp_respond will use the scope of the interface where
1120 * the segment arrived on.
1121 */
1122 if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
1123 ifscope = inp->inp_boundif;
1124 /*
1125 * If the PCB is present and the socket isn't allowed to use
1126 * the cellular interface, indicate it as such for tcp_respond.
1127 */
1128 if (inp != NULL && (inp->inp_flags & INP_NO_IFT_CELLULAR))
1129 nocell = 1;
1130
1131 #if IPSEC
1132 if (ipsec_bypass == 0) {
1133 #if INET6
1134 if (isipv6) {
1135 if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
1136 IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio);
1137 if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
1138 inp = NULL; // pretend we didn't find it
1139 goto dropnosock;
1140 }
1141 } else
1142 #endif /* INET6 */
1143 if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
1144 IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
1145 if (in_pcb_checkstate(inp, WNT_RELEASE, 0) == WNT_STOPUSING)
1146 inp = NULL; // pretend we didn't find it
1147 goto dropnosock;
1148 }
1149 }
1150 #endif /*IPSEC*/
1151
1152 /*
1153 * If the state is CLOSED (i.e., TCB does not exist) then
1154 * all data in the incoming segment is discarded.
1155 * If the TCB exists but is in CLOSED state, it is embryonic,
1156 * but should either do a listen or a connect soon.
1157 */
1158 if (inp == NULL) {
1159 if (log_in_vain) {
1160 #if INET6
1161 char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
1162 #else /* INET6 */
1163 char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
1164 #endif /* INET6 */
1165
1166 #if INET6
1167 if (isipv6) {
1168 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
1169 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
1170 } else
1171 #endif
1172 {
1173 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
1174 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
1175 }
1176 switch (log_in_vain) {
1177 case 1:
1178 if(thflags & TH_SYN)
1179 log(LOG_INFO,
1180 "Connection attempt to TCP %s:%d from %s:%d\n",
1181 dbuf, ntohs(th->th_dport),
1182 sbuf,
1183 ntohs(th->th_sport));
1184 break;
1185 case 2:
1186 log(LOG_INFO,
1187 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
1188 dbuf, ntohs(th->th_dport), sbuf,
1189 ntohs(th->th_sport), thflags);
1190 break;
1191 case 3:
1192 if ((thflags & TH_SYN) &&
1193 !(m->m_flags & (M_BCAST | M_MCAST)) &&
1194 #if INET6
1195 ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
1196 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
1197 #else
1198 ip->ip_dst.s_addr != ip->ip_src.s_addr
1199 #endif
1200 )
1201 log_in_vain_log((LOG_INFO,
1202 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
1203 dbuf, ntohs(th->th_dport),
1204 sbuf,
1205 ntohs(th->th_sport)));
1206 break;
1207 default:
1208 break;
1209 }
1210 }
1211 if (blackhole) {
1212 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
1213 switch (blackhole) {
1214 case 1:
1215 if (thflags & TH_SYN)
1216 goto dropnosock;
1217 break;
1218 case 2:
1219 goto dropnosock;
1220 default:
1221 goto dropnosock;
1222 }
1223 }
1224 rstreason = BANDLIM_RST_CLOSEDPORT;
1225 goto dropwithresetnosock;
1226 }
1227 so = inp->inp_socket;
1228 if (so == NULL) {
1229 /* This case shouldn't happen as the socket shouldn't be null
1230 * if inp_state isn't set to INPCB_STATE_DEAD
1231 * But just in case, we pretend we didn't find the socket if we hit this case
1232 * as this isn't cause for a panic (the socket might be leaked however)...
1233 */
1234 inp = NULL;
1235 #if TEMPDEBUG
1236 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
1237 #endif
1238 goto dropnosock;
1239 }
1240
1241 tcp_lock(so, 1, 0);
1242 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1243 tcp_unlock(so, 1, (void *)2);
1244 inp = NULL; // pretend we didn't find it
1245 goto dropnosock;
1246 }
1247
1248 tp = intotcpcb(inp);
1249 if (tp == 0) {
1250 rstreason = BANDLIM_RST_CLOSEDPORT;
1251 goto dropwithreset;
1252 }
1253 if (tp->t_state == TCPS_CLOSED)
1254 goto drop;
1255
1256 /* Unscale the window into a 32-bit value. */
1257 if ((thflags & TH_SYN) == 0)
1258 tiwin = th->th_win << tp->snd_scale;
1259 else
1260 tiwin = th->th_win;
1261
1262 #if CONFIG_MACF_NET
1263 if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
1264 goto drop;
1265 #endif
1266
1267 /* Radar 7377561: Avoid processing packets while closing a listen socket */
1268 if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN) == 0)
1269 goto drop;
1270
1271 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
1272 #if TCPDEBUG
1273 if (so->so_options & SO_DEBUG) {
1274 ostate = tp->t_state;
1275 #if INET6
1276 if (isipv6)
1277 bcopy((char *)ip6, (char *)tcp_saveipgen,
1278 sizeof(*ip6));
1279 else
1280 #endif /* INET6 */
1281 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
1282 tcp_savetcp = *th;
1283 }
1284 #endif
1285 if (so->so_options & SO_ACCEPTCONN) {
1286 register struct tcpcb *tp0 = tp;
1287 struct socket *so2;
1288 struct socket *oso;
1289 struct sockaddr_storage from;
1290 #if INET6
1291 struct inpcb *oinp = sotoinpcb(so);
1292 #endif /* INET6 */
1293 unsigned int head_ifscope;
1294 unsigned int head_nocell;
1295
1296 /* Get listener's bound-to-interface, if any */
1297 head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
1298 inp->inp_boundif : IFSCOPE_NONE;
1299 /* Get listener's no-cellular information, if any */
1300 head_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
1301
1302 /*
1303 * If the state is LISTEN then ignore segment if it contains an RST.
1304 * If the segment contains an ACK then it is bad and send a RST.
1305 * If it does not contain a SYN then it is not interesting; drop it.
1306 * If it is from this socket, drop it, it must be forged.
1307 */
1308 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1309 if (thflags & TH_RST) {
1310 goto drop;
1311 }
1312 if (thflags & TH_ACK) {
1313 tp = NULL;
1314 tcpstat.tcps_badsyn++;
1315 rstreason = BANDLIM_RST_OPENPORT;
1316 goto dropwithreset;
1317 }
1318
1319 /* We come here if there is no SYN set */
1320 tcpstat.tcps_badsyn++;
1321 goto drop;
1322 }
1323 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
1324 if (th->th_dport == th->th_sport) {
1325 #if INET6
1326 if (isipv6) {
1327 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1328 &ip6->ip6_src))
1329 goto drop;
1330 } else
1331 #endif /* INET6 */
1332 if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1333 goto drop;
1334 }
1335 /*
1336 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1337 * in_broadcast() should never return true on a received
1338 * packet with M_BCAST not set.
1339 *
1340 * Packets with a multicast source address should also
1341 * be discarded.
1342 */
1343 if (m->m_flags & (M_BCAST|M_MCAST))
1344 goto drop;
1345 #if INET6
1346 if (isipv6) {
1347 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1348 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1349 goto drop;
1350 } else
1351 #endif
1352 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1353 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1354 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
1355 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1356 goto drop;
1357
1358
1359 #if INET6
1360 /*
1361 * If deprecated address is forbidden,
1362 * we do not accept SYN to deprecated interface
1363 * address to prevent any new inbound connection from
1364 * getting established.
1365 * When we do not accept SYN, we send a TCP RST,
1366 * with deprecated source address (instead of dropping
1367 * it). We compromise it as it is much better for peer
1368 * to send a RST, and RST will be the final packet
1369 * for the exchange.
1370 *
1371 * If we do not forbid deprecated addresses, we accept
1372 * the SYN packet. RFC2462 does not suggest dropping
1373 * SYN in this case.
1374 * If we decipher RFC2462 5.5.4, it says like this:
1375 * 1. use of deprecated addr with existing
1376 * communication is okay - "SHOULD continue to be
1377 * used"
1378 * 2. use of it with new communication:
1379 * (2a) "SHOULD NOT be used if alternate address
1380 * with sufficient scope is available"
1381 * (2b) nothing mentioned otherwise.
1382 * Here we fall into (2b) case as we have no choice in
1383 * our source address selection - we must obey the peer.
1384 *
1385 * The wording in RFC2462 is confusing, and there are
1386 * multiple description text for deprecated address
1387 * handling - worse, they are not exactly the same.
1388 * I believe 5.5.4 is the best one, so we follow 5.5.4.
1389 */
1390 if (isipv6 && !ip6_use_deprecated) {
1391 struct in6_ifaddr *ia6;
1392
1393 ia6 = ip6_getdstifaddr(m);
1394 if (ia6 != NULL) {
1395 IFA_LOCK_SPIN(&ia6->ia_ifa);
1396 if (ia6->ia6_flags & IN6_IFF_DEPRECATED) {
1397 IFA_UNLOCK(&ia6->ia_ifa);
1398 IFA_REMREF(&ia6->ia_ifa);
1399 tp = NULL;
1400 rstreason = BANDLIM_RST_OPENPORT;
1401 goto dropwithreset;
1402 }
1403 IFA_UNLOCK(&ia6->ia_ifa);
1404 IFA_REMREF(&ia6->ia_ifa);
1405 }
1406 }
1407 #endif
1408 if (so->so_filt) {
1409 #if INET6
1410 if (isipv6) {
1411 struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from;
1412
1413 sin6->sin6_len = sizeof(*sin6);
1414 sin6->sin6_family = AF_INET6;
1415 sin6->sin6_port = th->th_sport;
1416 sin6->sin6_flowinfo = 0;
1417 sin6->sin6_addr = ip6->ip6_src;
1418 sin6->sin6_scope_id = 0;
1419 }
1420 else
1421 #endif
1422 {
1423 struct sockaddr_in *sin = (struct sockaddr_in*)&from;
1424
1425 sin->sin_len = sizeof(*sin);
1426 sin->sin_family = AF_INET;
1427 sin->sin_port = th->th_sport;
1428 sin->sin_addr = ip->ip_src;
1429 }
1430 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
1431 } else {
1432 so2 = sonewconn(so, 0, NULL);
1433 }
1434 if (so2 == 0) {
1435 tcpstat.tcps_listendrop++;
1436 if (tcp_dropdropablreq(so)) {
1437 if (so->so_filt)
1438 so2 = sonewconn(so, 0, (struct sockaddr*)&from);
1439 else
1440 so2 = sonewconn(so, 0, NULL);
1441 }
1442 if (!so2)
1443 goto drop;
1444 }
1445
1446 /* Point "inp" and "tp" in tandem to new socket */
1447 inp = (struct inpcb *)so2->so_pcb;
1448 tp = intotcpcb(inp);
1449
1450 oso = so;
1451 tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
1452
1453 so = so2;
1454 tcp_lock(so, 1, 0);
1455 /*
1456 * Mark socket as temporary until we're
1457 * committed to keeping it. The code at
1458 * ``drop'' and ``dropwithreset'' check the
1459 * flag dropsocket to see if the temporary
1460 * socket created here should be discarded.
1461 * We mark the socket as discardable until
1462 * we're committed to it below in TCPS_LISTEN.
1463 * There are some error conditions in which we
1464 * have to drop the temporary socket.
1465 */
1466 dropsocket++;
1467 /*
1468 * Inherit INP_BOUND_IF from listener; testing if
1469 * head_ifscope is non-zero is sufficient, since it
1470 * can only be set to a non-zero value earlier if
1471 * the listener has such a flag set.
1472 */
1473 if (head_ifscope != IFSCOPE_NONE) {
1474 inp->inp_flags |= INP_BOUND_IF;
1475 inp->inp_boundif = head_ifscope;
1476 }
1477 /*
1478 * Inherit INP_NO_IFT_CELLULAR from listener.
1479 */
1480 if (head_nocell) {
1481 inp->inp_flags |= INP_NO_IFT_CELLULAR;
1482 }
1483 #if INET6
1484 if (isipv6)
1485 inp->in6p_laddr = ip6->ip6_dst;
1486 else {
1487 inp->inp_vflag &= ~INP_IPV6;
1488 inp->inp_vflag |= INP_IPV4;
1489 #endif /* INET6 */
1490 inp->inp_laddr = ip->ip_dst;
1491 #if INET6
1492 }
1493 #endif /* INET6 */
1494 inp->inp_lport = th->th_dport;
1495 if (in_pcbinshash(inp, 0) != 0) {
1496 /*
1497 * Undo the assignments above if we failed to
1498 * put the PCB on the hash lists.
1499 */
1500 #if INET6
1501 if (isipv6)
1502 inp->in6p_laddr = in6addr_any;
1503 else
1504 #endif /* INET6 */
1505 inp->inp_laddr.s_addr = INADDR_ANY;
1506 inp->inp_lport = 0;
1507 tcp_lock(oso, 0, 0); /* release ref on parent */
1508 tcp_unlock(oso, 1, 0);
1509 goto drop;
1510 }
1511 #if INET6
1512 if (isipv6) {
1513 /*
1514 * Inherit socket options from the listening
1515 * socket.
1516 * Note that in6p_inputopts are not (even
1517 * should not be) copied, since it stores
1518 * previously received options and is used to
1519 * detect if each new option is different than
1520 * the previous one and hence should be passed
1521 * to a user.
1522 * If we copied in6p_inputopts, a user would
1523 * not be able to receive options just after
1524 * calling the accept system call.
1525 */
1526 inp->inp_flags |=
1527 oinp->inp_flags & INP_CONTROLOPTS;
1528 if (oinp->in6p_outputopts)
1529 inp->in6p_outputopts =
1530 ip6_copypktopts(oinp->in6p_outputopts,
1531 M_NOWAIT);
1532 } else
1533 #endif /* INET6 */
1534 inp->inp_options = ip_srcroute();
1535 tcp_lock(oso, 0, 0);
1536 #if IPSEC
1537 /* copy old policy into new socket's */
1538 if (sotoinpcb(oso)->inp_sp)
1539 {
1540 int error = 0;
1541 /* Is it a security hole here to silently fail to copy the policy? */
1542 if (inp->inp_sp != NULL)
1543 error = ipsec_init_policy(so, &inp->inp_sp);
1544 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
1545 printf("tcp_input: could not copy policy\n");
1546 }
1547 #endif
1548 /* inherit states from the listener */
1549 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1550 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
1551 tp->t_state = TCPS_LISTEN;
1552 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
1553 tp->t_flagsext |= (tp0->t_flagsext & TF_RXTFINDROP);
1554 tp->t_keepinit = tp0->t_keepinit;
1555 tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
1556
1557 /* now drop the reference on the listener */
1558 tcp_unlock(oso, 1, 0);
1559
1560 /* Compute proper scaling value from buffer space */
1561 if (inp->inp_pcbinfo->ipi_count < tcp_sockthreshold) {
1562 tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
1563 so->so_rcv.sb_hiwat = imin(TCP_MAXWIN << tp->request_r_scale, (sb_max / (MSIZE+MCLBYTES)) * MCLBYTES);
1564 }
1565 else {
1566 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1567 TCP_MAXWIN << tp->request_r_scale <
1568 so->so_rcv.sb_hiwat)
1569 tp->request_r_scale++;
1570 }
1571
1572 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
1573 }
1574 }
1575 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1576
1577 /*
1578 * Radar 3529618
1579 * This is the second part of the MSS DoS prevention code (after
1580 * minmss on the sending side) and it deals with too many too small
1581 * tcp packets in a too short timeframe (1 second).
1582 *
1583 * For every full second we count the number of received packets
1584 * and bytes. If we get a lot of packets per second for this connection
1585 * (tcp_minmssoverload) we take a closer look at it and compute the
1586 * average packet size for the past second. If that is less than
1587 * tcp_minmss we get too many packets with very small payload which
1588 * is not good and burdens our system (and every packet generates
1589 * a wakeup to the process connected to our socket). We can reasonable
1590 * expect this to be small packet DoS attack to exhaust our CPU
1591 * cycles.
1592 *
1593 * Care has to be taken for the minimum packet overload value. This
1594 * value defines the minimum number of packets per second before we
1595 * start to worry. This must not be too low to avoid killing for
1596 * example interactive connections with many small packets like
1597 * telnet or SSH.
1598 *
1599 * Setting either tcp_minmssoverload or tcp_minmss to "0" disables
1600 * this check.
1601 *
1602 * Account for packet if payload packet, skip over ACK, etc.
1603 */
1604 if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
1605 if (TSTMP_GT(tp->rcv_reset, tcp_now)) {
1606 tp->rcv_pps++;
1607 tp->rcv_byps += tlen + off;
1608 if (tp->rcv_byps > tp->rcv_maxbyps)
1609 tp->rcv_maxbyps = tp->rcv_byps;
1610 /*
1611 * Setting either tcp_minmssoverload or tcp_minmss to "0" disables
1612 * the check.
1613 */
1614 if (tcp_minmss && tcp_minmssoverload && tp->rcv_pps > tcp_minmssoverload) {
1615 if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
1616 char ipstrbuf[MAX_IPv6_STR_LEN];
1617 printf("too many small tcp packets from "
1618 "%s:%u, av. %ubyte/packet, "
1619 "dropping connection\n",
1620 #if INET6
1621 isipv6 ?
1622 inet_ntop(AF_INET6, &inp->in6p_faddr, ipstrbuf,
1623 sizeof(ipstrbuf)) :
1624 #endif
1625 inet_ntop(AF_INET, &inp->inp_faddr, ipstrbuf,
1626 sizeof(ipstrbuf)),
1627 inp->inp_fport,
1628 tp->rcv_byps / tp->rcv_pps);
1629 tp = tcp_drop(tp, ECONNRESET);
1630 /* tcpstat.tcps_minmssdrops++; */
1631 goto drop;
1632 }
1633 }
1634 } else {
1635 tp->rcv_reset = tcp_now + TCP_RETRANSHZ;
1636 tp->rcv_pps = 1;
1637 tp->rcv_byps = tlen + off;
1638 }
1639
1640 /* Evaluate the rate of arrival of packets to see if the
1641 * receiver can reduce the ack traffic. The algorithm to
1642 * stretch acks will be enabled if the connection meets
1643 * certain criteria defined in tcp_stretch_ack_enable function.
1644 */
1645 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
1646 tp->rcv_waitforss++;
1647 }
1648 if (tcp_stretch_ack_enable(tp)) {
1649 tp->t_flags |= TF_STRETCHACK;
1650 tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
1651 tp->rcv_waitforss = 0;
1652 } else {
1653 tp->t_flags &= ~(TF_STRETCHACK);
1654 }
1655 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
1656 tp->rcv_by_unackwin += (tlen + off);
1657 } else {
1658 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1659 tp->rcv_by_unackwin = tlen + off;
1660 }
1661 }
1662
1663 /*
1664 Explicit Congestion Notification - Flag that we need to send ECT if
1665 + The IP Congestion experienced flag was set.
1666 + Socket is in established state
1667 + We negotiated ECN in the TCP setup
1668 + This isn't a pure ack (tlen > 0)
1669 + The data is in the valid window
1670
1671 TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
1672 */
1673 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
1674 (tp->ecn_flags & (TE_SETUPSENT | TE_SETUPRECEIVED)) ==
1675 (TE_SETUPSENT | TE_SETUPRECEIVED) && tlen > 0 &&
1676 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1677 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1678 tp->ecn_flags |= TE_SENDECE;
1679 }
1680
1681 /*
1682 Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
1683 bother doing extensive checks for state and whatnot.
1684 */
1685 if ((thflags & TH_CWR) == TH_CWR) {
1686 tp->ecn_flags &= ~TE_SENDECE;
1687 }
1688
1689 /* If we received an explicit notification of congestion in
1690 * ip tos ecn bits or by the CWR bit in TCP header flags, reset
1691 * the ack-strteching state.
1692 */
1693 if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_STRETCHACK) != 0 &&
1694 ((ip_ecn == IPTOS_ECN_CE) || ((thflags & TH_CWR) == TH_CWR)))
1695 tcp_reset_stretch_ack(tp);
1696
1697 /*
1698 * Segment received on connection.
1699 * Reset idle time and keep-alive timer.
1700 */
1701 tp->t_rcvtime = tcp_now;
1702 if (TCPS_HAVEESTABLISHED(tp->t_state))
1703 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
1704
1705 /*
1706 * Process options if not in LISTEN state,
1707 * else do it below (after getting remote address).
1708 */
1709 if (tp->t_state != TCPS_LISTEN && optp)
1710 tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
1711
1712 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1713 if (to.to_flags & TOF_SCALE) {
1714 tp->t_flags |= TF_RCVD_SCALE;
1715 tp->requested_s_scale = to.to_requested_s_scale;
1716 tp->snd_wnd = th->th_win << tp->snd_scale;
1717 tiwin = tp->snd_wnd;
1718 }
1719 if (to.to_flags & TOF_TS) {
1720 tp->t_flags |= TF_RCVD_TSTMP;
1721 tp->ts_recent = to.to_tsval;
1722 tp->ts_recent_age = tcp_now;
1723 }
1724 if (to.to_flags & TOF_MSS)
1725 tcp_mss(tp, to.to_mss, ifscope);
1726 if (tp->sack_enable) {
1727 if (!(to.to_flags & TOF_SACK))
1728 tp->sack_enable = 0;
1729 else
1730 tp->t_flags |= TF_SACK_PERMIT;
1731 }
1732 }
1733
1734 #if TRAFFIC_MGT
1735 /* Compute inter-packet arrival jitter. According to RFC 3550, inter-packet
1736 * arrival jitter is defined as the difference in packet spacing at the
1737 * receiver compared to the sender for a pair of packets. When two packets
1738 * of maximum segment size come one after the other with consecutive
1739 * sequence numbers, we consider them as packets sent together at the
1740 * sender and use them as a pair to compute inter-packet arrival jitter.
1741 * This metric indicates the delay induced by the network components due
1742 * to queuing in edge/access routers.
1743 */
1744 if (tp->t_state == TCPS_ESTABLISHED &&
1745 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
1746 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1747 ((to.to_flags & TOF_TS) == 0 ||
1748 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1749 th->th_seq == tp->rcv_nxt &&
1750 LIST_EMPTY(&tp->t_segq)) {
1751 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
1752 tp->iaj_pktcnt++;
1753 }
1754
1755 if ( tp->iaj_size == 0 || tlen > tp->iaj_size ||
1756 (tlen == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
1757 /* State related to inter-arrival jitter is uninitialized
1758 * or we are trying to find a good first packet to start
1759 * computing the metric
1760 */
1761 update_iaj_state(tp, tlen, 0);
1762 } else {
1763 if (tlen == tp->iaj_size) {
1764 /* Compute inter-arrival jitter taking this packet
1765 * as the second packet
1766 */
1767 compute_iaj(tp);
1768 }
1769 if (tlen < tp->iaj_size) {
1770 /* There is a smaller packet in the stream.
1771 * Some times the maximum size supported on a path can
1772 * change if there is a new link with smaller MTU.
1773 * The receiver will not know about this change.
1774 * If there are too many packets smaller than iaj_size,
1775 * we try to learn the iaj_size again.
1776 */
1777 tp->iaj_small_pkt++;
1778 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
1779 update_iaj_state(tp, tlen, 1);
1780 } else {
1781 clear_iaj_state(tp);
1782 }
1783 } else {
1784 update_iaj_state(tp, tlen, 0);
1785 }
1786 }
1787 } else {
1788 clear_iaj_state(tp);
1789 }
1790 #endif /* TRAFFIC_MGT */
1791
1792 /*
1793 * Header prediction: check for the two common cases
1794 * of a uni-directional data xfer. If the packet has
1795 * no control flags, is in-sequence, the window didn't
1796 * change and we're not retransmitting, it's a
1797 * candidate. If the length is zero and the ack moved
1798 * forward, we're the sender side of the xfer. Just
1799 * free the data acked & wake any higher level process
1800 * that was blocked waiting for space. If the length
1801 * is non-zero and the ack didn't move, we're the
1802 * receiver side. If we're getting packets in-order
1803 * (the reassembly queue is empty), add the data to
1804 * the socket buffer and note that we need a delayed ack.
1805 * Make sure that the hidden state-flags are also off.
1806 * Since we check for TCPS_ESTABLISHED above, it can only
1807 * be TH_NEEDSYN.
1808 */
1809 if (tp->t_state == TCPS_ESTABLISHED &&
1810 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK &&
1811 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1812 ((to.to_flags & TOF_TS) == 0 ||
1813 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
1814 th->th_seq == tp->rcv_nxt &&
1815 tiwin && tiwin == tp->snd_wnd &&
1816 tp->snd_nxt == tp->snd_max) {
1817
1818 /*
1819 * If last ACK falls within this segment's sequence numbers,
1820 * record the timestamp.
1821 * NOTE that the test is modified according to the latest
1822 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1823 */
1824 if ((to.to_flags & TOF_TS) != 0 &&
1825 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1826 tp->ts_recent_age = tcp_now;
1827 tp->ts_recent = to.to_tsval;
1828 }
1829
1830 /* Force acknowledgment if we received a FIN */
1831
1832 if (thflags & TH_FIN)
1833 tp->t_flags |= TF_ACKNOW;
1834
1835 if (tlen == 0) {
1836 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1837 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1838 tp->snd_cwnd >= tp->snd_ssthresh &&
1839 (!IN_FASTRECOVERY(tp) &&
1840 ((!tp->sack_enable && tp->t_dupacks < tcprexmtthresh) ||
1841 (tp->sack_enable && to.to_nsacks == 0 &&
1842 TAILQ_EMPTY(&tp->snd_holes))))) {
1843 /*
1844 * this is a pure ack for outstanding data.
1845 */
1846 ++tcpstat.tcps_predack;
1847 /*
1848 * "bad retransmit" recovery
1849 */
1850 if (tp->t_rxtshift == 1 &&
1851 TSTMP_LT(tcp_now, tp->t_badrxtwin)) {
1852 ++tcpstat.tcps_sndrexmitbad;
1853 tp->snd_cwnd = tp->snd_cwnd_prev;
1854 tp->snd_ssthresh =
1855 tp->snd_ssthresh_prev;
1856 tp->snd_recover = tp->snd_recover_prev;
1857 if (tp->t_flags & TF_WASFRECOVERY)
1858 ENTER_FASTRECOVERY(tp);
1859 tp->snd_nxt = tp->snd_max;
1860 tp->t_badrxtwin = 0;
1861 tp->t_rxtshift = 0;
1862 tp->rxt_start = 0;
1863 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
1864 struct tcpcb *, tp, struct tcphdr *, th,
1865 int32_t, TCP_CC_BAD_REXMT_RECOVERY);
1866 }
1867 /*
1868 * Recalculate the transmit timer / rtt.
1869 *
1870 * Some boxes send broken timestamp replies
1871 * during the SYN+ACK phase, ignore
1872 * timestamps of 0 or we could calculate a
1873 * huge RTT and blow up the retransmit timer.
1874 */
1875 if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) &&
1876 TSTMP_GEQ(tcp_now, to.to_tsecr)) {
1877 tcp_xmit_timer(tp,
1878 tcp_now - to.to_tsecr);
1879 } else if (tp->t_rtttime &&
1880 SEQ_GT(th->th_ack, tp->t_rtseq)) {
1881 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1882 }
1883 acked = th->th_ack - tp->snd_una;
1884 tcpstat.tcps_rcvackpack++;
1885 tcpstat.tcps_rcvackbyte += acked;
1886
1887 /* Handle an ack that is in sequence during congestion
1888 * avoidance phase. The calculations in this function
1889 * assume that snd_una is not updated yet.
1890 */
1891 if (CC_ALGO(tp)->inseq_ack_rcvd != NULL)
1892 CC_ALGO(tp)->inseq_ack_rcvd(tp, th);
1893
1894 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
1895 struct tcpcb *, tp, struct tcphdr *, th,
1896 int32_t, TCP_CC_INSEQ_ACK_RCVD);
1897
1898 sbdrop(&so->so_snd, acked);
1899 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
1900 SEQ_LEQ(th->th_ack, tp->snd_recover))
1901 tp->snd_recover = th->th_ack - 1;
1902 tp->snd_una = th->th_ack;
1903 /*
1904 * pull snd_wl2 up to prevent seq wrap relative
1905 * to th_ack.
1906 */
1907 tp->snd_wl2 = th->th_ack;
1908 tp->t_dupacks = 0;
1909 m_freem(m);
1910 ND6_HINT(tp); /* some progress has been done */
1911
1912 /*
1913 * If all outstanding data are acked, stop
1914 * retransmit timer, otherwise restart timer
1915 * using current (possibly backed-off) value.
1916 * If process is waiting for space,
1917 * wakeup/selwakeup/signal. If data
1918 * are ready to send, let tcp_output
1919 * decide between more output or persist.
1920 */
1921 if (tp->snd_una == tp->snd_max)
1922 tp->t_timer[TCPT_REXMT] = 0;
1923 else if (tp->t_timer[TCPT_PERSIST] == 0)
1924 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
1925
1926 sowwakeup(so); /* has to be done with socket lock held */
1927 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
1928 (void) tcp_output(tp);
1929 }
1930
1931 tcp_check_timer_state(tp);
1932 tcp_unlock(so, 1, 0);
1933 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1934 return;
1935 }
1936 } else if (th->th_ack == tp->snd_una &&
1937 LIST_EMPTY(&tp->t_segq) &&
1938 tlen <= tcp_sbspace(tp)) {
1939 /*
1940 * this is a pure, in-sequence data packet
1941 * with nothing on the reassembly queue and
1942 * we have enough buffer space to take it.
1943 */
1944 /* Clean receiver SACK report if present */
1945 if (tp->sack_enable && tp->rcv_numsacks)
1946 tcp_clean_sackreport(tp);
1947 ++tcpstat.tcps_preddat;
1948 tp->rcv_nxt += tlen;
1949 /*
1950 * Pull snd_wl1 up to prevent seq wrap relative to
1951 * th_seq.
1952 */
1953 tp->snd_wl1 = th->th_seq;
1954 /*
1955 * Pull rcv_up up to prevent seq wrap relative to
1956 * rcv_nxt.
1957 */
1958 tp->rcv_up = tp->rcv_nxt;
1959 tcpstat.tcps_rcvpack++;
1960 tcpstat.tcps_rcvbyte += tlen;
1961 if (nstat_collect) {
1962 locked_add_64(&inp->inp_stat->rxpackets, 1);
1963 locked_add_64(&inp->inp_stat->rxbytes, tlen);
1964 }
1965 ND6_HINT(tp); /* some progress has been done */
1966 /*
1967 * Add data to socket buffer.
1968 */
1969 so_recv_data_stat(so, m, 0);
1970 m_adj(m, drop_hdrlen); /* delayed header drop */
1971 if (sbappendstream(&so->so_rcv, m))
1972 sorwakeup(so);
1973 #if INET6
1974 if (isipv6) {
1975 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1976 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1977 th->th_seq, th->th_ack, th->th_win);
1978 }
1979 else
1980 #endif
1981 {
1982 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
1983 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1984 th->th_seq, th->th_ack, th->th_win);
1985 }
1986 if (DELAY_ACK(tp, th)) {
1987 if ((tp->t_flags & TF_DELACK) == 0) {
1988 tp->t_flags |= TF_DELACK;
1989 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
1990 }
1991 tp->t_unacksegs++;
1992 } else {
1993 tp->t_flags |= TF_ACKNOW;
1994 tcp_output(tp);
1995 }
1996 tcp_check_timer_state(tp);
1997 tcp_unlock(so, 1, 0);
1998 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
1999 return;
2000 }
2001 }
2002
2003 /*
2004 * Calculate amount of space in receive window,
2005 * and then do TCP input processing.
2006 * Receive window is amount of space in rcv queue,
2007 * but not less than advertised window.
2008 */
2009 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2010
2011 { int win;
2012
2013 win = tcp_sbspace(tp);
2014
2015 if (win < 0)
2016 win = 0;
2017 else { /* clip rcv window to 4K for modems */
2018 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2019 win = min(win, slowlink_wsize);
2020 }
2021 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2022 }
2023
2024 switch (tp->t_state) {
2025
2026 /*
2027 * Initialize tp->rcv_nxt, and tp->irs, select an initial
2028 * tp->iss, and send a segment:
2029 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2030 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
2031 * Fill in remote peer address fields if not previously specified.
2032 * Enter SYN_RECEIVED state, and process any other fields of this
2033 * segment in this state.
2034 */
2035 case TCPS_LISTEN: {
2036 register struct sockaddr_in *sin;
2037 #if INET6
2038 register struct sockaddr_in6 *sin6;
2039 #endif
2040
2041 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2042 #if INET6
2043 if (isipv6) {
2044 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
2045 M_SONAME, M_NOWAIT);
2046 if (sin6 == NULL)
2047 goto drop;
2048 bzero(sin6, sizeof(*sin6));
2049 sin6->sin6_family = AF_INET6;
2050 sin6->sin6_len = sizeof(*sin6);
2051 sin6->sin6_addr = ip6->ip6_src;
2052 sin6->sin6_port = th->th_sport;
2053 laddr6 = inp->in6p_laddr;
2054 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
2055 inp->in6p_laddr = ip6->ip6_dst;
2056 if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
2057 proc0)) {
2058 inp->in6p_laddr = laddr6;
2059 FREE(sin6, M_SONAME);
2060 goto drop;
2061 }
2062 FREE(sin6, M_SONAME);
2063 } else
2064 #endif
2065 {
2066 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2067 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
2068 M_NOWAIT);
2069 if (sin == NULL)
2070 goto drop;
2071 sin->sin_family = AF_INET;
2072 sin->sin_len = sizeof(*sin);
2073 sin->sin_addr = ip->ip_src;
2074 sin->sin_port = th->th_sport;
2075 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
2076 laddr = inp->inp_laddr;
2077 if (inp->inp_laddr.s_addr == INADDR_ANY)
2078 inp->inp_laddr = ip->ip_dst;
2079 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0, NULL)) {
2080 inp->inp_laddr = laddr;
2081 FREE(sin, M_SONAME);
2082 goto drop;
2083 }
2084 FREE(sin, M_SONAME);
2085 }
2086
2087 tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
2088
2089 if (tp->sack_enable) {
2090 if (!(to.to_flags & TOF_SACK))
2091 tp->sack_enable = 0;
2092 else
2093 tp->t_flags |= TF_SACK_PERMIT;
2094 }
2095
2096 if (iss)
2097 tp->iss = iss;
2098 else {
2099 tp->iss = tcp_new_isn(tp);
2100 }
2101 tp->irs = th->th_seq;
2102 tcp_sendseqinit(tp);
2103 tcp_rcvseqinit(tp);
2104 tp->snd_recover = tp->snd_una;
2105 /*
2106 * Initialization of the tcpcb for transaction;
2107 * set SND.WND = SEG.WND,
2108 * initialize CCsend and CCrecv.
2109 */
2110 tp->snd_wnd = tiwin; /* initial send-window */
2111 tp->t_flags |= TF_ACKNOW;
2112 tp->t_unacksegs = 0;
2113 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2114 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
2115 tp->t_state = TCPS_SYN_RECEIVED;
2116 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2117 tp->t_keepinit ? tp->t_keepinit : tcp_keepinit);
2118 dropsocket = 0; /* committed to socket */
2119
2120 /* reset the incomp processing flag */
2121 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
2122 tcpstat.tcps_accepts++;
2123 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
2124 /* ECN-setup SYN */
2125 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
2126 }
2127 #if CONFIG_IFEF_NOWINDOWSCALE
2128 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
2129 (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
2130 /* Window scaling is not enabled on this interface */
2131 tp->t_flags &= ~TF_REQ_SCALE;
2132 }
2133 #endif
2134 goto trimthenstep6;
2135 }
2136
2137 /*
2138 * If the state is SYN_RECEIVED:
2139 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
2140 */
2141 case TCPS_SYN_RECEIVED:
2142 if ((thflags & TH_ACK) &&
2143 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
2144 SEQ_GT(th->th_ack, tp->snd_max))) {
2145 rstreason = BANDLIM_RST_OPENPORT;
2146 goto dropwithreset;
2147 }
2148 break;
2149
2150 /*
2151 * If the state is SYN_SENT:
2152 * if seg contains an ACK, but not for our SYN, drop the input.
2153 * if seg contains a RST, then drop the connection.
2154 * if seg does not contain SYN, then drop it.
2155 * Otherwise this is an acceptable SYN segment
2156 * initialize tp->rcv_nxt and tp->irs
2157 * if seg contains ack then advance tp->snd_una
2158 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2159 * arrange for segment to be acked (eventually)
2160 * continue processing rest of data/controls, beginning with URG
2161 */
2162 case TCPS_SYN_SENT:
2163 if ((thflags & TH_ACK) &&
2164 (SEQ_LEQ(th->th_ack, tp->iss) ||
2165 SEQ_GT(th->th_ack, tp->snd_max))) {
2166 rstreason = BANDLIM_UNLIMITED;
2167 goto dropwithreset;
2168 }
2169 if (thflags & TH_RST) {
2170 if ((thflags & TH_ACK) != 0) {
2171 tp = tcp_drop(tp, ECONNREFUSED);
2172 postevent(so, 0, EV_RESET);
2173 }
2174 goto drop;
2175 }
2176 if ((thflags & TH_SYN) == 0)
2177 goto drop;
2178 tp->snd_wnd = th->th_win; /* initial send window */
2179
2180 tp->irs = th->th_seq;
2181 tcp_rcvseqinit(tp);
2182 if (thflags & TH_ACK) {
2183 tcpstat.tcps_connects++;
2184
2185 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
2186 /* ECN-setup SYN-ACK */
2187 tp->ecn_flags |= TE_SETUPRECEIVED;
2188 }
2189 else {
2190 /* non-ECN-setup SYN-ACK */
2191 tp->ecn_flags &= ~TE_SENDIPECT;
2192 }
2193
2194 #if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
2195 /* XXXMAC: recursive lock: SOCK_LOCK(so); */
2196 mac_socketpeer_label_associate_mbuf(m, so);
2197 /* XXXMAC: SOCK_UNLOCK(so); */
2198 #endif
2199 /* Do window scaling on this connection? */
2200 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2201 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2202 tp->snd_scale = tp->requested_s_scale;
2203 tp->rcv_scale = tp->request_r_scale;
2204 }
2205 tp->rcv_adv += tp->rcv_wnd;
2206 tp->snd_una++; /* SYN is acked */
2207 /*
2208 * If there's data, delay ACK; if there's also a FIN
2209 * ACKNOW will be turned on later.
2210 */
2211 if (DELAY_ACK(tp, th) && tlen != 0) {
2212 if ((tp->t_flags & TF_DELACK) == 0) {
2213 tp->t_flags |= TF_DELACK;
2214 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2215 }
2216 tp->t_unacksegs++;
2217 }
2218 else {
2219 tp->t_flags |= TF_ACKNOW;
2220 }
2221 /*
2222 * Received <SYN,ACK> in SYN_SENT[*] state.
2223 * Transitions:
2224 * SYN_SENT --> ESTABLISHED
2225 * SYN_SENT* --> FIN_WAIT_1
2226 */
2227 tp->t_starttime = tcp_now;
2228 if (tp->t_flags & TF_NEEDFIN) {
2229 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2230 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
2231 tp->t_state = TCPS_FIN_WAIT_1;
2232 tp->t_flags &= ~TF_NEEDFIN;
2233 thflags &= ~TH_SYN;
2234 } else {
2235 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2236 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
2237 tp->t_state = TCPS_ESTABLISHED;
2238 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
2239 if (nstat_collect)
2240 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
2241 }
2242 isconnected = TRUE;
2243 } else {
2244 /*
2245 * Received initial SYN in SYN-SENT[*] state => simul-
2246 * taneous open. If segment contains CC option and there is
2247 * a cached CC, apply TAO test; if it succeeds, connection is
2248 * half-synchronized. Otherwise, do 3-way handshake:
2249 * SYN-SENT -> SYN-RECEIVED
2250 * SYN-SENT* -> SYN-RECEIVED*
2251 */
2252 tp->t_flags |= TF_ACKNOW;
2253 tp->t_timer[TCPT_REXMT] = 0;
2254 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2255 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
2256 tp->t_state = TCPS_SYN_RECEIVED;
2257
2258 }
2259
2260 trimthenstep6:
2261 /*
2262 * Advance th->th_seq to correspond to first data byte.
2263 * If data, trim to stay within window,
2264 * dropping FIN if necessary.
2265 */
2266 th->th_seq++;
2267 if (tlen > tp->rcv_wnd) {
2268 todrop = tlen - tp->rcv_wnd;
2269 m_adj(m, -todrop);
2270 tlen = tp->rcv_wnd;
2271 thflags &= ~TH_FIN;
2272 tcpstat.tcps_rcvpackafterwin++;
2273 tcpstat.tcps_rcvbyteafterwin += todrop;
2274 }
2275 tp->snd_wl1 = th->th_seq - 1;
2276 tp->rcv_up = th->th_seq;
2277 /*
2278 * Client side of transaction: already sent SYN and data.
2279 * If the remote host used T/TCP to validate the SYN,
2280 * our data will be ACK'd; if so, enter normal data segment
2281 * processing in the middle of step 5, ack processing.
2282 * Otherwise, goto step 6.
2283 */
2284 if (thflags & TH_ACK)
2285 goto process_ACK;
2286 goto step6;
2287 /*
2288 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
2289 * do normal processing.
2290 *
2291 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
2292 */
2293 case TCPS_LAST_ACK:
2294 case TCPS_CLOSING:
2295 case TCPS_TIME_WAIT:
2296 break; /* continue normal processing */
2297
2298 /* Received a SYN while connection is already established.
2299 * This is a "half open connection and other anomalies" described
2300 * in RFC793 page 34, send an ACK so the remote reset the connection
2301 * or recovers by adjusting its sequence numberering
2302 */
2303 case TCPS_ESTABLISHED:
2304 if (thflags & TH_SYN)
2305 goto dropafterack;
2306 break;
2307 }
2308
2309 /*
2310 * States other than LISTEN or SYN_SENT.
2311 * First check the RST flag and sequence number since reset segments
2312 * are exempt from the timestamp and connection count tests. This
2313 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
2314 * below which allowed reset segments in half the sequence space
2315 * to fall though and be processed (which gives forged reset
2316 * segments with a random sequence number a 50 percent chance of
2317 * killing a connection).
2318 * Then check timestamp, if present.
2319 * Then check the connection count, if present.
2320 * Then check that at least some bytes of segment are within
2321 * receive window. If segment begins before rcv_nxt,
2322 * drop leading data (and SYN); if nothing left, just ack.
2323 *
2324 *
2325 * If the RST bit is set, check the sequence number to see
2326 * if this is a valid reset segment.
2327 * RFC 793 page 37:
2328 * In all states except SYN-SENT, all reset (RST) segments
2329 * are validated by checking their SEQ-fields. A reset is
2330 * valid if its sequence number is in the window.
2331 * Note: this does not take into account delayed ACKs, so
2332 * we should test against last_ack_sent instead of rcv_nxt.
2333 * The sequence number in the reset segment is normally an
2334 * echo of our outgoing acknowlegement numbers, but some hosts
2335 * send a reset with the sequence number at the rightmost edge
2336 * of our receive window, and we have to handle this case.
2337 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
2338 * that brute force RST attacks are possible. To combat this,
2339 * we use a much stricter check while in the ESTABLISHED state,
2340 * only accepting RSTs where the sequence number is equal to
2341 * last_ack_sent. In all other states (the states in which a
2342 * RST is more likely), the more permissive check is used.
2343 * If we have multiple segments in flight, the intial reset
2344 * segment sequence numbers will be to the left of last_ack_sent,
2345 * but they will eventually catch up.
2346 * In any case, it never made sense to trim reset segments to
2347 * fit the receive window since RFC 1122 says:
2348 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
2349 *
2350 * A TCP SHOULD allow a received RST segment to include data.
2351 *
2352 * DISCUSSION
2353 * It has been suggested that a RST segment could contain
2354 * ASCII text that encoded and explained the cause of the
2355 * RST. No standard has yet been established for such
2356 * data.
2357 *
2358 * If the reset segment passes the sequence number test examine
2359 * the state:
2360 * SYN_RECEIVED STATE:
2361 * If passive open, return to LISTEN state.
2362 * If active open, inform user that connection was refused.
2363 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
2364 * Inform user that connection was reset, and close tcb.
2365 * CLOSING, LAST_ACK STATES:
2366 * Close the tcb.
2367 * TIME_WAIT STATE:
2368 * Drop the segment - see Stevens, vol. 2, p. 964 and
2369 * RFC 1337.
2370 *
2371 * Radar 4803931: Allows for the case where we ACKed the FIN but
2372 * there is already a RST in flight from the peer.
2373 * In that case, accept the RST for non-established
2374 * state if it's one off from last_ack_sent.
2375
2376 */
2377 if (thflags & TH_RST) {
2378 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2379 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
2380 (tp->rcv_wnd == 0 &&
2381 ((tp->last_ack_sent == th->th_seq) || ((tp->last_ack_sent -1) == th->th_seq)))) {
2382 switch (tp->t_state) {
2383
2384 case TCPS_SYN_RECEIVED:
2385 so->so_error = ECONNREFUSED;
2386 goto close;
2387
2388 case TCPS_ESTABLISHED:
2389 if (tp->last_ack_sent != th->th_seq) {
2390 tcpstat.tcps_badrst++;
2391 goto drop;
2392 }
2393 case TCPS_FIN_WAIT_1:
2394 case TCPS_CLOSE_WAIT:
2395 /*
2396 Drop through ...
2397 */
2398 case TCPS_FIN_WAIT_2:
2399 so->so_error = ECONNRESET;
2400 close:
2401 postevent(so, 0, EV_RESET);
2402 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2403 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
2404 tp->t_state = TCPS_CLOSED;
2405 tcpstat.tcps_drops++;
2406 tp = tcp_close(tp);
2407 break;
2408
2409 case TCPS_CLOSING:
2410 case TCPS_LAST_ACK:
2411 tp = tcp_close(tp);
2412 break;
2413
2414 case TCPS_TIME_WAIT:
2415 break;
2416 }
2417 }
2418 goto drop;
2419 }
2420
2421 /*
2422 * RFC 1323 PAWS: If we have a timestamp reply on this segment
2423 * and it's less than ts_recent, drop it.
2424 */
2425 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
2426 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
2427
2428 /* Check to see if ts_recent is over 24 days old. */
2429 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
2430 /*
2431 * Invalidate ts_recent. If this segment updates
2432 * ts_recent, the age will be reset later and ts_recent
2433 * will get a valid value. If it does not, setting
2434 * ts_recent to zero will at least satisfy the
2435 * requirement that zero be placed in the timestamp
2436 * echo reply when ts_recent isn't valid. The
2437 * age isn't reset until we get a valid ts_recent
2438 * because we don't want out-of-order segments to be
2439 * dropped when ts_recent is old.
2440 */
2441 tp->ts_recent = 0;
2442 } else {
2443 tcpstat.tcps_rcvduppack++;
2444 tcpstat.tcps_rcvdupbyte += tlen;
2445 tcpstat.tcps_pawsdrop++;
2446 if (nstat_collect) {
2447 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, tlen, NSTAT_RX_FLAG_DUPLICATE);
2448 locked_add_64(&inp->inp_stat->rxpackets, 1);
2449 locked_add_64(&inp->inp_stat->rxbytes, tlen);
2450 tp->t_stat.rxduplicatebytes += tlen;
2451 }
2452 if (tlen)
2453 goto dropafterack;
2454 goto drop;
2455 }
2456 }
2457
2458 /*
2459 * In the SYN-RECEIVED state, validate that the packet belongs to
2460 * this connection before trimming the data to fit the receive
2461 * window. Check the sequence number versus IRS since we know
2462 * the sequence numbers haven't wrapped. This is a partial fix
2463 * for the "LAND" DoS attack.
2464 */
2465 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
2466 rstreason = BANDLIM_RST_OPENPORT;
2467 goto dropwithreset;
2468 }
2469
2470 todrop = tp->rcv_nxt - th->th_seq;
2471 if (todrop > 0) {
2472 if (thflags & TH_SYN) {
2473 thflags &= ~TH_SYN;
2474 th->th_seq++;
2475 if (th->th_urp > 1)
2476 th->th_urp--;
2477 else
2478 thflags &= ~TH_URG;
2479 todrop--;
2480 }
2481 /*
2482 * Following if statement from Stevens, vol. 2, p. 960.
2483 */
2484 if (todrop > tlen
2485 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
2486 /*
2487 * Any valid FIN must be to the left of the window.
2488 * At this point the FIN must be a duplicate or out
2489 * of sequence; drop it.
2490 */
2491 thflags &= ~TH_FIN;
2492
2493 /*
2494 * Send an ACK to resynchronize and drop any data.
2495 * But keep on processing for RST or ACK.
2496 */
2497 tp->t_flags |= TF_ACKNOW;
2498 todrop = tlen;
2499 tcpstat.tcps_rcvduppack++;
2500 tcpstat.tcps_rcvdupbyte += todrop;
2501 } else {
2502 tcpstat.tcps_rcvpartduppack++;
2503 tcpstat.tcps_rcvpartdupbyte += todrop;
2504 }
2505 if (nstat_collect) {
2506 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, todrop, NSTAT_RX_FLAG_DUPLICATE);
2507 locked_add_64(&inp->inp_stat->rxpackets, 1);
2508 locked_add_64(&inp->inp_stat->rxbytes, todrop);
2509 tp->t_stat.rxduplicatebytes += todrop;
2510 }
2511 drop_hdrlen += todrop; /* drop from the top afterwards */
2512 th->th_seq += todrop;
2513 tlen -= todrop;
2514 if (th->th_urp > todrop)
2515 th->th_urp -= todrop;
2516 else {
2517 thflags &= ~TH_URG;
2518 th->th_urp = 0;
2519 }
2520 }
2521
2522 /*
2523 * If new data are received on a connection after the
2524 * user processes are gone, then RST the other end.
2525 */
2526 if ((so->so_state & SS_NOFDREF) &&
2527 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
2528 tp = tcp_close(tp);
2529 tcpstat.tcps_rcvafterclose++;
2530 rstreason = BANDLIM_UNLIMITED;
2531 goto dropwithreset;
2532 }
2533
2534 /*
2535 * If segment ends after window, drop trailing data
2536 * (and PUSH and FIN); if nothing left, just ACK.
2537 */
2538 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
2539 if (todrop > 0) {
2540 tcpstat.tcps_rcvpackafterwin++;
2541 if (todrop >= tlen) {
2542 tcpstat.tcps_rcvbyteafterwin += tlen;
2543 /*
2544 * If a new connection request is received
2545 * while in TIME_WAIT, drop the old connection
2546 * and start over if the sequence numbers
2547 * are above the previous ones.
2548 */
2549 if (thflags & TH_SYN &&
2550 tp->t_state == TCPS_TIME_WAIT &&
2551 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
2552 iss = tcp_new_isn(tp);
2553 tp = tcp_close(tp);
2554 tcp_unlock(so, 1, 0);
2555 goto findpcb;
2556 }
2557 /*
2558 * If window is closed can only take segments at
2559 * window edge, and have to drop data and PUSH from
2560 * incoming segments. Continue processing, but
2561 * remember to ack. Otherwise, drop segment
2562 * and ack.
2563 */
2564 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2565 tp->t_flags |= TF_ACKNOW;
2566 tcpstat.tcps_rcvwinprobe++;
2567 } else
2568 goto dropafterack;
2569 } else
2570 tcpstat.tcps_rcvbyteafterwin += todrop;
2571 m_adj(m, -todrop);
2572 tlen -= todrop;
2573 thflags &= ~(TH_PUSH|TH_FIN);
2574 }
2575
2576 /*
2577 * If last ACK falls within this segment's sequence numbers,
2578 * record its timestamp.
2579 * NOTE:
2580 * 1) That the test incorporates suggestions from the latest
2581 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2582 * 2) That updating only on newer timestamps interferes with
2583 * our earlier PAWS tests, so this check should be solely
2584 * predicated on the sequence space of this segment.
2585 * 3) That we modify the segment boundary check to be
2586 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
2587 * instead of RFC1323's
2588 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
2589 * This modified check allows us to overcome RFC1323's
2590 * limitations as described in Stevens TCP/IP Illustrated
2591 * Vol. 2 p.869. In such cases, we can still calculate the
2592 * RTT correctly when RCV.NXT == Last.ACK.Sent.
2593 */
2594 if ((to.to_flags & TOF_TS) != 0 &&
2595 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2596 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2597 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
2598 tp->ts_recent_age = tcp_now;
2599 tp->ts_recent = to.to_tsval;
2600 }
2601
2602 /*
2603 * If a SYN is in the window, then this is an
2604 * error and we send an RST and drop the connection.
2605 */
2606 if (thflags & TH_SYN) {
2607 tp = tcp_drop(tp, ECONNRESET);
2608 rstreason = BANDLIM_UNLIMITED;
2609 postevent(so, 0, EV_RESET);
2610 goto dropwithreset;
2611 }
2612
2613 /*
2614 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
2615 * flag is on (half-synchronized state), then queue data for
2616 * later processing; else drop segment and return.
2617 */
2618 if ((thflags & TH_ACK) == 0) {
2619 if (tp->t_state == TCPS_SYN_RECEIVED ||
2620 (tp->t_flags & TF_NEEDSYN))
2621 goto step6;
2622 else if (tp->t_flags & TF_ACKNOW)
2623 goto dropafterack;
2624 else
2625 goto drop;
2626 }
2627
2628 /*
2629 * Ack processing.
2630 */
2631 switch (tp->t_state) {
2632
2633 /*
2634 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
2635 * ESTABLISHED state and continue processing.
2636 * The ACK was checked above.
2637 */
2638 case TCPS_SYN_RECEIVED:
2639
2640 tcpstat.tcps_connects++;
2641
2642 /* Do window scaling? */
2643 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2644 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2645 tp->snd_scale = tp->requested_s_scale;
2646 tp->rcv_scale = tp->request_r_scale;
2647 tp->snd_wnd = th->th_win << tp->snd_scale;
2648 tiwin = tp->snd_wnd;
2649 }
2650 /*
2651 * Make transitions:
2652 * SYN-RECEIVED -> ESTABLISHED
2653 * SYN-RECEIVED* -> FIN-WAIT-1
2654 */
2655 tp->t_starttime = tcp_now;
2656 if (tp->t_flags & TF_NEEDFIN) {
2657 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2658 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
2659 tp->t_state = TCPS_FIN_WAIT_1;
2660 tp->t_flags &= ~TF_NEEDFIN;
2661 } else {
2662 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2663 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
2664 tp->t_state = TCPS_ESTABLISHED;
2665 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
2666 if (nstat_collect)
2667 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
2668 }
2669 /*
2670 * If segment contains data or ACK, will call tcp_reass()
2671 * later; if not, do so now to pass queued data to user.
2672 */
2673 if (tlen == 0 && (thflags & TH_FIN) == 0)
2674 (void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
2675 (struct mbuf *)0);
2676 tp->snd_wl1 = th->th_seq - 1;
2677
2678 /* FALLTHROUGH */
2679
2680 isconnected = TRUE;
2681
2682 /*
2683 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2684 * ACKs. If the ack is in the range
2685 * tp->snd_una < th->th_ack <= tp->snd_max
2686 * then advance tp->snd_una to th->th_ack and drop
2687 * data from the retransmission queue. If this ACK reflects
2688 * more up to date window information we update our window information.
2689 */
2690 case TCPS_ESTABLISHED:
2691 case TCPS_FIN_WAIT_1:
2692 case TCPS_FIN_WAIT_2:
2693 case TCPS_CLOSE_WAIT:
2694 case TCPS_CLOSING:
2695 case TCPS_LAST_ACK:
2696 case TCPS_TIME_WAIT:
2697 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2698 tcpstat.tcps_rcvacktoomuch++;
2699 goto dropafterack;
2700 }
2701 if (tp->sack_enable &&
2702 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
2703 tcp_sack_doack(tp, &to, th->th_ack);
2704 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2705 if (tlen == 0 && tiwin == tp->snd_wnd) {
2706 tcpstat.tcps_rcvdupack++;
2707 /*
2708 * If we have outstanding data (other than
2709 * a window probe), this is a completely
2710 * duplicate ack (ie, window info didn't
2711 * change), the ack is the biggest we've
2712 * seen and we've seen exactly our rexmt
2713 * threshhold of them, assume a packet
2714 * has been dropped and retransmit it.
2715 * Kludge snd_nxt & the congestion
2716 * window so we send only this one
2717 * packet.
2718 *
2719 * We know we're losing at the current
2720 * window size so do congestion avoidance
2721 * (set ssthresh to half the current window
2722 * and pull our congestion window back to
2723 * the new ssthresh).
2724 *
2725 * Dup acks mean that packets have left the
2726 * network (they're now cached at the receiver)
2727 * so bump cwnd by the amount in the receiver
2728 * to keep a constant cwnd packets in the
2729 * network.
2730 */
2731 if (tp->t_timer[TCPT_REXMT] == 0 ||
2732 th->th_ack != tp->snd_una)
2733 tp->t_dupacks = 0;
2734 else if (++tp->t_dupacks > tcprexmtthresh ||
2735 IN_FASTRECOVERY(tp)) {
2736 if (tp->sack_enable && IN_FASTRECOVERY(tp)) {
2737 int awnd;
2738
2739 /*
2740 * Compute the amount of data in flight first.
2741 * We can inject new data into the pipe iff
2742 * we have less than 1/2 the original window's
2743 * worth of data in flight.
2744 */
2745 awnd = (tp->snd_nxt - tp->snd_fack) +
2746 tp->sackhint.sack_bytes_rexmit;
2747 if (awnd < tp->snd_ssthresh) {
2748 tp->snd_cwnd += tp->t_maxseg;
2749 if (tp->snd_cwnd > tp->snd_ssthresh)
2750 tp->snd_cwnd = tp->snd_ssthresh;
2751 }
2752 } else
2753 tp->snd_cwnd += tp->t_maxseg;
2754
2755 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2756 struct tcpcb *, tp, struct tcphdr *, th,
2757 int32_t, TCP_CC_IN_FASTRECOVERY);
2758
2759 (void) tcp_output(tp);
2760 goto drop;
2761 } else if (tp->t_dupacks == tcprexmtthresh) {
2762 tcp_seq onxt = tp->snd_nxt;
2763
2764 /*
2765 * If we're doing sack, check to
2766 * see if we're already in sack
2767 * recovery. If we're not doing sack,
2768 * check to see if we're in newreno
2769 * recovery.
2770 */
2771 if (tp->sack_enable) {
2772 if (IN_FASTRECOVERY(tp)) {
2773 tp->t_dupacks = 0;
2774 break;
2775 }
2776 } else {
2777 if (SEQ_LEQ(th->th_ack,
2778 tp->snd_recover)) {
2779 tp->t_dupacks = 0;
2780 break;
2781 }
2782 }
2783
2784 /*
2785 * If the current tcp cc module has
2786 * defined a hook for tasks to run
2787 * before entering FR, call it
2788 */
2789 if (CC_ALGO(tp)->pre_fr != NULL)
2790 CC_ALGO(tp)->pre_fr(tp, th);
2791 ENTER_FASTRECOVERY(tp);
2792 tp->snd_recover = tp->snd_max;
2793 tp->t_timer[TCPT_REXMT] = 0;
2794 tp->t_rtttime = 0;
2795 tp->ecn_flags |= TE_SENDCWR;
2796 if (tp->sack_enable) {
2797 tcpstat.tcps_sack_recovery_episode++;
2798 tp->sack_newdata = tp->snd_nxt;
2799 tp->snd_cwnd = tp->t_maxseg;
2800
2801 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2802 struct tcpcb *, tp, struct tcphdr *, th,
2803 int32_t, TCP_CC_ENTER_FASTRECOVERY);
2804
2805 (void) tcp_output(tp);
2806 goto drop;
2807 }
2808 tp->snd_nxt = th->th_ack;
2809 tp->snd_cwnd = tp->t_maxseg;
2810 (void) tcp_output(tp);
2811 tp->snd_cwnd = tp->snd_ssthresh +
2812 tp->t_maxseg * tp->t_dupacks;
2813 if (SEQ_GT(onxt, tp->snd_nxt))
2814 tp->snd_nxt = onxt;
2815 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2816 struct tcpcb *, tp, struct tcphdr *, th,
2817 int32_t, TCP_CC_ENTER_FASTRECOVERY);
2818 goto drop;
2819 }
2820 } else
2821 tp->t_dupacks = 0;
2822 break;
2823 }
2824 /*
2825 * If the congestion window was inflated to account
2826 * for the other side's cached packets, retract it.
2827 */
2828 if (IN_FASTRECOVERY(tp)) {
2829 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2830 if (tp->sack_enable)
2831 tcp_sack_partialack(tp, th);
2832 else
2833 tcp_newreno_partial_ack(tp, th);
2834
2835 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2836 struct tcpcb *, tp, struct tcphdr *, th,
2837 int32_t, TCP_CC_PARTIAL_ACK);
2838 } else {
2839 EXIT_FASTRECOVERY(tp);
2840 if (CC_ALGO(tp)->post_fr != NULL)
2841 CC_ALGO(tp)->post_fr(tp, th);
2842 tp->t_dupacks = 0;
2843
2844 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2845 struct tcpcb *, tp, struct tcphdr *, th,
2846 int32_t, TCP_CC_EXIT_FASTRECOVERY);
2847 }
2848 } else {
2849 /*
2850 * We were not in fast recovery. Reset the duplicate ack
2851 * counter.
2852 */
2853 tp->t_dupacks = 0;
2854 }
2855
2856
2857 /*
2858 * If we reach this point, ACK is not a duplicate,
2859 * i.e., it ACKs something we sent.
2860 */
2861 if (tp->t_flags & TF_NEEDSYN) {
2862 /*
2863 * T/TCP: Connection was half-synchronized, and our
2864 * SYN has been ACK'd (so connection is now fully
2865 * synchronized). Go to non-starred state,
2866 * increment snd_una for ACK of SYN, and check if
2867 * we can do window scaling.
2868 */
2869 tp->t_flags &= ~TF_NEEDSYN;
2870 tp->snd_una++;
2871 /* Do window scaling? */
2872 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2873 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2874 tp->snd_scale = tp->requested_s_scale;
2875 tp->rcv_scale = tp->request_r_scale;
2876 }
2877 }
2878
2879 process_ACK:
2880 acked = th->th_ack - tp->snd_una;
2881 tcpstat.tcps_rcvackpack++;
2882 tcpstat.tcps_rcvackbyte += acked;
2883
2884 /*
2885 * If we just performed our first retransmit, and the ACK
2886 * arrives within our recovery window, then it was a mistake
2887 * to do the retransmit in the first place. Recover our
2888 * original cwnd and ssthresh, and proceed to transmit where
2889 * we left off.
2890 */
2891 if (tp->t_rxtshift == 1 &&
2892 TSTMP_LT(tcp_now, tp->t_badrxtwin)) {
2893 ++tcpstat.tcps_sndrexmitbad;
2894 tp->snd_cwnd = tp->snd_cwnd_prev;
2895 tp->snd_ssthresh = tp->snd_ssthresh_prev;
2896 tp->snd_recover = tp->snd_recover_prev;
2897 if (tp->t_flags & TF_WASFRECOVERY)
2898 ENTER_FASTRECOVERY(tp);
2899 tp->snd_nxt = tp->snd_max;
2900 tp->t_badrxtwin = 0; /* XXX probably not required */
2901 tp->t_rxtshift = 0;
2902 tp->rxt_start = 0;
2903
2904 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2905 struct tcpcb *, tp, struct tcphdr *, th,
2906 int32_t, TCP_CC_BAD_REXMT_RECOVERY);
2907 }
2908
2909 /*
2910 * If we have a timestamp reply, update smoothed
2911 * round trip time. If no timestamp is present but
2912 * transmit timer is running and timed sequence
2913 * number was acked, update smoothed round trip time.
2914 * Since we now have an rtt measurement, cancel the
2915 * timer backoff (cf., Phil Karn's retransmit alg.).
2916 * Recompute the initial retransmit timer.
2917 * Also makes sure we have a valid time stamp in hand
2918 *
2919 * Some boxes send broken timestamp replies
2920 * during the SYN+ACK phase, ignore
2921 * timestamps of 0 or we could calculate a
2922 * huge RTT and blow up the retransmit timer.
2923 */
2924 if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) &&
2925 TSTMP_GEQ(tcp_now, to.to_tsecr)) {
2926 tcp_xmit_timer(tp, tcp_now - to.to_tsecr);
2927 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2928 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2929 }
2930
2931 /*
2932 * If all outstanding data is acked, stop retransmit
2933 * timer and remember to restart (more output or persist).
2934 * If there is more data to be acked, restart retransmit
2935 * timer, using current (possibly backed-off) value.
2936 */
2937 if (th->th_ack == tp->snd_max) {
2938 tp->t_timer[TCPT_REXMT] = 0;
2939 needoutput = 1;
2940 } else if (tp->t_timer[TCPT_PERSIST] == 0)
2941 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
2942
2943 /*
2944 * If no data (only SYN) was ACK'd,
2945 * skip rest of ACK processing.
2946 */
2947 if (acked == 0)
2948 goto step6;
2949
2950 if ((thflags & TH_ECE) != 0 &&
2951 (tp->ecn_flags & TE_SETUPSENT) != 0) {
2952 /*
2953 * Reduce the congestion window if we haven't done so.
2954 */
2955 if (!tp->sack_enable && !IN_FASTRECOVERY(tp) &&
2956 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
2957 tcp_reduce_congestion_window(tp, th);
2958 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2959 struct tcpcb *, tp, struct tcphdr *, th,
2960 int32_t, TCP_CC_ECN_RCVD);
2961 }
2962 }
2963
2964 /*
2965 * When new data is acked, open the congestion window.
2966 * The specifics of how this is achieved are up to the
2967 * congestion control algorithm in use for this connection.
2968 *
2969 * The calculations in this function assume that snd_una is
2970 * not updated yet.
2971 */
2972 if (!IN_FASTRECOVERY(tp)) {
2973 if (CC_ALGO(tp)->ack_rcvd != NULL)
2974 CC_ALGO(tp)->ack_rcvd(tp, th);
2975
2976 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
2977 struct tcpcb *, tp, struct tcphdr *, th,
2978 int32_t, TCP_CC_ACK_RCVD);
2979 }
2980 if (acked > so->so_snd.sb_cc) {
2981 tp->snd_wnd -= so->so_snd.sb_cc;
2982 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2983 ourfinisacked = 1;
2984 } else {
2985 sbdrop(&so->so_snd, acked);
2986 tp->snd_wnd -= acked;
2987 ourfinisacked = 0;
2988 }
2989 /* detect una wraparound */
2990 if ( !IN_FASTRECOVERY(tp) &&
2991 SEQ_GT(tp->snd_una, tp->snd_recover) &&
2992 SEQ_LEQ(th->th_ack, tp->snd_recover))
2993 tp->snd_recover = th->th_ack - 1;
2994
2995 if (IN_FASTRECOVERY(tp) &&
2996 SEQ_GEQ(th->th_ack, tp->snd_recover))
2997 EXIT_FASTRECOVERY(tp);
2998
2999 tp->snd_una = th->th_ack;
3000 if (tp->sack_enable) {
3001 if (SEQ_GT(tp->snd_una, tp->snd_recover))
3002 tp->snd_recover = tp->snd_una;
3003 }
3004 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3005 tp->snd_nxt = tp->snd_una;
3006
3007 /*
3008 * sowwakeup must happen after snd_una, et al. are updated so that
3009 * the sequence numbers are in sync with so_snd
3010 */
3011 sowwakeup(so);
3012
3013 switch (tp->t_state) {
3014
3015 /*
3016 * In FIN_WAIT_1 STATE in addition to the processing
3017 * for the ESTABLISHED state if our FIN is now acknowledged
3018 * then enter FIN_WAIT_2.
3019 */
3020 case TCPS_FIN_WAIT_1:
3021 if (ourfinisacked) {
3022 /*
3023 * If we can't receive any more
3024 * data, then closing user can proceed.
3025 * Starting the timer is contrary to the
3026 * specification, but if we don't get a FIN
3027 * we'll hang forever.
3028 */
3029 if (so->so_state & SS_CANTRCVMORE) {
3030 add_to_time_wait(tp, tcp_maxidle);
3031 isconnected = FALSE;
3032 isdisconnected = TRUE;
3033 }
3034 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3035 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_2);
3036 tp->t_state = TCPS_FIN_WAIT_2;
3037 /* fall through and make sure we also recognize data ACKed with the FIN */
3038 }
3039 tp->t_flags |= TF_ACKNOW;
3040 break;
3041
3042 /*
3043 * In CLOSING STATE in addition to the processing for
3044 * the ESTABLISHED state if the ACK acknowledges our FIN
3045 * then enter the TIME-WAIT state, otherwise ignore
3046 * the segment.
3047 */
3048 case TCPS_CLOSING:
3049 if (ourfinisacked) {
3050 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3051 struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT);
3052 tp->t_state = TCPS_TIME_WAIT;
3053 tcp_canceltimers(tp);
3054 /* Shorten TIME_WAIT [RFC-1644, p.28] */
3055 if (tp->cc_recv != 0 &&
3056 ((int)(tcp_now - tp->t_starttime)) < tcp_msl)
3057 add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC);
3058 else
3059 add_to_time_wait(tp, 2 * tcp_msl);
3060 isconnected = FALSE;
3061 isdisconnected = TRUE;
3062 }
3063 tp->t_flags |= TF_ACKNOW;
3064 break;
3065
3066 /*
3067 * In LAST_ACK, we may still be waiting for data to drain
3068 * and/or to be acked, as well as for the ack of our FIN.
3069 * If our FIN is now acknowledged, delete the TCB,
3070 * enter the closed state and return.
3071 */
3072 case TCPS_LAST_ACK:
3073 if (ourfinisacked) {
3074 tp = tcp_close(tp);
3075 goto drop;
3076 }
3077 break;
3078
3079 /*
3080 * In TIME_WAIT state the only thing that should arrive
3081 * is a retransmission of the remote FIN. Acknowledge
3082 * it and restart the finack timer.
3083 */
3084 case TCPS_TIME_WAIT:
3085 add_to_time_wait(tp, 2 * tcp_msl);
3086 goto dropafterack;
3087 }
3088 }
3089
3090 step6:
3091 /*
3092 * Update window information.
3093 * Don't look at window if no ACK: TAC's send garbage on first SYN.
3094 */
3095 if ((thflags & TH_ACK) &&
3096 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
3097 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
3098 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
3099 /* keep track of pure window updates */
3100 if (tlen == 0 &&
3101 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
3102 tcpstat.tcps_rcvwinupd++;
3103 tp->snd_wnd = tiwin;
3104 tp->snd_wl1 = th->th_seq;
3105 tp->snd_wl2 = th->th_ack;
3106 if (tp->snd_wnd > tp->max_sndwnd)
3107 tp->max_sndwnd = tp->snd_wnd;
3108 needoutput = 1;
3109 }
3110
3111 /*
3112 * Process segments with URG.
3113 */
3114 if ((thflags & TH_URG) && th->th_urp &&
3115 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3116 /*
3117 * This is a kludge, but if we receive and accept
3118 * random urgent pointers, we'll crash in
3119 * soreceive. It's hard to imagine someone
3120 * actually wanting to send this much urgent data.
3121 */
3122 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
3123 th->th_urp = 0; /* XXX */
3124 thflags &= ~TH_URG; /* XXX */
3125 goto dodata; /* XXX */
3126 }
3127 /*
3128 * If this segment advances the known urgent pointer,
3129 * then mark the data stream. This should not happen
3130 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
3131 * a FIN has been received from the remote side.
3132 * In these states we ignore the URG.
3133 *
3134 * According to RFC961 (Assigned Protocols),
3135 * the urgent pointer points to the last octet
3136 * of urgent data. We continue, however,
3137 * to consider it to indicate the first octet
3138 * of data past the urgent section as the original
3139 * spec states (in one of two places).
3140 */
3141 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
3142 tp->rcv_up = th->th_seq + th->th_urp;
3143 so->so_oobmark = so->so_rcv.sb_cc +
3144 (tp->rcv_up - tp->rcv_nxt) - 1;
3145 if (so->so_oobmark == 0) {
3146 so->so_state |= SS_RCVATMARK;
3147 postevent(so, 0, EV_OOB);
3148 }
3149 sohasoutofband(so);
3150 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
3151 }
3152 /*
3153 * Remove out of band data so doesn't get presented to user.
3154 * This can happen independent of advancing the URG pointer,
3155 * but if two URG's are pending at once, some out-of-band
3156 * data may creep in... ick.
3157 */
3158 if (th->th_urp <= (u_int32_t)tlen
3159 #if SO_OOBINLINE
3160 && (so->so_options & SO_OOBINLINE) == 0
3161 #endif
3162 )
3163 tcp_pulloutofband(so, th, m,
3164 drop_hdrlen); /* hdr drop is delayed */
3165 } else {
3166 /*
3167 * If no out of band data is expected,
3168 * pull receive urgent pointer along
3169 * with the receive window.
3170 */
3171 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
3172 tp->rcv_up = tp->rcv_nxt;
3173 }
3174 dodata:
3175
3176 /* Set socket's connect or disconnect state correcly before doing data.
3177 * The following might unlock the socket if there is an upcall or a socket
3178 * filter.
3179 */
3180 if (isconnected) {
3181 soisconnected(so);
3182 } else if (isdisconnected) {
3183 soisdisconnected(so);
3184 }
3185
3186 /* Let's check the state of pcb just to make sure that it did not get closed
3187 * when we unlocked above
3188 */
3189 if (inp->inp_state == INPCB_STATE_DEAD) {
3190 /* Just drop the packet that we are processing and return */
3191 goto drop;
3192 }
3193
3194 /*
3195 * Process the segment text, merging it into the TCP sequencing queue,
3196 * and arranging for acknowledgment of receipt if necessary.
3197 * This process logically involves adjusting tp->rcv_wnd as data
3198 * is presented to the user (this happens in tcp_usrreq.c,
3199 * case PRU_RCVD). If a FIN has already been received on this
3200 * connection then we just ignore the text.
3201 */
3202 if ((tlen || (thflags & TH_FIN)) &&
3203 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3204 tcp_seq save_start = th->th_seq;
3205 tcp_seq save_end = th->th_seq + tlen;
3206 m_adj(m, drop_hdrlen); /* delayed header drop */
3207 /*
3208 * Insert segment which includes th into TCP reassembly queue
3209 * with control block tp. Set thflags to whether reassembly now
3210 * includes a segment with FIN. This handles the common case
3211 * inline (segment is the next to be received on an established
3212 * connection, and the queue is empty), avoiding linkage into
3213 * and removal from the queue and repetition of various
3214 * conversions.
3215 * Set DELACK for segments received in order, but ack
3216 * immediately when segments are out of order (so
3217 * fast retransmit can work).
3218 */
3219 if (th->th_seq == tp->rcv_nxt &&
3220 LIST_EMPTY(&tp->t_segq) &&
3221 TCPS_HAVEESTABLISHED(tp->t_state)) {
3222 if (DELAY_ACK(tp, th) && ((tp->t_flags & TF_ACKNOW) == 0)) {
3223 if ((tp->t_flags & TF_DELACK) == 0) {
3224 tp->t_flags |= TF_DELACK;
3225 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3226 }
3227 tp->t_unacksegs++;
3228 }
3229 else {
3230 tp->t_flags |= TF_ACKNOW;
3231 }
3232 tp->rcv_nxt += tlen;
3233 thflags = th->th_flags & TH_FIN;
3234 tcpstat.tcps_rcvpack++;
3235 tcpstat.tcps_rcvbyte += tlen;
3236 if (nstat_collect) {
3237 locked_add_64(&inp->inp_stat->rxpackets, 1);
3238 locked_add_64(&inp->inp_stat->rxbytes, tlen);
3239 }
3240 ND6_HINT(tp);
3241 so_recv_data_stat(so, m, drop_hdrlen);
3242 if (sbappendstream(&so->so_rcv, m))
3243 sorwakeup(so);
3244 } else {
3245 thflags = tcp_reass(tp, th, &tlen, m);
3246 tp->t_flags |= TF_ACKNOW;
3247 }
3248
3249 if (tlen > 0 && tp->sack_enable)
3250 tcp_update_sack_list(tp, save_start, save_end);
3251
3252 if (tp->t_flags & TF_DELACK)
3253 {
3254 #if INET6
3255 if (isipv6) {
3256 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3257 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
3258 th->th_seq, th->th_ack, th->th_win);
3259 }
3260 else
3261 #endif
3262 {
3263 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3264 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
3265 th->th_seq, th->th_ack, th->th_win);
3266 }
3267
3268 }
3269 /*
3270 * Note the amount of data that peer has sent into
3271 * our window, in order to estimate the sender's
3272 * buffer size.
3273 */
3274 len = (u_int)(so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt));
3275 if (len > so->so_rcv.sb_maxused)
3276 so->so_rcv.sb_maxused = len;
3277 } else {
3278 m_freem(m);
3279 thflags &= ~TH_FIN;
3280 }
3281
3282 /*
3283 * If FIN is received ACK the FIN and let the user know
3284 * that the connection is closing.
3285 */
3286 if (thflags & TH_FIN) {
3287 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3288 socantrcvmore(so);
3289 postevent(so, 0, EV_FIN);
3290 /*
3291 * If connection is half-synchronized
3292 * (ie NEEDSYN flag on) then delay ACK,
3293 * If connection is half-synchronized
3294 * (ie NEEDSYN flag on) then delay ACK,
3295 * so it may be piggybacked when SYN is sent.
3296 * Otherwise, since we received a FIN then no
3297 * more input can be expected, send ACK now.
3298 */
3299 if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
3300 if ((tp->t_flags & TF_DELACK) == 0) {
3301 tp->t_flags |= TF_DELACK;
3302 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3303 }
3304 tp->t_unacksegs++;
3305 }
3306 else {
3307 tp->t_flags |= TF_ACKNOW;
3308 }
3309 tp->rcv_nxt++;
3310 }
3311 switch (tp->t_state) {
3312
3313 /*
3314 * In SYN_RECEIVED and ESTABLISHED STATES
3315 * enter the CLOSE_WAIT state.
3316 */
3317 case TCPS_SYN_RECEIVED:
3318 tp->t_starttime = tcp_now;
3319 case TCPS_ESTABLISHED:
3320 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3321 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
3322 tp->t_state = TCPS_CLOSE_WAIT;
3323 break;
3324
3325 /*
3326 * If still in FIN_WAIT_1 STATE FIN has not been acked so
3327 * enter the CLOSING state.
3328 */
3329 case TCPS_FIN_WAIT_1:
3330 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3331 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
3332 tp->t_state = TCPS_CLOSING;
3333 break;
3334
3335 /*
3336 * In FIN_WAIT_2 state enter the TIME_WAIT state,
3337 * starting the time-wait timer, turning off the other
3338 * standard timers.
3339 */
3340 case TCPS_FIN_WAIT_2:
3341 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3342 struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT);
3343 tp->t_state = TCPS_TIME_WAIT;
3344 tcp_canceltimers(tp);
3345 /* Shorten TIME_WAIT [RFC-1644, p.28] */
3346 if (tp->cc_recv != 0 &&
3347 ((int)(tcp_now - tp->t_starttime)) < tcp_msl) {
3348 add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC);
3349 /* For transaction client, force ACK now. */
3350 tp->t_flags |= TF_ACKNOW;
3351 tp->t_unacksegs = 0;
3352 }
3353 else
3354 add_to_time_wait(tp, 2 * tcp_msl);
3355 soisdisconnected(so);
3356 break;
3357
3358 /*
3359 * In TIME_WAIT state restart the 2 MSL time_wait timer.
3360 */
3361 case TCPS_TIME_WAIT:
3362 add_to_time_wait(tp, 2 * tcp_msl);
3363 break;
3364 }
3365 }
3366 #if TCPDEBUG
3367 if (so->so_options & SO_DEBUG)
3368 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
3369 &tcp_savetcp, 0);
3370 #endif
3371
3372 /*
3373 * Return any desired output.
3374 */
3375 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
3376 (void) tcp_output(tp);
3377 }
3378
3379 tcp_check_timer_state(tp);
3380
3381
3382 tcp_unlock(so, 1, 0);
3383 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3384 return;
3385
3386 dropafterack:
3387 /*
3388 * Generate an ACK dropping incoming segment if it occupies
3389 * sequence space, where the ACK reflects our state.
3390 *
3391 * We can now skip the test for the RST flag since all
3392 * paths to this code happen after packets containing
3393 * RST have been dropped.
3394 *
3395 * In the SYN-RECEIVED state, don't send an ACK unless the
3396 * segment we received passes the SYN-RECEIVED ACK test.
3397 * If it fails send a RST. This breaks the loop in the
3398 * "LAND" DoS attack, and also prevents an ACK storm
3399 * between two listening ports that have been sent forged
3400 * SYN segments, each with the source address of the other.
3401 */
3402 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
3403 (SEQ_GT(tp->snd_una, th->th_ack) ||
3404 SEQ_GT(th->th_ack, tp->snd_max)) ) {
3405 rstreason = BANDLIM_RST_OPENPORT;
3406 goto dropwithreset;
3407 }
3408 #if TCPDEBUG
3409 if (so->so_options & SO_DEBUG)
3410 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
3411 &tcp_savetcp, 0);
3412 #endif
3413 m_freem(m);
3414 tp->t_flags |= TF_ACKNOW;
3415 (void) tcp_output(tp);
3416
3417 /* Don't need to check timer state as we should have done it during tcp_output */
3418 tcp_unlock(so, 1, 0);
3419 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3420 return;
3421 dropwithresetnosock:
3422 nosock = 1;
3423 dropwithreset:
3424 /*
3425 * Generate a RST, dropping incoming segment.
3426 * Make ACK acceptable to originator of segment.
3427 * Don't bother to respond if destination was broadcast/multicast.
3428 */
3429 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
3430 goto drop;
3431 #if INET6
3432 if (isipv6) {
3433 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
3434 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
3435 goto drop;
3436 } else
3437 #endif /* INET6 */
3438 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
3439 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
3440 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
3441 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
3442 goto drop;
3443 /* IPv6 anycast check is done at tcp6_input() */
3444
3445 /*
3446 * Perform bandwidth limiting.
3447 */
3448 #if ICMP_BANDLIM
3449 if (badport_bandlim(rstreason) < 0)
3450 goto drop;
3451 #endif
3452
3453 #if TCPDEBUG
3454 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
3455 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
3456 &tcp_savetcp, 0);
3457 #endif
3458 if (thflags & TH_ACK)
3459 /* mtod() below is safe as long as hdr dropping is delayed */
3460 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
3461 TH_RST, ifscope, nocell);
3462 else {
3463 if (thflags & TH_SYN)
3464 tlen++;
3465 /* mtod() below is safe as long as hdr dropping is delayed */
3466 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
3467 (tcp_seq)0, TH_RST|TH_ACK, ifscope, nocell);
3468 }
3469 /* destroy temporarily created socket */
3470 if (dropsocket) {
3471 (void) soabort(so);
3472 tcp_unlock(so, 1, 0);
3473 }
3474 else if ((inp != NULL) && (nosock == 0)) {
3475 tcp_unlock(so, 1, 0);
3476 }
3477 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3478 return;
3479 dropnosock:
3480 nosock = 1;
3481 drop:
3482 /*
3483 * Drop space held by incoming segment and return.
3484 */
3485 #if TCPDEBUG
3486 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
3487 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
3488 &tcp_savetcp, 0);
3489 #endif
3490 m_freem(m);
3491 /* destroy temporarily created socket */
3492 if (dropsocket) {
3493 (void) soabort(so);
3494 tcp_unlock(so, 1, 0);
3495 }
3496 else if (nosock == 0) {
3497 tcp_unlock(so, 1, 0);
3498 }
3499 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
3500 return;
3501 }
3502
3503 static void
3504 tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
3505 /*
3506 * Parse TCP options and place in tcpopt.
3507 */
3508 struct tcpcb *tp;
3509 u_char *cp;
3510 int cnt;
3511 struct tcphdr *th;
3512 struct tcpopt *to;
3513 unsigned int input_ifscope;
3514 {
3515 u_short mss = 0;
3516 int opt, optlen;
3517
3518 for (; cnt > 0; cnt -= optlen, cp += optlen) {
3519 opt = cp[0];
3520 if (opt == TCPOPT_EOL)
3521 break;
3522 if (opt == TCPOPT_NOP)
3523 optlen = 1;
3524 else {
3525 if (cnt < 2)
3526 break;
3527 optlen = cp[1];
3528 if (optlen < 2 || optlen > cnt)
3529 break;
3530 }
3531 switch (opt) {
3532
3533 default:
3534 continue;
3535
3536 case TCPOPT_MAXSEG:
3537 if (optlen != TCPOLEN_MAXSEG)
3538 continue;
3539 if (!(th->th_flags & TH_SYN))
3540 continue;
3541 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
3542
3543 #if BYTE_ORDER != BIG_ENDIAN
3544 NTOHS(mss);
3545 #endif
3546
3547 break;
3548
3549 case TCPOPT_WINDOW:
3550 if (optlen != TCPOLEN_WINDOW)
3551 continue;
3552 if (!(th->th_flags & TH_SYN))
3553 continue;
3554 tp->t_flags |= TF_RCVD_SCALE;
3555 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
3556 break;
3557
3558 case TCPOPT_TIMESTAMP:
3559 if (optlen != TCPOLEN_TIMESTAMP)
3560 continue;
3561 to->to_flags |= TOF_TS;
3562 bcopy((char *)cp + 2,
3563 (char *)&to->to_tsval, sizeof(to->to_tsval));
3564
3565 #if BYTE_ORDER != BIG_ENDIAN
3566 NTOHL(to->to_tsval);
3567 #endif
3568
3569 bcopy((char *)cp + 6,
3570 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
3571
3572 #if BYTE_ORDER != BIG_ENDIAN
3573 NTOHL(to->to_tsecr);
3574 #endif
3575
3576 /*
3577 * A timestamp received in a SYN makes
3578 * it ok to send timestamp requests and replies.
3579 */
3580 if (th->th_flags & TH_SYN) {
3581 tp->t_flags |= TF_RCVD_TSTMP;
3582 tp->ts_recent = to->to_tsval;
3583 tp->ts_recent_age = tcp_now;
3584 }
3585 break;
3586 case TCPOPT_SACK_PERMITTED:
3587 if (!tcp_do_sack ||
3588 optlen != TCPOLEN_SACK_PERMITTED)
3589 continue;
3590 if (th->th_flags & TH_SYN)
3591 to->to_flags |= TOF_SACK;
3592 break;
3593 case TCPOPT_SACK:
3594 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
3595 continue;
3596 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
3597 to->to_sacks = cp + 2;
3598 tcpstat.tcps_sack_rcv_blocks++;
3599
3600 break;
3601 }
3602 }
3603 if (th->th_flags & TH_SYN)
3604 tcp_mss(tp, mss, input_ifscope); /* sets t_maxseg */
3605 }
3606
3607 /*
3608 * Pull out of band byte out of a segment so
3609 * it doesn't appear in the user's data queue.
3610 * It is still reflected in the segment length for
3611 * sequencing purposes.
3612 */
3613 static void
3614 tcp_pulloutofband(so, th, m, off)
3615 struct socket *so;
3616 struct tcphdr *th;
3617 register struct mbuf *m;
3618 int off; /* delayed to be droped hdrlen */
3619 {
3620 int cnt = off + th->th_urp - 1;
3621
3622 while (cnt >= 0) {
3623 if (m->m_len > cnt) {
3624 char *cp = mtod(m, caddr_t) + cnt;
3625 struct tcpcb *tp = sototcpcb(so);
3626
3627 tp->t_iobc = *cp;
3628 tp->t_oobflags |= TCPOOB_HAVEDATA;
3629 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
3630 m->m_len--;
3631 if (m->m_flags & M_PKTHDR)
3632 m->m_pkthdr.len--;
3633 return;
3634 }
3635 cnt -= m->m_len;
3636 m = m->m_next;
3637 if (m == 0)
3638 break;
3639 }
3640 panic("tcp_pulloutofband");
3641 }
3642
3643 uint32_t
3644 get_base_rtt(struct tcpcb *tp)
3645 {
3646 uint32_t base_rtt = 0, i;
3647 for (i = 0; i < N_RTT_BASE; ++i) {
3648 if (tp->rtt_hist[i] != 0 &&
3649 (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
3650 base_rtt = tp->rtt_hist[i];
3651 }
3652 return base_rtt;
3653 }
3654
3655 /* Each value of RTT base represents the minimum RTT seen in a minute.
3656 * We keep upto N_RTT_BASE minutes worth of history.
3657 */
3658 void
3659 update_base_rtt(struct tcpcb *tp, uint32_t rtt)
3660 {
3661 if (++tp->rtt_count >= rtt_samples_per_slot) {
3662 int i=0;
3663 for (i = (N_RTT_BASE-1); i > 0; --i) {
3664 tp->rtt_hist[i] = tp->rtt_hist[i-1];
3665 }
3666 tp->rtt_hist[0] = rtt;
3667 tp->rtt_count = 0;
3668 } else {
3669 tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
3670 }
3671 }
3672
3673 /*
3674 * Collect new round-trip time estimate
3675 * and update averages and current timeout.
3676 */
3677 static void
3678 tcp_xmit_timer(tp, rtt)
3679 register struct tcpcb *tp;
3680 int rtt;
3681 {
3682 register int delta;
3683
3684 tcpstat.tcps_rttupdated++;
3685 tp->t_rttupdated++;
3686
3687 if (rtt > 0) {
3688 tp->t_rttcur = rtt;
3689 update_base_rtt(tp, rtt);
3690 }
3691
3692 if (tp->t_srtt != 0) {
3693 /*
3694 * srtt is stored as fixed point with 5 bits after the
3695 * binary point (i.e., scaled by 32). The following magic
3696 * is equivalent to the smoothing algorithm in rfc793 with
3697 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
3698 * point).
3699 *
3700 * Freebsd adjusts rtt to origin 0 by subtracting 1 from the provided
3701 * rtt value. This was required because of the way t_rtttime was
3702 * initiailised to 1 before. Since we changed t_rtttime to be based on
3703 * tcp_now, this extra adjustment is not needed.
3704 */
3705 delta = (rtt << TCP_DELTA_SHIFT)
3706 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3707
3708 if ((tp->t_srtt += delta) <= 0)
3709 tp->t_srtt = 1;
3710
3711 /*
3712 * We accumulate a smoothed rtt variance (actually, a
3713 * smoothed mean difference), then set the retransmit
3714 * timer to smoothed rtt + 4 times the smoothed variance.
3715 * rttvar is stored as fixed point with 4 bits after the
3716 * binary point (scaled by 16). The following is
3717 * equivalent to rfc793 smoothing with an alpha of .75
3718 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
3719 * rfc793's wired-in beta.
3720 */
3721 if (delta < 0)
3722 delta = -delta;
3723 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3724 if ((tp->t_rttvar += delta) <= 0)
3725 tp->t_rttvar = 1;
3726 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3727 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3728 } else {
3729 /*
3730 * No rtt measurement yet - use the unsmoothed rtt.
3731 * Set the variance to half the rtt (so our first
3732 * retransmit happens at 3*rtt).
3733 */
3734 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3735 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3736 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3737 }
3738 nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, tp->t_rttvar);
3739 tp->t_rtttime = 0;
3740 tp->t_rxtshift = 0;
3741 tp->rxt_start = 0;
3742
3743 /*
3744 * the retransmit should happen at rtt + 4 * rttvar.
3745 * Because of the way we do the smoothing, srtt and rttvar
3746 * will each average +1/2 tick of bias. When we compute
3747 * the retransmit timer, we want 1/2 tick of rounding and
3748 * 1 extra tick because of +-1/2 tick uncertainty in the
3749 * firing of the timer. The bias will give us exactly the
3750 * 1.5 tick we need. But, because the bias is
3751 * statistical, we have to test that we don't drop below
3752 * the minimum feasible timer (which is 2 ticks).
3753 */
3754 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3755 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
3756 TCP_ADD_REXMTSLOP(tp));
3757
3758 /*
3759 * We received an ack for a packet that wasn't retransmitted;
3760 * it is probably safe to discard any error indications we've
3761 * received recently. This isn't quite right, but close enough
3762 * for now (a route might have failed after we sent a segment,
3763 * and the return path might not be symmetrical).
3764 */
3765 tp->t_softerror = 0;
3766 }
3767
3768 static inline unsigned int
3769 tcp_maxmtu(struct rtentry *rt)
3770 {
3771 unsigned int maxmtu;
3772
3773 RT_LOCK_ASSERT_HELD(rt);
3774 if (rt->rt_rmx.rmx_mtu == 0)
3775 maxmtu = rt->rt_ifp->if_mtu;
3776 else
3777 maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
3778
3779 return (maxmtu);
3780 }
3781
3782 #if INET6
3783 static inline unsigned int
3784 tcp_maxmtu6(struct rtentry *rt)
3785 {
3786 unsigned int maxmtu;
3787
3788 RT_LOCK_ASSERT_HELD(rt);
3789 lck_rw_lock_shared(nd_if_rwlock);
3790 if (rt->rt_rmx.rmx_mtu == 0)
3791 maxmtu = IN6_LINKMTU(rt->rt_ifp);
3792 else
3793 maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
3794 lck_rw_done(nd_if_rwlock);
3795
3796 return (maxmtu);
3797 }
3798 #endif
3799
3800 /*
3801 * Determine a reasonable value for maxseg size.
3802 * If the route is known, check route for mtu.
3803 * If none, use an mss that can be handled on the outgoing
3804 * interface without forcing IP to fragment; if bigger than
3805 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
3806 * to utilize large mbufs. If no route is found, route has no mtu,
3807 * or the destination isn't local, use a default, hopefully conservative
3808 * size (usually 512 or the default IP max size, but no more than the mtu
3809 * of the interface), as we can't discover anything about intervening
3810 * gateways or networks. We also initialize the congestion/slow start
3811 * window to be a single segment if the destination isn't local.
3812 * While looking at the routing entry, we also initialize other path-dependent
3813 * parameters from pre-set or cached values in the routing entry.
3814 *
3815 * Also take into account the space needed for options that we
3816 * send regularly. Make maxseg shorter by that amount to assure
3817 * that we can send maxseg amount of data even when the options
3818 * are present. Store the upper limit of the length of options plus
3819 * data in maxopd.
3820 *
3821 * NOTE that this routine is only called when we process an incoming
3822 * segment, for outgoing segments only tcp_mssopt is called.
3823 *
3824 */
3825 void
3826 tcp_mss(tp, offer, input_ifscope)
3827 struct tcpcb *tp;
3828 int offer;
3829 unsigned int input_ifscope;
3830 {
3831 register struct rtentry *rt;
3832 struct ifnet *ifp;
3833 register int rtt, mss;
3834 u_int32_t bufsize;
3835 struct inpcb *inp;
3836 struct socket *so;
3837 struct rmxp_tao *taop;
3838 int origoffer = offer;
3839 u_int32_t sb_max_corrected;
3840 int isnetlocal = 0;
3841 #if INET6
3842 int isipv6;
3843 int min_protoh;
3844 #endif
3845
3846 inp = tp->t_inpcb;
3847 #if INET6
3848 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3849 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
3850 : sizeof (struct tcpiphdr);
3851 #else
3852 #define min_protoh (sizeof (struct tcpiphdr))
3853 #endif
3854
3855 #if INET6
3856 if (isipv6) {
3857 rt = tcp_rtlookup6(inp, input_ifscope);
3858 if (rt != NULL &&
3859 (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
3860 IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
3861 rt->rt_gateway->sa_family == AF_LINK ||
3862 in6_localaddr(&inp->in6p_faddr))) {
3863 tp->t_flags |= TF_LOCAL;
3864 }
3865 }
3866 else
3867 #endif /* INET6 */
3868 {
3869 rt = tcp_rtlookup(inp, input_ifscope);
3870 if (rt != NULL &&
3871 (rt->rt_gateway->sa_family == AF_LINK ||
3872 rt->rt_ifp->if_flags & IFF_LOOPBACK ||
3873 in_localaddr(inp->inp_faddr))) {
3874 tp->t_flags |= TF_LOCAL;
3875 }
3876 }
3877 isnetlocal = (tp->t_flags & TF_LOCAL);
3878
3879 if (rt == NULL) {
3880 tp->t_maxopd = tp->t_maxseg =
3881 #if INET6
3882 isipv6 ? tcp_v6mssdflt :
3883 #endif /* INET6 */
3884 tcp_mssdflt;
3885 return;
3886 }
3887 ifp = rt->rt_ifp;
3888 /*
3889 * Slower link window correction:
3890 * If a value is specificied for slowlink_wsize use it for PPP links
3891 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
3892 * it is the default value adversized by pseudo-devices over ppp.
3893 */
3894 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
3895 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
3896 tp->t_flags |= TF_SLOWLINK;
3897 }
3898 so = inp->inp_socket;
3899
3900 taop = rmx_taop(rt->rt_rmx);
3901 /*
3902 * Offer == -1 means that we didn't receive SYN yet,
3903 * use cached value in that case;
3904 */
3905 if (offer == -1)
3906 offer = taop->tao_mssopt;
3907 /*
3908 * Offer == 0 means that there was no MSS on the SYN segment,
3909 * in this case we use tcp_mssdflt.
3910 */
3911 if (offer == 0)
3912 offer =
3913 #if INET6
3914 isipv6 ? tcp_v6mssdflt :
3915 #endif /* INET6 */
3916 tcp_mssdflt;
3917 else {
3918 /*
3919 * Prevent DoS attack with too small MSS. Round up
3920 * to at least minmss.
3921 */
3922 offer = max(offer, tcp_minmss);
3923 /*
3924 * Sanity check: make sure that maxopd will be large
3925 * enough to allow some data on segments even is the
3926 * all the option space is used (40bytes). Otherwise
3927 * funny things may happen in tcp_output.
3928 */
3929 offer = max(offer, 64);
3930 }
3931 taop->tao_mssopt = offer;
3932
3933 /*
3934 * While we're here, check if there's an initial rtt
3935 * or rttvar. Convert from the route-table units
3936 * to scaled multiples of the slow timeout timer.
3937 */
3938 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
3939 /*
3940 * XXX the lock bit for RTT indicates that the value
3941 * is also a minimum value; this is subject to time.
3942 */
3943 if (rt->rt_rmx.rmx_locks & RTV_RTT)
3944 tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
3945 else
3946 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
3947 tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
3948 tcpstat.tcps_usedrtt++;
3949 if (rt->rt_rmx.rmx_rttvar) {
3950 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
3951 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
3952 tcpstat.tcps_usedrttvar++;
3953 } else {
3954 /* default variation is +- 1 rtt */
3955 tp->t_rttvar =
3956 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
3957 }
3958 TCPT_RANGESET(tp->t_rxtcur,
3959 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
3960 tp->t_rttmin, TCPTV_REXMTMAX,
3961 TCP_ADD_REXMTSLOP(tp));
3962 }
3963 else
3964 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
3965
3966 #if INET6
3967 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
3968 #else
3969 mss = tcp_maxmtu(rt);
3970 #endif
3971 mss -= min_protoh;
3972
3973 if (rt->rt_rmx.rmx_mtu == 0) {
3974 #if INET6
3975 if (isipv6) {
3976 if (!isnetlocal)
3977 mss = min(mss, tcp_v6mssdflt);
3978 } else
3979 #endif /* INET6 */
3980 if (!isnetlocal)
3981 mss = min(mss, tcp_mssdflt);
3982 }
3983
3984 mss = min(mss, offer);
3985 /*
3986 * maxopd stores the maximum length of data AND options
3987 * in a segment; maxseg is the amount of data in a normal
3988 * segment. We need to store this value (maxopd) apart
3989 * from maxseg, because now every segment carries options
3990 * and thus we normally have somewhat less data in segments.
3991 */
3992 tp->t_maxopd = mss;
3993
3994 /*
3995 * origoffer==-1 indicates, that no segments were received yet.
3996 * In this case we just guess.
3997 */
3998 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3999 (origoffer == -1 ||
4000 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
4001 mss -= TCPOLEN_TSTAMP_APPA;
4002 tp->t_maxseg = mss;
4003
4004 /*
4005 * Calculate corrected value for sb_max; ensure to upgrade the
4006 * numerator for large sb_max values else it will overflow.
4007 */
4008 sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
4009
4010 /*
4011 * If there's a pipesize (ie loopback), change the socket
4012 * buffer to that size only if it's bigger than the current
4013 * sockbuf size. Make the socket buffers an integral
4014 * number of mss units; if the mss is larger than
4015 * the socket buffer, decrease the mss.
4016 */
4017 #if RTV_SPIPE
4018 bufsize = rt->rt_rmx.rmx_sendpipe;
4019 if (bufsize < so->so_snd.sb_hiwat)
4020 #endif
4021 bufsize = so->so_snd.sb_hiwat;
4022 if (bufsize < mss)
4023 mss = bufsize;
4024 else {
4025 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
4026 if (bufsize > sb_max_corrected)
4027 bufsize = sb_max_corrected;
4028 (void)sbreserve(&so->so_snd, bufsize);
4029 }
4030 tp->t_maxseg = mss;
4031
4032 #if RTV_RPIPE
4033 bufsize = rt->rt_rmx.rmx_recvpipe;
4034 if (bufsize < so->so_rcv.sb_hiwat)
4035 #endif
4036 bufsize = so->so_rcv.sb_hiwat;
4037 if (bufsize > mss) {
4038 bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
4039 if (bufsize > sb_max_corrected)
4040 bufsize = sb_max_corrected;
4041 (void)sbreserve(&so->so_rcv, bufsize);
4042 }
4043
4044 set_tcp_stream_priority(so);
4045
4046 if (rt->rt_rmx.rmx_ssthresh) {
4047 /*
4048 * There's some sort of gateway or interface
4049 * buffer limit on the path. Use this to set
4050 * the slow start threshhold, but set the
4051 * threshold to no less than 2*mss.
4052 */
4053 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
4054 tcpstat.tcps_usedssthresh++;
4055 } else {
4056 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
4057 }
4058
4059
4060 /*
4061 * Set the slow-start flight size depending on whether this
4062 * is a local network or not.
4063 */
4064 if (CC_ALGO(tp)->cwnd_init != NULL)
4065 CC_ALGO(tp)->cwnd_init(tp);
4066
4067 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, struct tcpcb *, tp,
4068 struct tcphdr *, NULL, int32_t, TCP_CC_CWND_INIT);
4069
4070 /* Route locked during lookup above */
4071 RT_UNLOCK(rt);
4072 }
4073
4074 /*
4075 * Determine the MSS option to send on an outgoing SYN.
4076 */
4077 int
4078 tcp_mssopt(tp)
4079 struct tcpcb *tp;
4080 {
4081 struct rtentry *rt;
4082 int mss;
4083 #if INET6
4084 int isipv6;
4085 int min_protoh;
4086 #endif
4087
4088 #if INET6
4089 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
4090 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
4091 : sizeof (struct tcpiphdr);
4092 #else
4093 #define min_protoh (sizeof (struct tcpiphdr))
4094 #endif
4095
4096 #if INET6
4097 if (isipv6)
4098 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
4099 else
4100 #endif /* INET6 */
4101 rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
4102 if (rt == NULL) {
4103 return (
4104 #if INET6
4105 isipv6 ? tcp_v6mssdflt :
4106 #endif /* INET6 */
4107 tcp_mssdflt);
4108 }
4109 /*
4110 * Slower link window correction:
4111 * If a value is specificied for slowlink_wsize use it for PPP links
4112 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
4113 * it is the default value adversized by pseudo-devices over ppp.
4114 */
4115 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
4116 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
4117 tp->t_flags |= TF_SLOWLINK;
4118 }
4119
4120 #if INET6
4121 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
4122 #else
4123 mss = tcp_maxmtu(rt);
4124 #endif
4125 /* Route locked during lookup above */
4126 RT_UNLOCK(rt);
4127 return (mss - min_protoh);
4128 }
4129
4130 /*
4131 * On a partial ack arrives, force the retransmission of the
4132 * next unacknowledged segment. Do not clear tp->t_dupacks.
4133 * By setting snd_nxt to th_ack, this forces retransmission timer to
4134 * be started again.
4135 */
4136 static void
4137 tcp_newreno_partial_ack(tp, th)
4138 struct tcpcb *tp;
4139 struct tcphdr *th;
4140 {
4141 tcp_seq onxt = tp->snd_nxt;
4142 u_int32_t ocwnd = tp->snd_cwnd;
4143 tp->t_timer[TCPT_REXMT] = 0;
4144 tp->t_rtttime = 0;
4145 tp->snd_nxt = th->th_ack;
4146 /*
4147 * Set snd_cwnd to one segment beyond acknowledged offset
4148 * (tp->snd_una has not yet been updated when this function
4149 * is called)
4150 */
4151 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
4152 tp->t_flags |= TF_ACKNOW;
4153 (void) tcp_output(tp);
4154 tp->snd_cwnd = ocwnd;
4155 if (SEQ_GT(onxt, tp->snd_nxt))
4156 tp->snd_nxt = onxt;
4157 /*
4158 * Partial window deflation. Relies on fact that tp->snd_una
4159 * not updated yet.
4160 */
4161 if (tp->snd_cwnd > th->th_ack - tp->snd_una)
4162 tp->snd_cwnd -= th->th_ack - tp->snd_una;
4163 else
4164 tp->snd_cwnd = 0;
4165 tp->snd_cwnd += tp->t_maxseg;
4166
4167 }
4168
4169 /*
4170 * Drop a random TCP connection that hasn't been serviced yet and
4171 * is eligible for discard. There is a one in qlen chance that
4172 * we will return a null, saying that there are no dropable
4173 * requests. In this case, the protocol specific code should drop
4174 * the new request. This insures fairness.
4175 *
4176 * The listening TCP socket "head" must be locked
4177 */
4178 static int
4179 tcp_dropdropablreq(struct socket *head)
4180 {
4181 struct socket *so, *sonext;
4182 unsigned int i, j, qlen;
4183 static int rnd;
4184 static struct timeval old_runtime;
4185 static unsigned int cur_cnt, old_cnt;
4186 struct timeval tv;
4187 struct inpcb *inp = NULL;
4188 struct tcpcb *tp;
4189
4190 if ((head->so_options & SO_ACCEPTCONN) == 0)
4191 return 0;
4192
4193 so = TAILQ_FIRST(&head->so_incomp);
4194 if (!so)
4195 return 0;
4196
4197 microtime(&tv);
4198 if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
4199 old_runtime = tv;
4200 old_cnt = cur_cnt / i;
4201 cur_cnt = 0;
4202 }
4203
4204
4205 qlen = head->so_incqlen;
4206 if (++cur_cnt > qlen || old_cnt > qlen) {
4207 rnd = (314159 * rnd + 66329) & 0xffff;
4208 j = ((qlen + 1) * rnd) >> 16;
4209
4210 while (j-- && so)
4211 so = TAILQ_NEXT(so, so_list);
4212 }
4213 /* Find a connection that is not already closing (or being served) */
4214 while (so) {
4215 inp = (struct inpcb *)so->so_pcb;
4216
4217 sonext = TAILQ_NEXT(so, so_list);
4218
4219 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
4220 /* Avoid the issue of a socket being accepted by one input thread
4221 * and being dropped by another input thread.
4222 * If we can't get a hold on this mutex, then grab the next socket in line.
4223 */
4224 if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
4225 so->so_usecount++;
4226 if ((so->so_usecount == 2) &&
4227 (so->so_state & SS_INCOMP) != 0 &&
4228 (so->so_flags & SOF_INCOMP_INPROGRESS) == 0)
4229 break;
4230 else {/* don't use if being accepted or used in any other way */
4231 in_pcb_checkstate(inp, WNT_RELEASE, 1);
4232 tcp_unlock(so, 1, 0);
4233 }
4234 }
4235 else {
4236 /* do not try to lock the inp in in_pcb_checkstate
4237 * because the lock is already held in some other thread.
4238 * Only drop the inp_wntcnt reference.
4239 */
4240 in_pcb_checkstate(inp, WNT_RELEASE, 1);
4241 }
4242 }
4243 so = sonext;
4244
4245 }
4246 if (!so)
4247 return 0;
4248
4249 /* Makes sure socket is still in the right state to be discarded */
4250
4251 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
4252 tcp_unlock(so, 1, 0);
4253 return 0;
4254 }
4255
4256 if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
4257 /* do not discard: that socket is being accepted */
4258 tcp_unlock(so, 1, 0);
4259 return 0;
4260 }
4261
4262 TAILQ_REMOVE(&head->so_incomp, so, so_list);
4263 tcp_unlock(head, 0, 0);
4264
4265 lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
4266 tp = sototcpcb(so);
4267 so->so_flags |= SOF_OVERFLOW;
4268 so->so_head = NULL;
4269
4270 tcp_close(tp);
4271 tp->t_unacksegs = 0;
4272
4273 if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
4274 /* Some one has a wantcnt on this pcb. Since WNT_ACQUIRE
4275 * doesn't require a lock, it could have happened while
4276 * we are holding the lock. This pcb will have to
4277 * be garbage collected later.
4278 * Release the reference held for so_incomp queue
4279 */
4280 so->so_usecount--;
4281
4282 tcp_unlock(so, 1, 0);
4283 } else {
4284 /* Unlock this socket and leave the reference on. We need to
4285 * acquire the pcbinfo lock in order to fully dispose it off
4286 */
4287 tcp_unlock(so, 0, 0);
4288
4289 lck_rw_lock_exclusive(tcbinfo.mtx);
4290
4291 tcp_lock(so, 0, 0);
4292
4293 /* Release the reference held for so_incomp queue */
4294 so->so_usecount--;
4295
4296 if (so->so_usecount != 1 ||
4297 (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING)) {
4298 /* There is an extra wantcount or usecount that must
4299 * have been added when the socket was unlocked. This
4300 * socket will have to be garbage collected later
4301 */
4302 tcp_unlock(so, 1, 0);
4303 } else {
4304
4305 /* Drop the reference held for this function */
4306 so->so_usecount--;
4307
4308 in_pcbdispose(inp);
4309 }
4310 lck_rw_done(tcbinfo.mtx);
4311 }
4312 tcpstat.tcps_drops++;
4313
4314 tcp_lock(head, 0, 0);
4315 head->so_incqlen--;
4316 head->so_qlen--;
4317 return(1);
4318 }
4319
4320 /* Set background congestion control on a socket */
4321 void
4322 tcp_set_background_cc(struct socket *so)
4323 {
4324 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
4325 }
4326
4327 /* Set foreground congestion control on a socket */
4328 void
4329 tcp_set_foreground_cc(struct socket *so)
4330 {
4331 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
4332 }
4333
4334 static void
4335 tcp_set_new_cc(struct socket *so, uint16_t cc_index)
4336 {
4337 struct inpcb *inp = sotoinpcb(so);
4338 struct tcpcb *tp = intotcpcb(inp);
4339 uint16_t old_cc_index = 0;
4340 if (tp->tcp_cc_index != cc_index) {
4341
4342 old_cc_index = tp->tcp_cc_index;
4343
4344 if (CC_ALGO(tp)->cleanup != NULL)
4345 CC_ALGO(tp)->cleanup(tp);
4346 tp->tcp_cc_index = cc_index;
4347
4348 /* Decide if the connection is just starting or if
4349 * we have sent some packets on it.
4350 */
4351 if (tp->snd_nxt > tp->iss) {
4352 /* Already sent some packets */
4353 if (CC_ALGO(tp)->switch_to != NULL)
4354 CC_ALGO(tp)->switch_to(tp, old_cc_index);
4355 } else {
4356 if (CC_ALGO(tp)->init != NULL)
4357 CC_ALGO(tp)->init(tp);
4358 }
4359 DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
4360 struct tcpcb *, tp, struct tcphdr *, NULL,
4361 int32_t, TCP_CC_CHANGE_ALGO);
4362 }
4363 }
4364
4365 static int
4366 tcp_getstat SYSCTL_HANDLER_ARGS
4367 {
4368 #pragma unused(oidp, arg1, arg2)
4369
4370 int error;
4371
4372 if (req->oldptr == 0) {
4373 req->oldlen= (size_t)sizeof(struct tcpstat);
4374 }
4375
4376 error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
4377
4378 return (error);
4379
4380 }
4381
4382 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
4383 tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
4384
4385 static int
4386 sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
4387 {
4388 #pragma unused(arg1, arg2)
4389
4390 int error, val = tcprexmtthresh;
4391
4392 error = sysctl_handle_int(oidp, &val, 0, req);
4393 if (error || !req->newptr)
4394 return (error);
4395
4396 /*
4397 * Constrain the number of duplicate ACKs
4398 * to consider for TCP fast retransmit
4399 * to either 2 or 3
4400 */
4401
4402 if (val < 2 || val > 3)
4403 return (EINVAL);
4404
4405 tcprexmtthresh = val;
4406
4407 return (0);
4408 }
4409
4410 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
4411 &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit");