/*
 * apple/xnu -- bsd/netinet/tcp_subr.c
 * (gitweb export, commit 248b90d8707280fc8c71d5f7d9a7a0f5480dee37)
 */
1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
32 * The Regents of the University of California. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the University of
45 * California, Berkeley and its contributors.
46 * 4. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 *
62 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
63 * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $
64 */
65
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/callout.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #if INET6
75 #include <sys/domain.h>
76 #endif
77 #include <sys/proc.h>
78 #include <sys/kauth.h>
79 #include <sys/socket.h>
80 #include <sys/socketvar.h>
81 #include <sys/protosw.h>
82 #include <sys/random.h>
83 #include <sys/syslog.h>
84 #include <kern/locks.h>
85
86
87
88 #include <net/route.h>
89 #include <net/if.h>
90
91 #define _IP_VHL
92 #include <netinet/in.h>
93 #include <netinet/in_systm.h>
94 #include <netinet/ip.h>
95 #if INET6
96 #include <netinet/ip6.h>
97 #endif
98 #include <netinet/in_pcb.h>
99 #if INET6
100 #include <netinet6/in6_pcb.h>
101 #endif
102 #include <netinet/in_var.h>
103 #include <netinet/ip_var.h>
104 #if INET6
105 #include <netinet6/ip6_var.h>
106 #endif
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_fsm.h>
109 #include <netinet/tcp_seq.h>
110 #include <netinet/tcp_timer.h>
111 #include <netinet/tcp_var.h>
112 #if INET6
113 #include <netinet6/tcp6_var.h>
114 #endif
115 #include <netinet/tcpip.h>
116 #if TCPDEBUG
117 #include <netinet/tcp_debug.h>
118 #endif
119 #include <netinet6/ip6protosw.h>
120
121 #if IPSEC
122 #include <netinet6/ipsec.h>
123 #if INET6
124 #include <netinet6/ipsec6.h>
125 #endif
126 #endif /*IPSEC*/
127
128 #include <sys/md5.h>
129 #include <sys/kdebug.h>
130
131 #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
132
133 extern int tcp_lq_overflow;
134
135 /* temporary: for testing */
136 #if IPSEC
137 extern int ipsec_bypass;
138 extern lck_mtx_t *sadb_mutex;
139 #endif
140
141 int tcp_mssdflt = TCP_MSS;
142 SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
143 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
144
145 #if INET6
146 int tcp_v6mssdflt = TCP6_MSS;
147 SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
148 CTLFLAG_RW, &tcp_v6mssdflt , 0,
149 "Default TCP Maximum Segment Size for IPv6");
150 #endif
151
152 /*
153 * Minimum MSS we accept and use. This prevents DoS attacks where
154 * we are forced to a ridiculous low MSS like 20 and send hundreds
155 * of packets instead of one. The effect scales with the available
156 * bandwidth and quickly saturates the CPU and network interface
157 * with packet generation and sending. Set to zero to disable MINMSS
158 * checking. This setting prevents us from sending too small packets.
159 */
160 int tcp_minmss = TCP_MINMSS;
161 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
162 &tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
163
164 /*
165 * Number of TCP segments per second we accept from remote host
166 * before we start to calculate average segment size. If average
167 * segment size drops below the minimum TCP MSS we assume a DoS
168 * attack and reset+drop the connection. Care has to be taken not to
169 * set this value too small to not kill interactive type connections
170 * (telnet, SSH) which send many small packets.
171 */
172 #ifdef FIX_WORKAROUND_FOR_3894301
173 __private_extern__ int tcp_minmssoverload = TCP_MINMSSOVERLOAD;
174 #else
175 __private_extern__ int tcp_minmssoverload = 0;
176 #endif
177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW,
178 &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
179 "be under the MINMSS Size");
180
181 static int tcp_do_rfc1323 = 1;
182 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
183 &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
184
185 static int tcp_do_rfc1644 = 0;
186 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
187 &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
188
189 static int tcp_tcbhashsize = 0;
190 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
191 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
192
193 static int do_tcpdrain = 0;
194 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
195 "Enable tcp_drain routine for extra help when low on mbufs");
196
197 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
198 &tcbinfo.ipi_count, 0, "Number of active PCBs");
199
200 static int icmp_may_rst = 1;
201 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
202 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
203
204 static int tcp_strict_rfc1948 = 0;
205 SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW,
206 &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
207
208 static int tcp_isn_reseed_interval = 0;
209 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
210 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
211
212 static void tcp_cleartaocache(void);
213 static void tcp_notify(struct inpcb *, int);
214 struct zone *sack_hole_zone;
215
216 /*
217 * Target size of TCP PCB hash tables. Must be a power of two.
218 *
219 * Note that this can be overridden by the kernel environment
220 * variable net.inet.tcp.tcbhashsize
221 */
222 #ifndef TCBHASHSIZE
223 #define TCBHASHSIZE 4096
224 #endif
225
226 /*
227 * This is the actual shape of what we allocate using the zone
228 * allocator. Doing it this way allows us to protect both structures
229 * using the same generation count, and also eliminates the overhead
230 * of allocating tcpcbs separately. By hiding the structure here,
231 * we avoid changing most of the rest of the code (although it needs
232 * to be changed, eventually, for greater efficiency).
233 */
234 #define ALIGNMENT 32
235 #define ALIGNM1 (ALIGNMENT - 1)
236 struct inp_tp {
237 union {
238 struct inpcb inp;
239 char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
240 } inp_tp_u;
241 struct tcpcb tcb;
242 };
243 #undef ALIGNMENT
244 #undef ALIGNM1
245
246 static struct tcpcb dummy_tcb;
247
248
249 extern struct inpcbhead time_wait_slots[];
250 extern int cur_tw_slot;
251 extern u_long *delack_bitmask;
252 extern u_long route_generation;
253
254
255 int get_inpcb_str_size()
256 {
257 return sizeof(struct inpcb);
258 }
259
260
261 int get_tcp_str_size()
262 {
263 return sizeof(struct tcpcb);
264 }
265
266 int tcp_freeq(struct tcpcb *tp);
267
268
/*
 * Tcp initialization: set timer defaults, seed the internal clock,
 * build the PCB hash tables and zones, and set up the locking used
 * to protect the global TCP pcb list.  Called once at boot.
 */
void
tcp_init()
{
	int hashsize = TCBHASHSIZE;
	vm_size_t str_size;
	int i;
	struct inpcbinfo *pcbinfo;

	tcp_ccgen = 1;
	tcp_cleartaocache();

	/* Default timer intervals (units defined by the TCPTV_* macros). */
	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	/* Mask keeps the clock value non-negative as a signed 32-bit int. */
	read_random(&tcp_now, sizeof(tcp_now));
	tcp_now = tcp_now & 0x7fffffff; /* Starts tcp internal 500ms clock at a random value */

	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	pcbinfo = &tcbinfo;
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
					&tcbinfo.porthashmask);
	/* Each zone element holds an inpcb + tcpcb pair (struct inp_tp). */
	str_size = (vm_size_t) sizeof(struct inp_tp);
	tcbinfo.ipi_zone = (void *) zinit(str_size, 120000*str_size, 8192, "tcpcb");
	/*
	 * NOTE(review): sack_hole_zone elements are sized with
	 * sizeof(struct inp_tp) rather than sizeof(struct sackhole);
	 * wasteful if unintentional -- confirm before changing.
	 */
	sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone");
	tcp_reass_maxseg = nmbclusters / 16;

#if INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	/* A TCP/IP header must fit in a single mbuf header's data area. */
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
	dummy_tcb.t_state = TCP_NSTATES;
	dummy_tcb.t_flags = 0;
	tcbinfo.dummy_cb = (caddr_t) &dummy_tcb;

	/*
	 * allocate lock group attribute and group for tcp pcb mutexes
	 */
	pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init();
	lck_grp_attr_setdefault(pcbinfo->mtx_grp_attr);
	pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr);

	/*
	 * allocate the lock attribute for tcp pcb mutexes
	 */
	pcbinfo->mtx_attr = lck_attr_alloc_init();
	lck_attr_setdefault(pcbinfo->mtx_attr);

	if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) {
		printf("tcp_init: mutex not alloced!\n");
		return;	/* pretty much dead if this fails... */
	}

	in_pcb_nat_init(&tcbinfo, AF_INET, IPPROTO_TCP, SOCK_STREAM);

	/* Delayed-ack bitmask: allocated in bytes, indexed as 32-bit words. */
	delack_bitmask = _MALLOC((4 * hashsize)/32, M_PCB, M_WAITOK);
	if (delack_bitmask == 0)
	     panic("Delack Memory");

	for (i=0; i < (tcbinfo.hashsize / 32); i++)
	         delack_bitmask[i] = 0;

	for (i=0; i < N_TIME_WAIT_SLOTS; i++) {
	     LIST_INIT(&time_wait_slots[i]);
	}
}
356
/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 *
 * ip_ptr and tcp_ptr must point at buffers large enough for an
 * ip/ip6 header and a tcphdr respectively; no length fields beyond
 * the defaults below are filled in.
 */
void
tcp_fillheaders(tp, ip_ptr, tcp_ptr)
	struct tcpcb *tp;
	void *ip_ptr;
	void *tcp_ptr;
{
	struct inpcb *inp = tp->t_inpcb;
	struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;

#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)ip_ptr;
		/* Keep non-flowinfo bits, merge in flowinfo from the pcb. */
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
			(IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = sizeof(struct tcphdr);
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
		/* v6 checksum is computed in full later (see tcp_respond). */
		tcp_hdr->th_sum = 0;
	} else
#endif
	{
		struct ip *ip = (struct ip *) ip_ptr;

		ip->ip_vhl = IP_VHL_BORING;
		ip->ip_tos = 0;
		ip->ip_len = 0;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = 0;
		ip->ip_sum = 0;
		ip->ip_p = IPPROTO_TCP;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst = inp->inp_faddr;
		/*
		 * Pre-seed th_sum with the pseudo-header sum (addresses,
		 * protocol, header length) so the output path can finish
		 * the checksum over the segment (CSUM_TCP path).
		 */
		tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(sizeof(struct tcphdr) + IPPROTO_TCP));
	}

	tcp_hdr->th_sport = inp->inp_lport;
	tcp_hdr->th_dport = inp->inp_fport;
	tcp_hdr->th_seq = 0;
	tcp_hdr->th_ack = 0;
	tcp_hdr->th_x2 = 0;
	tcp_hdr->th_off = 5;	/* 5 words = 20-byte header, no options */
	tcp_hdr->th_flags = 0;
	tcp_hdr->th_win = 0;
	tcp_hdr->th_urp = 0;
}
414
/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
 * use for this function is in keepalives, which use tcp_respond.
 *
 * Returns NULL if no mbuf is available; the caller owns (and must
 * eventually free) the mbuf backing the returned template.
 */
struct tcptemp *
tcp_maketemplate(tp)
	struct tcpcb *tp;
{
	struct mbuf *m;
	struct tcptemp *n;

	/* M_DONTWAIT: may fail under memory pressure; caller handles NULL. */
	m = m_get(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (0);
	/* NOTE(review): assumes sizeof(struct tcptemp) <= MLEN -- confirm. */
	m->m_len = sizeof(struct tcptemp);
	n = mtod(m, struct tcptemp *);

	tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
	return (n);
}
436
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header. If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection. If flags are given then we send
 * a message back to the TCP which originated the * segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#if INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

#if INET6
	/* Decide v4 vs v6 from the version nibble of the supplied header. */
	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		/* Advertise the current receive window, except on RSTs. */
		if (!(flags & TH_RST)) {
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
#if INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
		/* No pcb: use a zeroed on-stack route, freed at the end. */
#if INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
		{
			ro = &sro;
			bzero(ro, sizeof *ro);
		}
	}
	if (m == 0) {
		/* Keepalive path: build a fresh segment from the template. */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#if INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		/*
		 * Reply path: reuse the received mbuf in place, dropping
		 * any chained mbufs and swapping source/destination.
		 */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#if INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
	/* Fill in length/TTL; v6 plen excludes the IPv6 header itself. */
#if INET6
	if (isipv6) {
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
	{
		tlen += sizeof (struct tcpiphdr);
		ip->ip_len = tlen;
		ip->ip_ttl = ip_defttl;
	}
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = 0;
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#if INET6
	if (isipv6) {
		/* v6: compute the full checksum in software here. */
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
					       ro6 && ro6->ro_rt ?
					       ro6->ro_rt->rt_ifp :
					       NULL);
	} else
#endif /* INET6 */
	{
		/* v4: seed the pseudo-header sum and let the output path
		 * (CSUM_TCP) finish the checksum. */
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	}
#if TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#if IPSEC
	if (ipsec_bypass == 0 && ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
		m_freem(m);
		return;
	}
#endif
#if INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, 0);
		/* Drop any route reference the on-stack route picked up. */
		if (ro6 == &sro6 && ro6->ro_rt) {
			rtfree(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
	{
		(void) ip_output_list(m, 0, NULL, ro, ipflags, NULL);
		if (ro == &sro && ro->ro_rt) {
			rtfree(ro->ro_rt);
			ro->ro_rt = NULL;
		}
	}
}
624
/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block. The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 *
 * Returns the initialized tcpcb, also linked from inp->inp_ppcb.
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct inp_tp *it;
	register struct tcpcb *tp;
	register struct socket *so = inp->inp_socket;
#if INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	/*
	 * The tcpcb either lives in the combined inp_tp allocation, or
	 * was saved by the socket layer (cached_in_sock_layer) in a
	 * previous tcp_close() -- see inp_saved_ppcb there.
	 */
	if (so->cached_in_sock_layer == 0) {
	     it = (struct inp_tp *)inp;
	     tp = &it->tcb;
	}
	else
	     tp = (struct tcpcb *) inp->inp_saved_ppcb;

	bzero((char *) tp, sizeof(struct tcpcb));
	LIST_INIT(&tp->t_segq);
	tp->t_maxseg = tp->t_maxopd =
#if INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	tp->sack_enable = tcp_do_sack;
	TAILQ_INIT(&tp->snd_holes);
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = TCPTV_MIN;
	tp->t_rxtcur = TCPTV_RTOBASE;
	/* NOTE(review): cwnd/ssthresh start at the maximum; presumably
	 * clamped once the connection learns its real window -- confirm. */
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = 0;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}
683
684 /*
685 * Drop a TCP connection, reporting
686 * the specified error. If connection is synchronized,
687 * then send a RST to peer.
688 */
689 struct tcpcb *
690 tcp_drop(tp, errno)
691 register struct tcpcb *tp;
692 int errno;
693 {
694 struct socket *so = tp->t_inpcb->inp_socket;
695
696 if (TCPS_HAVERCVDSYN(tp->t_state)) {
697 tp->t_state = TCPS_CLOSED;
698 (void) tcp_output(tp);
699 tcpstat.tcps_drops++;
700 } else
701 tcpstat.tcps_conndrops++;
702 if (errno == ETIMEDOUT && tp->t_softerror)
703 errno = tp->t_softerror;
704 so->so_error = errno;
705 return (tcp_close(tp));
706 }
707
/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 *
 * Also caches RTT/RTTVAR/ssthresh into the route's metrics when
 * enough samples were collected.  Always returns NULL so callers
 * can write "tp = tcp_close(tp);".
 */
struct tcpcb *
tcp_close(tp)
	register struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#if INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	register struct rtentry *rt;
	int dosavessthresh;

	if ( inp->inp_ppcb == NULL) /* tcp_close was called previously, bail */
		return NULL;

	/* Clear the timers before we delete the PCB. */
	{
		int i;
		for (i = 0; i < TCPT_NTIMERS; i++) {
			tp->t_timer[i] = 0;
		}
	}

	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0);
	/* NOTE(review): this switch is a no-op -- every listed state just
	 * breaks; looks like vestigial code. */
	switch (tp->t_state)
	{
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_CLOSE_WAIT:
	case TCPS_LAST_ACK:
	     break;
	}

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as the 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#if INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		rt = inp->inp_route.ro_rt;
		/* Skip caching on the default route or a stale-generation route. */
		if (rt == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY || rt->generation_id != route_generation) {
			if (tp->t_state >= TCPS_CLOSE_WAIT)
				tp->t_state = TCPS_CLOSING;

			goto no_valid_rt;
		}

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshhold
		 * got below half the pipesize. I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace. In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#if INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#if INET6
				       )
#endif
				      );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
	rt = inp->inp_route.ro_rt;
	if (rt) {
		/*
		 * mark route for deletion if no information is
		 * cached.
		 */
		if ((tp->t_flags & TF_LQ_OVERFLOW) && tcp_lq_overflow &&
		    ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){
			if (rt->rt_rmx.rmx_rtt == 0)
				rt->rt_flags |= RTF_DELCLONE;
		}
	}
    no_valid_rt:
	/* free the reassembly queue, if any */
	(void) tcp_freeq(tp);

	tcp_free_sackholes(tp);

#ifdef __APPLE__
	/* Hand the tcpcb back to the socket layer's cache for reuse
	 * by a future tcp_newtcpcb() on this socket. */
	if (so->cached_in_sock_layer)
	    inp->inp_saved_ppcb = (caddr_t) tp;
#endif

	soisdisconnected(so);
#if INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif /* INET6 */
	in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0);
	return ((struct tcpcb *)0);
}
891
892 int
893 tcp_freeq(tp)
894 struct tcpcb *tp;
895 {
896
897 register struct tseg_qent *q;
898 int rv = 0;
899
900 while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
901 LIST_REMOVE(q, tqe_q);
902 m_freem(q->tqe_m);
903 FREE(q, M_TSEGQ);
904 tcp_reass_qsize--;
905 rv = 1;
906 }
907 return (rv);
908 }
909
/*
 * Reclaim memory when mbufs are scarce by flushing every connection's
 * reassembly queue.  Gated by the net.inet.tcp.do_tcpdrain sysctl,
 * which defaults to 0 (disabled) because of the locking issue below.
 */
void
tcp_drain()
{
	/*
	 * ###LD 05/19/04 locking issue, tcpdrain is disabled, deadlock situation with tcbinfo.mtx
	 */
	if (do_tcpdrain)
	{
		struct inpcb *inpb;
		struct tcpcb *tcpb;
		struct tseg_qent *te;

		/*
		 * Walk the tcpbs, if existing, and flush the reassembly queue,
		 * if there is one...
		 * XXX: The "Net/3" implementation doesn't imply that the TCP
		 *      reassembly queue should be flushed, but in a situation
		 *	where we're really low on mbufs, this is potentially
		 *  	usefull.
		 */
		lck_rw_lock_exclusive(tcbinfo.mtx);
		for (inpb = LIST_FIRST(tcbinfo.listhead); inpb;
		    inpb = LIST_NEXT(inpb, inp_list)) {
			if ((tcpb = intotcpcb(inpb))) {
				while ((te = LIST_FIRST(&tcpb->t_segq))
				       != NULL) {
					LIST_REMOVE(te, tqe_q);
					m_freem(te->tqe_m);
					FREE(te, M_TSEGQ);
					tcp_reass_qsize--;
				}
			}
		}
		lck_rw_done(tcbinfo.mtx);

	}
}
947
/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
static void
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	struct tcpcb *tp;

	if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD))
		return; /* pcb is gone already */

	/* NOTE(review): tp is dereferenced below without a NULL check --
	 * confirm every caller guarantees inp_ppcb != NULL here. */
	tp = (struct tcpcb *)inp->inp_ppcb;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now. This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	     (error == EHOSTUNREACH || error == ENETUNREACH ||
	      error == EHOSTDOWN)) {
		return;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror)
		tcp_drop(tp, error);
	else
		tp->t_softerror = error;
#if 0
	wakeup((caddr_t) &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
#endif
}
990
/*
 * sysctl handler: export the list of active TCP pcbs to userland as a
 * stream of struct xtcpcb records, bracketed by two struct xinpgen
 * generation markers so the reader can detect concurrent changes.
 */
static int
tcp_pcblist SYSCTL_HANDLER_ARGS
{
	int error, i, n;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	lck_rw_lock_shared(tcbinfo.mtx);
	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe only: report an estimate with ~12.5% slack. */
		n = tcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
			+ (n + n/8) * sizeof(struct xtcpcb);
		lck_rw_done(tcbinfo.mtx);
		return 0;
	}

	/* This sysctl is read-only; reject any attempt to write. */
	if (req->newptr != USER_ADDR_NULL) {
		lck_rw_done(tcbinfo.mtx);
		return EPERM;
	}

	/*
	 * OK, now we're committed to doing something.
	 */
	gencnt = tcbinfo.ipi_gencnt;
	n = tcbinfo.ipi_count;

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error) {
		lck_rw_done(tcbinfo.mtx);
		return error;
	}
	/*
	 * We are done if there is no pcb
	 */
	if (n == 0) {
		lck_rw_done(tcbinfo.mtx);
		return 0;
	}

	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0) {
		lck_rw_done(tcbinfo.mtx);
		return ENOMEM;
	}

	/* First pass: collect live pcbs no newer than our snapshot. */
	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
	     inp = LIST_NEXT(inp, inp_list)) {
#ifdef __APPLE__
		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
#else
		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
#endif
			inp_list[i++] = inp;
	}
	n = i;

	/* Second pass: convert each pcb to its exportable form and copy out. */
	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
			struct xtcpcb xt;
			caddr_t inp_ppcb;

			bzero(&xt, sizeof(xt));
			xt.xt_len = sizeof xt;
			/* XXX should avoid extra copy */
			inpcb_to_compat(inp, &xt.xt_inp);
			inp_ppcb = inp->inp_ppcb;
			if (inp_ppcb != NULL) {
				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
			}
			else
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xt.xt_socket);
			error = SYSCTL_OUT(req, &xt, sizeof xt);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		bzero(&xig, sizeof(xig));
		xig.xig_len = sizeof xig;
		xig.xig_gen = tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = tcbinfo.ipi_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	FREE(inp_list, M_TEMP);
	lck_rw_done(tcbinfo.mtx);
	return error;
}
1099
/* net.inet.tcp.pcblist (read-only): export the list of active TCP PCBs
 * via tcp_pcblist() as an array of struct xtcpcb. */
SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1102
1103 #ifndef __APPLE__
1104 static int
1105 tcp_getcred(SYSCTL_HANDLER_ARGS)
1106 {
1107 struct sockaddr_in addrs[2];
1108 struct inpcb *inp;
1109 int error, s;
1110
1111 error = suser(req->p);
1112 if (error)
1113 return (error);
1114 error = SYSCTL_IN(req, addrs, sizeof(addrs));
1115 if (error)
1116 return (error);
1117 s = splnet();
1118 inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
1119 addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
1120 if (inp == NULL || inp->inp_socket == NULL) {
1121 error = ENOENT;
1122 goto out;
1123 }
1124 error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(*(kauth_cred_t)0);
1125 out:
1126 splx(s);
1127 return (error);
1128 }
1129
1130 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
1131 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection");
1132
1133 #if INET6
1134 static int
1135 tcp6_getcred(SYSCTL_HANDLER_ARGS)
1136 {
1137 struct sockaddr_in6 addrs[2];
1138 struct inpcb *inp;
1139 int error, s, mapped = 0;
1140
1141 error = suser(req->p);
1142 if (error)
1143 return (error);
1144 error = SYSCTL_IN(req, addrs, sizeof(addrs));
1145 if (error)
1146 return (error);
1147 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
1148 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
1149 mapped = 1;
1150 else
1151 return (EINVAL);
1152 }
1153 s = splnet();
1154 if (mapped == 1)
1155 inp = in_pcblookup_hash(&tcbinfo,
1156 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
1157 addrs[1].sin6_port,
1158 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
1159 addrs[0].sin6_port,
1160 0, NULL);
1161 else
1162 inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
1163 addrs[1].sin6_port,
1164 &addrs[0].sin6_addr, addrs[0].sin6_port,
1165 0, NULL);
1166 if (inp == NULL || inp->inp_socket == NULL) {
1167 error = ENOENT;
1168 goto out;
1169 }
1170 error = SYSCTL_OUT(req, inp->inp_socket->so_cred,
1171 sizeof(*(kauth_cred_t)0);
1172 out:
1173 splx(s);
1174 return (error);
1175 }
1176
1177 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
1178 0, 0,
1179 tcp6_getcred, "S,ucred", "Get the ucred of a TCP6 connection");
1180 #endif
1181 #endif /* __APPLE__*/
1182
/*
 * Protocol control input for TCP over IPv4: translate a PRC_* event
 * (typically derived from an incoming ICMP message) into an action on
 * the affected connection(s).
 *
 * cmd - PRC_* event code
 * sa  - foreign address the event refers to (must be AF_INET)
 * vip - pointer to the quoted IP header of the offending packet, or
 *       NULL when the event is not tied to a specific packet
 */
void
tcp_ctlinput(cmd, sa, vip)
	int cmd;
	struct sockaddr *sa;
	void *vip;
{
	struct ip *ip = vip;
	struct tcphdr *th;
	struct in_addr faddr;
	struct inpcb *inp;
	struct tcpcb *tp;
	void (*notify)(struct inpcb *, int) = tcp_notify;
	tcp_seq icmp_seq;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	/* Map the PRC_* code to the per-connection notify action. */
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
		cmd == PRC_UNREACH_PORT) && ip)
		notify = tcp_drop_syn_sent;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (PRC_IS_REDIRECT(cmd)) {
		/* Redirects apply to the route, not one connection. */
		ip = 0;
		notify = in_rtchange;
	} else if (cmd == PRC_HOSTDEAD)
		ip = 0;
	/*
	 * NOTE(review): `> PRC_NCMDS` admits index PRC_NCMDS itself;
	 * later BSDs use `>=` here — confirm inetctlerrmap really has
	 * PRC_NCMDS+1 entries before relying on this bound.
	 */
	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;
	if (ip) {
		/*
		 * A specific packet was quoted: locate the TCP header
		 * past the (variable-length) IP header and notify only
		 * the matching connection.
		 */
		th = (struct tcphdr *)((caddr_t)ip
				       + (IP_VHL_HL(ip->ip_vhl) << 2));
		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
		    ip->ip_src, th->th_sport, 0, NULL);
		if (inp != NULL && inp->inp_socket != NULL) {
			tcp_lock(inp->inp_socket, 1, 0);
			if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
				tcp_unlock(inp->inp_socket, 1, 0);
				return;
			}
			/*
			 * NOTE(review): th_seq is read straight from the
			 * wire, so this is a byte-order normalization;
			 * later BSDs spell it ntohl() (same result) —
			 * confirm intent.
			 */
			icmp_seq = htonl(th->th_seq);
			tp = intotcpcb(inp);
			/*
			 * Only act if the quoted sequence number falls
			 * inside the current send window; this guards
			 * against forged/stale ICMP messages.
			 */
			if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
			    SEQ_LT(icmp_seq, tp->snd_max))
				(*notify)(inp, inetctlerrmap[cmd]);
			tcp_unlock(inp->inp_socket, 1, 0);
		}
	} else
		/* No quoted packet: notify every PCB for this peer. */
		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
}
1236
1237 #if INET6
/*
 * Protocol control input for TCP over IPv6: translate a PRC_* event
 * (usually decoded from ICMPv6 via struct ip6ctlparam) into a
 * notification on the affected connection(s).
 *
 * cmd - PRC_* event code
 * sa  - foreign address (must be AF_INET6 with a valid sa_len)
 * d   - struct ip6ctlparam * describing the quoted packet, or NULL
 */
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	void (*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	/* Only the two port fields of the quoted TCP header are needed. */
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	/* Map the PRC_* code to the per-connection notify action. */
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	/*
	 * NOTE(review): as in tcp_ctlinput, `> PRC_NCMDS` (rather than
	 * `>=`) admits index PRC_NCMDS — confirm the table bound.
	 */
	else if (!PRC_IS_REDIRECT(cmd) &&
		 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		/*
		 * XXX: We assume that when IPV6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		/* Pull just the port pair out of the quoted segment. */
		bzero(&th, sizeof(th));
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcbinfo, sa, th.th_dport,
		    (struct sockaddr *)ip6cp->ip6c_src,
		    th.th_sport, cmd, notify);
	} else
		/* No quoted packet: wildcard-notify for this peer. */
		in6_pcbnotify(&tcbinfo, sa, 0, (struct sockaddr *)sa6_src,
			      0, cmd, notify);
}
1302 #endif /* INET6 */
1303
1304
1305 /*
1306 * Following is where TCP initial sequence number generation occurs.
1307 *
1308 * There are two places where we must use initial sequence numbers:
1309 * 1. In SYN-ACK packets.
1310 * 2. In SYN packets.
1311 *
1312 * The ISNs in SYN-ACK packets have no monotonicity requirement,
1313 * and should be as unpredictable as possible to avoid the possibility
1314 * of spoofing and/or connection hijacking. To satisfy this
1315 * requirement, SYN-ACK ISNs are generated via the arc4random()
1316 * function. If exact RFC 1948 compliance is requested via sysctl,
1317 * these ISNs will be generated just like those in SYN packets.
1318 *
1319 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1320 * depends on this property. In addition, these ISNs should be
1321 * unguessable so as to prevent connection hijacking. To satisfy
1322 * the requirements of this situation, the algorithm outlined in
1323 * RFC 1948 is used to generate sequence numbers.
1324 *
1325 * For more information on the theory of operation, please see
1326 * RFC 1948.
1327 *
1328 * Implementation details:
1329 *
1330 * Time is based off the system timer, and is corrected so that it
1331 * increases by one megabyte per second. This allows for proper
1332 * recycling on high speed LANs while still leaving over an hour
1333 * before rollover.
1334 *
1335 * Two sysctls control the generation of ISNs:
1336 *
1337 * net.inet.tcp.isn_reseed_interval controls the number of seconds
1338 * between seeding of isn_secret. This is normally set to zero,
1339 * as reseeding should not be necessary.
1340 *
1341 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
1342 * strictly. When strict compliance is requested, reseeding is
1343 * disabled and SYN-ACKs will be generated in the same manner as
1344 * SYNs. Strict mode is disabled by default.
1345 *
1346 */
1347
/* Rate at which the ISN clock advances (see RFC 1948 block comment). */
#define ISN_BYTES_PER_SECOND 1048576

/*
 * RFC 1948 ISN generation state shared by all calls to tcp_new_isn().
 * NOTE(review): no explicit lock guards these; presumably serialized by
 * the callers' locking — confirm before touching from new paths.
 */
u_char isn_secret[32];		/* secret mixed into every ISN hash */
int isn_last_reseed;		/* tv_sec when the secret was last (re)seeded */
MD5_CTX isn_ctx;		/* scratch MD5 context, reused per call */
1353
/*
 * Generate a new TCP initial sequence number for tp, per the RFC 1948
 * scheme described in the block comment above: MD5 over the connection
 * 4-tuple plus a periodically reseeded secret, offset by a time-based
 * component so SYN ISNs stay monotonic.
 */
tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;
	struct timeval timenow;

	/* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
	if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT))
	   && tcp_strict_rfc1948 == 0)
#ifdef __APPLE__
		return random();
#else
		return arc4random();
#endif
	getmicrotime(&timenow);

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) ||
	    ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
		< (u_int)timenow.tv_sec))) {
#ifdef __APPLE__
		read_random(&isn_secret, sizeof(isn_secret));
#else
		read_random_unlimited(&isn_secret, sizeof(isn_secret));
#endif
		isn_last_reseed = timenow.tv_sec;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	/* Hash the full 4-tuple: ports first, then both addresses. */
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#if INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
			  sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
			  sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
			  sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
			  sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	/*
	 * NOTE(review): the `/ hz` factor looks like a holdover from a
	 * ticks-based clock; with tv_sec (seconds) this advances the ISN
	 * at ISN_BYTES_PER_SECOND/hz per second, not 1MB/s as the block
	 * comment above claims — confirm intended rate.
	 */
	new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}
1409
1410 /*
1411 * When a source quench is received, close congestion window
1412 * to one segment. We will gradually open it again as we proceed.
1413 */
1414 void
1415 tcp_quench(
1416 struct inpcb *inp,
1417 __unused int errno
1418 )
1419 {
1420 struct tcpcb *tp = intotcpcb(inp);
1421
1422 if (tp)
1423 tp->snd_cwnd = tp->t_maxseg;
1424 }
1425
1426 /*
1427 * When a specific ICMP unreachable message is received and the
1428 * connection state is SYN-SENT, drop the connection. This behavior
1429 * is controlled by the icmp_may_rst sysctl.
1430 */
1431 void
1432 tcp_drop_syn_sent(inp, errno)
1433 struct inpcb *inp;
1434 int errno;
1435 {
1436 struct tcpcb *tp = intotcpcb(inp);
1437
1438 if (tp && tp->t_state == TCPS_SYN_SENT)
1439 tcp_drop(tp, errno);
1440 }
1441
1442 /*
1443 * When `need fragmentation' ICMP is received, update our idea of the MSS
1444 * based on the new value in the route. Also nudge TCP to send something,
1445 * since we know the packet we just sent was dropped.
1446 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1447 */
1448 void
1449 tcp_mtudisc(
1450 struct inpcb *inp,
1451 __unused int errno
1452 )
1453 {
1454 struct tcpcb *tp = intotcpcb(inp);
1455 struct rtentry *rt;
1456 struct rmxp_tao *taop;
1457 struct socket *so = inp->inp_socket;
1458 int offered;
1459 int mss;
1460 #if INET6
1461 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1462 #endif /* INET6 */
1463
1464 if (tp) {
1465 #if INET6
1466 if (isipv6)
1467 rt = tcp_rtlookup6(inp);
1468 else
1469 #endif /* INET6 */
1470 rt = tcp_rtlookup(inp);
1471 if (!rt || !rt->rt_rmx.rmx_mtu) {
1472 tp->t_maxopd = tp->t_maxseg =
1473 #if INET6
1474 isipv6 ? tcp_v6mssdflt :
1475 #endif /* INET6 */
1476 tcp_mssdflt;
1477 return;
1478 }
1479 taop = rmx_taop(rt->rt_rmx);
1480 offered = taop->tao_mssopt;
1481 mss = rt->rt_rmx.rmx_mtu -
1482 #if INET6
1483 (isipv6 ?
1484 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1485 #endif /* INET6 */
1486 sizeof(struct tcpiphdr)
1487 #if INET6
1488 )
1489 #endif /* INET6 */
1490 ;
1491
1492 if (offered)
1493 mss = min(mss, offered);
1494 /*
1495 * XXX - The above conditional probably violates the TCP
1496 * spec. The problem is that, since we don't know the
1497 * other end's MSS, we are supposed to use a conservative
1498 * default. But, if we do that, then MTU discovery will
1499 * never actually take place, because the conservative
1500 * default is much less than the MTUs typically seen
1501 * on the Internet today. For the moment, we'll sweep
1502 * this under the carpet.
1503 *
1504 * The conservative default might not actually be a problem
1505 * if the only case this occurs is when sending an initial
1506 * SYN with options and data to a host we've never talked
1507 * to before. Then, they will reply with an MSS value which
1508 * will get recorded and the new parameters should get
1509 * recomputed. For Further Study.
1510 */
1511 if (tp->t_maxopd <= mss)
1512 return;
1513 tp->t_maxopd = mss;
1514
1515 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1516 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1517 mss -= TCPOLEN_TSTAMP_APPA;
1518
1519 if (so->so_snd.sb_hiwat < mss)
1520 mss = so->so_snd.sb_hiwat;
1521
1522 tp->t_maxseg = mss;
1523
1524 tcpstat.tcps_mturesent++;
1525 tp->t_rtttime = 0;
1526 tp->snd_nxt = tp->snd_una;
1527 tcp_output(tp);
1528 }
1529 }
1530
1531 /*
1532 * Look-up the routing entry to the peer of this inpcb. If no route
1533 * is found and it cannot be allocated the return NULL. This routine
1534 * is called by TCP routines that access the rmx structure and by tcp_mss
1535 * to get the interface MTU.
1536 */
1537 struct rtentry *
1538 tcp_rtlookup(inp)
1539 struct inpcb *inp;
1540 {
1541 struct route *ro;
1542 struct rtentry *rt;
1543
1544 ro = &inp->inp_route;
1545 if (ro == NULL)
1546 return (NULL);
1547 rt = ro->ro_rt;
1548 if (rt == NULL || !(rt->rt_flags & RTF_UP) || rt->generation_id != route_generation) {
1549 /* No route yet, so try to acquire one */
1550 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1551 ro->ro_dst.sa_family = AF_INET;
1552 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1553 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1554 inp->inp_faddr;
1555 rtalloc(ro);
1556 rt = ro->ro_rt;
1557 }
1558 }
1559 return rt;
1560 }
1561
1562 #if INET6
/*
 * IPv6 analogue of tcp_rtlookup(): return the cached route to this
 * PCB's peer, allocating one if the cache is empty or down.
 *
 * NOTE(review): unlike tcp_rtlookup(), this does not compare
 * rt->generation_id against route_generation, so a stale-but-up cached
 * route is reused — confirm whether that asymmetry is intentional.
 */
struct rtentry *
tcp_rtlookup6(inp)
	struct inpcb *inp;
{
	struct route_in6 *ro6;
	struct rtentry *rt;

	ro6 = &inp->in6p_route;
	rt = ro6->ro_rt;
	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
		/* No route yet, so try to acquire one */
		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
			struct sockaddr_in6 *dst6;

			/* Build the destination sockaddr in the cache. */
			dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
			dst6->sin6_family = AF_INET6;
			dst6->sin6_len = sizeof(*dst6);
			dst6->sin6_addr = inp->in6p_faddr;
			rtalloc((struct route *)ro6);
			rt = ro6->ro_rt;
		}
	}
	return rt;
}
1587 #endif /* INET6 */
1588
1589 #if IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
/*
 * Builds a throwaway mbuf containing a template IP(v6)+TCP header for
 * tp's connection and asks the IPsec layer how many bytes of ESP/AH
 * (plus outer IP header) would be prepended on output.  Returns 0 when
 * tp or its PCB is missing, or when no mbuf can be allocated.
 */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#if INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
	struct tcphdr *th;

	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
		return 0;
	/* Scratch mbuf for the template headers; freed before return. */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (!m)
		return 0;

	/* The SADB must be held across the ipsec*_hdrsiz lookups. */
	lck_mtx_lock(sadb_mutex);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len =
			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		/* Fill in addresses/ports from the connection state. */
		tcp_fillheaders(tp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
	{
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
		tcp_fillheaders(tp, ip, th);
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	}
	lck_mtx_unlock(sadb_mutex);
	m_free(m);
	return hdrsiz;
}
1632 #endif /*IPSEC*/
1633
1634 /*
1635 * Return a pointer to the cached information about the remote host.
1636 * The cached information is stored in the protocol specific part of
1637 * the route metrics.
1638 */
1639 struct rmxp_tao *
1640 tcp_gettaocache(inp)
1641 struct inpcb *inp;
1642 {
1643 struct rtentry *rt;
1644
1645 #if INET6
1646 if ((inp->inp_vflag & INP_IPV6) != 0)
1647 rt = tcp_rtlookup6(inp);
1648 else
1649 #endif /* INET6 */
1650 rt = tcp_rtlookup(inp);
1651
1652 /* Make sure this is a host route and is up. */
1653 if (rt == NULL ||
1654 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1655 return NULL;
1656
1657 return rmx_taop(rt->rt_rmx);
1658 }
1659
1660 /*
1661 * Clear all the TAO cache entries, called from tcp_init.
1662 *
1663 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there is
1666 * nothing in the cache left over.
1667 */
/* Intentionally a no-op: see the comment above for why there is
 * nothing to clear at initialization time. */
static void
tcp_cleartaocache(void)
{
}
1672
1673 int
1674 tcp_lock(so, refcount, lr)
1675 struct socket *so;
1676 int refcount;
1677 int lr;
1678 {
1679 int lr_saved;
1680 #ifdef __ppc__
1681 if (lr == 0) {
1682 __asm__ volatile("mflr %0" : "=r" (lr_saved));
1683 }
1684 else lr_saved = lr;
1685 #endif
1686
1687 if (so->so_pcb) {
1688 lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx);
1689 }
1690 else {
1691 panic("tcp_lock: so=%x NO PCB! lr=%x\n", so, lr_saved);
1692 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
1693 }
1694
1695 if (so->so_usecount < 0)
1696 panic("tcp_lock: so=%x so_pcb=%x lr=%x ref=%x\n",
1697 so, so->so_pcb, lr_saved, so->so_usecount);
1698
1699 if (refcount)
1700 so->so_usecount++;
1701 so->reserved3 = (void *)lr_saved;
1702 return (0);
1703 }
1704
1705 int
1706 tcp_unlock(so, refcount, lr)
1707 struct socket *so;
1708 int refcount;
1709 int lr;
1710 {
1711 int lr_saved;
1712 #ifdef __ppc__
1713 if (lr == 0) {
1714 __asm__ volatile("mflr %0" : "=r" (lr_saved));
1715 }
1716 else lr_saved = lr;
1717 #endif
1718
1719 #ifdef MORE_TCPLOCK_DEBUG
1720 printf("tcp_unlock: so=%x sopcb=%x lock=%x ref=%x lr=%x\n",
1721 so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount, lr_saved);
1722 #endif
1723 if (refcount)
1724 so->so_usecount--;
1725
1726 if (so->so_usecount < 0)
1727 panic("tcp_unlock: so=%x usecount=%x\n", so, so->so_usecount);
1728 if (so->so_pcb == NULL) {
1729 panic("tcp_unlock: so=%x NO PCB usecount=%x lr=%x\n", so, so->so_usecount, lr_saved);
1730 lck_mtx_unlock(so->so_proto->pr_domain->dom_mtx);
1731 }
1732 else {
1733 lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1734 lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx);
1735 }
1736 so->reserved4 = (void *)lr_saved;
1737 return (0);
1738 }
1739
1740 lck_mtx_t *
1741 tcp_getlock(so, locktype)
1742 struct socket *so;
1743 int locktype;
1744 {
1745 struct inpcb *inp = sotoinpcb(so);
1746
1747 if (so->so_pcb) {
1748 if (so->so_usecount < 0)
1749 panic("tcp_getlock: so=%x usecount=%x\n", so, so->so_usecount);
1750 return(inp->inpcb_mtx);
1751 }
1752 else {
1753 panic("tcp_getlock: so=%x NULL so_pcb\n", so);
1754 return (so->so_proto->pr_domain->dom_mtx);
1755 }
1756 }