/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
 */

#if ISFB31
#include "opt_compat.h"
#include "opt_tcpdebug.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/syslog.h>

#if ISFB31
#include <vm/vm_zone.h>
#endif

#include <net/route.h>
#include <net/if.h>

#define _IP_VHL
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#if INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet6/ip6protosw.h>

#if IPSEC
#include <netinet6/ipsec.h>
#endif /*IPSEC*/

#include <sys/kdebug.h>

#define DBG_FNC_TCP_CLOSE	NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))

int	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
	CTLFLAG_RW, &tcp_mssdflt, 0, "");

int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	CTLFLAG_RW, &tcp_v6mssdflt, 0, "");

static int	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt,
	CTLFLAG_RW, &tcp_rttdflt, 0, "");

static int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323,
	CTLFLAG_RW, &tcp_do_rfc1323, 0, "");

static int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644,
	CTLFLAG_RW, &tcp_do_rfc1644, 0, "");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count,
	0, "Number of active PCBs");

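/*
 * The knobs above surface under the net.inet.tcp sysctl node; the leaf
 * names come straight from the SYSCTL_INT entries.  Illustrative userland
 * usage (a sketch; exact tool behavior varies by release):
 *
 *	sysctl -w net.inet.tcp.mssdflt=512
 *	sysctl net.inet.tcp.pcbcount
 */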

static void	tcp_cleartaocache __P((void));
static void	tcp_notify __P((struct inpcb *, int));
extern u_long	current_active_connections;

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	4096
#endif

/*
 * This is the actual shape of what we allocate using the zone
 * allocator. Doing it this way allows us to protect both structures
 * using the same generation count, and also eliminates the overhead
 * of allocating tcpcbs separately.  By hiding the structure here,
 * we avoid changing most of the rest of the code (although it needs
 * to be changed, eventually, for greater efficiency).
 */
#define	ALIGNMENT	32
#define	ALIGNM1		(ALIGNMENT - 1)
struct	inp_tp {
	union {
		struct	inpcb inp;
		char	align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
	} inp_tp_u;
	struct	tcpcb tcb;
};
#undef ALIGNMENT
#undef ALIGNM1
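/*
 * The union above pads the inpcb out to the next multiple of ALIGNMENT:
 * (sizeof(struct inpcb) + 31) & ~31 rounds the size up to a 32-byte
 * boundary, so the tcpcb that follows starts 32-byte aligned (presumably
 * to keep it on its own cache lines).
 */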

static struct tcpcb dummy_tcb;

extern struct inpcbhead time_wait_slots[];
extern int cur_tw_slot;
extern u_long *delack_bitmask;

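/*
 * Structure-size reporters, apparently for the socket layer's pcb
 * caching (see the cached_in_sock_layer checks in tcp_newtcpcb() and
 * tcp_close() below).
 */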
int get_inpcb_str_size()
{
	return sizeof(struct inpcb);
}

int get_tcp_str_size()
{
	return sizeof(struct tcpcb);
}

int	tcp_freeq __P((struct tcpcb *tp));

/*
 * Tcp initialization
 */
void
tcp_init()
{
	int hashsize;
	vm_size_t str_size;
	int i;

	tcp_iss = random();	/* wrong, but better than a constant */
	tcp_ccgen = 1;
	tcp_cleartaocache();
	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	if (!(getenv_int("net.inet.tcp.tcbhashsize", &hashsize)))
		hashsize = TCBHASHSIZE;
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcbinfo.hashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
	    &tcbinfo.porthashmask);
#if ISFB31
	tcbinfo.ipi_zone = (void *) zinit("tcpcb", sizeof(struct inp_tp),
	    maxsockets, ZONE_INTERRUPT, 0);
#else
	str_size = (vm_size_t) sizeof(struct inp_tp);
	tcbinfo.ipi_zone = (void *) zinit(str_size, 120000 * str_size, 8192,
	    "inpcb_zone");
#endif
#if INET6
#define TCP_LGHDR (sizeof(struct tcpip6hdr))
#else /* INET6 */
#define TCP_LGHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	if (max_protohdr < TCP_LGHDR)
		max_protohdr = TCP_LGHDR;
	if ((max_linkhdr + TCP_LGHDR) > MHLEN)
		panic("tcp_init");

	tcbinfo.last_pcb = 0;
	dummy_tcb.t_state = TCP_NSTATES;
	dummy_tcb.t_flags = 0;
	tcbinfo.dummy_cb = (caddr_t) &dummy_tcb;
	in_pcb_nat_init(&tcbinfo, AF_INET, IPPROTO_TCP, SOCK_STREAM);

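	/*
	 * The delayed-ACK bitmask carries one bit per hash bucket:
	 * hashsize/32 u_longs, i.e. (4 * hashsize)/32 bytes, assuming
	 * the 32-bit u_long of the targets this code was written for.
	 */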
	delack_bitmask = _MALLOC((4 * hashsize) / 32, M_PCB, M_NOWAIT);
	if (delack_bitmask == 0)
		panic("Delack Memory");

	for (i = 0; i < (tcbinfo.hashsize / 32); i++)
		delack_bitmask[i] = 0;

	for (i = 0; i < N_TIME_WAIT_SLOTS; i++) {
		LIST_INIT(&time_wait_slots[i]);
	}
#undef TCP_LGHDR
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Call after host entry created, allocates an mbuf and fills
 * in a skeletal tcp/ip header, minimizing the amount of work
 * necessary when the connection is used.
 */
struct tcptemp *
tcp_template(tp)
	struct tcpcb *tp;
{
	register struct inpcb *inp = tp->t_inpcb;
	register struct mbuf *m;
	register struct tcptemp *n;

	if ((n = tp->t_template) == 0) {
		m = m_get(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return (0);
		m->m_len = sizeof (struct tcptemp);
		n = mtod(m, struct tcptemp *);
	}
	bzero(n->tt_x1, sizeof(n->tt_x1));
	n->tt_pr = IPPROTO_TCP;
	n->tt_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip));
	n->tt_src = inp->inp_laddr;
	n->tt_dst = inp->inp_faddr;
	n->tt_sport = inp->inp_lport;
	n->tt_dport = inp->inp_fport;
	n->tt_seq = 0;
	n->tt_ack = 0;
	n->tt_x2 = 0;
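	/* header length in 32-bit words: 5 * 4 == 20 bytes, no TCP options */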
	n->tt_off = 5;
	n->tt_flags = 0;
	n->tt_win = 0;
	n->tt_sum = 0;
	n->tt_urp = 0;
#if INET6
	n->tt_flow = inp->inp_flow & IPV6_FLOWINFO_MASK;
	if (ip6_auto_flowlabel) {
		n->tt_flow &= ~IPV6_FLOWLABEL_MASK;
		n->tt_flow |= (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK);
	}
	n->tt_vfc |= IPV6_VERSION;
	n->tt_pr6 = IPPROTO_TCP;
	n->tt_len6 = n->tt_len;
	n->tt_src6 = inp->in6p_laddr;
	n->tt_dst6 = inp->in6p_faddr;
#endif /* INET6 */
	return (n);
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection tp->t_template.  If flags are given
 * then we send a message back to the TCP which originated the
 * segment ti, and discard the mbuf containing it and any other
 * attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, iph, th, m, ack, seq, flags, isipv6)
	struct tcpcb *tp;
	void *iph;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
#if INET6
	int isipv6;
#endif
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip = iph;
	struct tcpiphdr *ti = iph;
	struct tcphdr *nth;
#if INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6 = iph;
	struct tcpip6hdr *ti6 = iph;
#endif /* INET6 */

	if (tp) {
		if (!(flags & TH_RST))
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
#if INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
#if INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else {
#endif /* INET6 */
			ro = &sro;
			bzero(ro, sizeof *ro);
#if INET6
		}
#endif /* INET6 */
	}
	if (m == 0) {
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
#if TCP_COMPAT_42
		tlen = 1;
#else
		tlen = 0;
#endif
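		/*
		 * Under TCP_COMPAT_42 the probe carries one garbage byte
		 * (tlen = 1): pre-4.3BSD stacks would not ACK a keepalive
		 * segment that contained no data.
		 */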
		m->m_data += max_linkhdr;
#if INET6
		if (isipv6) {
			ti6 = mtod(m, struct tcpip6hdr *);
			bcopy((caddr_t)ip6, (caddr_t)&ti6->ti6_i,
			    sizeof(struct ip6_hdr));
			ip6 = &ti6->ti6_i;
			nth = &ti6->ti6_t;
		} else {
#endif /* INET6 */
			ti = mtod(m, struct tcpiphdr *);
			bcopy((caddr_t)ip, (caddr_t)&ti->ti_i,
			    sizeof(struct ip));
			ip = (struct ip *)&ti->ti_i;
			nth = &ti->ti_t;
#if INET6
		}
#endif /* INET6 */
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ti;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#if INET6
		if (isipv6) {
			struct in6_addr t;

			t = ip6->ip6_dst;
			ip6->ip6_dst = ip6->ip6_src;
			ip6->ip6_src = t;
			nth = (struct tcphdr *)(ip6 + 1);
			if (th != nth) {
				/*
				 * this is the case if an extension header
				 * exists between the IPv6 header and the
				 * TCP header.
				 */
				nth->th_sport = th->th_sport;
				nth->th_dport = th->th_dport;
			}
		} else {
#endif /* INET6 */
			xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, n_long);
			nth = th;
#if INET6
		}
#endif /* INET6 */
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
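	/*
	 * With a tcpcb the advertised window must be pre-shifted by
	 * rcv_scale, since RFC 1323 window scaling may be in effect;
	 * without one there is no negotiated scale to apply.
	 */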
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
	nth->th_sum = 0;
	tlen += sizeof (struct tcphdr);
#if INET6
	if (isipv6) {
		m->m_len = tlen + sizeof(struct ip6_hdr);
		m->m_pkthdr.len = tlen + sizeof(struct ip6_hdr);
		m->m_pkthdr.rcvif = (struct ifnet *) 0;
		ip6->ip6_plen = htons((u_short)tlen);
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
		    ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr), tlen);
		ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
		if (ip6_auto_flowlabel) {
			ip6->ip6_flow |=
			    (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK);
		}
	} else {
#endif /* INET6 */
		ti->ti_len = htons((u_short)(tlen));
		m->m_len = tlen + sizeof(struct ip);
		m->m_pkthdr.len = tlen + sizeof(struct ip);
		m->m_pkthdr.rcvif = (struct ifnet *) 0;
		bzero(ti->ti_x1, sizeof(ti->ti_x1));
		nth->th_sum = in_cksum(m, tlen + sizeof(struct ip));
		ip->ip_len = tlen + sizeof (struct ip);
		ip->ip_ttl = ip_defttl;
#if INET6
	}
#endif /* INET6 */
#if TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp,
#if INET6
		    isipv6 ? (void *)ip6 :
#endif /* INET6 */
		    ip,
		    nth, 0);
#endif
#if IPSEC
	ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL);
#endif /*IPSEC*/
#if INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, 0, NULL, NULL);
		if (ro6 == &sro6 && ro6->ro_rt)
			RTFREE(ro6->ro_rt);
	} else {
#endif /* INET6 */
		(void)ip_output(m, NULL, ro, 0, NULL);
		if (ro == &sro && ro->ro_rt) {
			RTFREE(ro->ro_rt);
		}
#if INET6
	}
#endif /* INET6 */
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct inp_tp *it;
	register struct tcpcb *tp;
	register struct socket *so = inp->inp_socket;
#if INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	if (so->cached_in_sock_layer == 0) {
		it = (struct inp_tp *)inp;
		tp = &it->tcb;
	} else
		tp = (struct tcpcb *) inp->inp_saved_ppcb;

	bzero((char *) tp, sizeof(struct tcpcb));
	tp->segq.lh_first = NULL;
	tp->t_maxseg = tp->t_maxopd =
#if INET6
	    isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
	    tcp_mssdflt;

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
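	/*
	 * With TCPTV_SRTTBASE == 0, the unscaled rttvar works out to
	 * TCPTV_RTOBASE / 4, so srtt + 4 * rttvar == TCPTV_RTOBASE and
	 * the first retransmit timeout is the base RTO.
	 */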
	tp->t_rttmin = TCPTV_MIN;
	tp->t_rxtcur = TCPTV_RTOBASE;
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 * XXX: is there a better approach?
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
struct tcpcb *
tcp_drop(tp, errno)
	register struct tcpcb *tp;
	int errno;
{
	struct socket *so = tp->t_inpcb->inp_socket;

	switch (tp->t_state) {
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_CLOSE_WAIT:
	case TCPS_LAST_ACK:
		current_active_connections--;
		break;
	}

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		tp->t_state = TCPS_CLOSED;
		(void) tcp_output(tp);
		tcpstat.tcps_drops++;
	} else
		tcpstat.tcps_conndrops++;
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	return (tcp_close(tp));
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 */
struct tcpcb *
tcp_close(tp)
	register struct tcpcb *tp;
{
	register struct mbuf *q;
	register struct mbuf *nq;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#if INET6
	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6);
#endif /* INET6 */
	register struct rtentry *rt;
	int dosavessthresh;

	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0);
	switch (tp->t_state) {
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_CLOSING:
	case TCPS_CLOSE_WAIT:
	case TCPS_LAST_ACK:
		current_active_connections--;
		break;
	}

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#if INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		} else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		    i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#if INET6
			    isipv6 ? sizeof (struct tcpip6hdr) :
#endif /* INET6 */
			    sizeof (struct tcpiphdr));
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
no_valid_rt:
	/* free the reassembly queue, if any */
	(void) tcp_freeq(tp);

	if (tp->t_template)
		(void) m_free(dtom(tp->t_template));

	if (so->cached_in_sock_layer)
		inp->inp_saved_ppcb = (caddr_t) tp;

	inp->inp_ppcb = NULL;
	soisdisconnected(so);
#if INET6
	if (isipv6)
		in6_pcbdetach(inp);
	else
#endif /* INET6 */
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END,
	    tcpstat.tcps_closed, 0, 0, 0, 0);
	return ((struct tcpcb *)0);
}

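/*
 * Flush the connection's segment reassembly queue, freeing the mbuf
 * chain of each queued segment; returns nonzero if anything was freed.
 */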
int
tcp_freeq(tp)
	struct tcpcb *tp;
{
	register struct ipqent *qe;
	int rv = 0;

	while ((qe = tp->segq.lh_first) != NULL) {
		LIST_REMOVE(qe, ipqe_q);
		m_freem(qe->ipqe_m);
		FREE(qe, M_SONAME);
		rv = 1;
	}
	return (rv);
}

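/*
 * Protocol drain hook, called when the system is short of mbufs;
 * a no-op here, since nothing is cached that could be released.
 */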
void
tcp_drain()
{

}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 */
static void
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
	register struct socket *so = inp->inp_socket;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	    error == EHOSTDOWN)) {
		return;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror)
		so->so_error = error;
	else
		tp->t_softerror = error;
	wakeup((caddr_t) &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

static int
tcp_pcblist SYSCTL_HANDLER_ARGS
{
	int error, i, n, s;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == 0) {
		n = tcbinfo.ipi_count;
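		/*
		 * Pad the size estimate with a second xinpgen plus n/8
		 * extra slots, so the caller's buffer still suffices if
		 * connections are created before the request is re-issued.
		 */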
		req->oldidx = 2 * (sizeof xig)
		    + (n + n/8) * sizeof(struct xtcpcb);
		return 0;
	}

	if (req->newptr != 0)
		return EPERM;

	/*
	 * OK, now we're committed to doing something.
	 */
	s = splnet();
	gencnt = tcbinfo.ipi_gencnt;
	n = tcbinfo.ipi_count;
	splx(s);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return error;

	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0)
		return ENOMEM;

	s = splnet();
	for (inp = tcbinfo.listhead->lh_first, i = 0; inp && i < n;
	    inp = inp->inp_list.le_next) {
		if (inp->inp_gencnt <= gencnt)
			inp_list[i++] = inp;
	}
	splx(s);
	n = i;

	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt) {
			struct xtcpcb xt;
			xt.xt_len = sizeof xt;
			/* XXX should avoid extra copy */
			bcopy(inp, &xt.xt_inp, sizeof *inp);
			bcopy(inp->inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xt.xt_socket);
			error = SYSCTL_OUT(req, &xt, sizeof xt);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		s = splnet();
		xig.xig_gen = tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = tcbinfo.ipi_count;
		splx(s);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	FREE(inp_list, M_TEMP);
	return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
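
/*
 * Userland tools such as netstat are the usual readers of the pcblist
 * node: they see an xinpgen header, a run of xtcpcb records, and a
 * trailing xinpgen whose generation count reveals whether the list
 * changed mid-read.  (Illustrative note; the exact consumer set is
 * not spelled out here.)
 */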

void
tcp_ctlinput(cmd, sa, vip)
	int cmd;
	struct sockaddr *sa;
	void *vip;
{
	register struct ip *ip = vip;
	register struct tcphdr *th;
	void (*notify) __P((struct inpcb *, int)) = tcp_notify;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
	    ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0))
		return;
	if (ip) {
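		/*
		 * IP_VHL_HL() extracts the header length in 32-bit words;
		 * shifting it left by 2 gives the byte offset of the TCP
		 * header that follows the IP header and its options.
		 */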
		th = (struct tcphdr *)((caddr_t)ip
		    + (IP_VHL_HL(ip->ip_vhl) << 2));
		in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport,
		    cmd, notify);
	} else
		in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify);
}

#if INET6
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	register struct tcphdr *thp;
	struct tcphdr th;
	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
	struct sockaddr_in6 sa6;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	int off = 0;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
	    ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
	} else {
		m = NULL;
		ip6 = NULL;
	}

	/* translate addresses into internal form */
	sa6 = *(struct sockaddr_in6 *)sa;
	if (IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr) && m && m->m_pkthdr.rcvif)
		sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);

	if (ip6) {
		/*
		 * XXX: We assume that when ip6 is non-NULL,
		 * m and off are valid.
		 */
		struct in6_addr s;

		/* translate addresses into internal form */
		memcpy(&s, &ip6->ip6_src, sizeof(s));
		if (IN6_IS_ADDR_LINKLOCAL(&s))
			s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);

		if (m->m_len < off + sizeof(*thp)) {
			/*
			 * this should be a rare case,
			 * so we compromise on this copy...
			 */
			m_copydata(m, off, sizeof(th), (caddr_t)&th);
			thp = &th;
		} else
			thp = (struct tcphdr *)(mtod(m, caddr_t) + off);
		in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, thp->th_dport,
		    &s, thp->th_sport, cmd, notify);
	} else
		in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, 0, &zeroin6_addr,
		    0, cmd, notify);
}
#endif /* INET6 */

/*
 * When a source quench is received, close congestion window
 * to one segment.  We will gradually open it again as we proceed.
 */
void
tcp_quench(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp)
		tp->snd_cwnd = tp->t_maxseg;
}

/*
 * When `need fragmentation' ICMP is received, update our idea of the MSS
 * based on the new value in the route.  Also nudge TCP to send something,
 * since we know the packet we just sent was dropped.
 * This duplicates some code in the tcp_mss() function in tcp_input.c.
 */
void
tcp_mtudisc(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	struct rtentry *rt;
	struct rmxp_tao *taop;
	struct socket *so = inp->inp_socket;
	int offered;
	int mss;
#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */

	if (tp) {
#if INET6
		if (isipv6)
			rt = tcp_rtlookup6(inp);
		else
#endif /* INET6 */
		rt = tcp_rtlookup(inp);
		if (!rt || !rt->rt_rmx.rmx_mtu) {
			tp->t_maxopd = tp->t_maxseg =
#if INET6
			    isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
			    tcp_mssdflt;
			return;
		}
		taop = rmx_taop(rt->rt_rmx);
		offered = taop->tao_mssopt;
		mss = rt->rt_rmx.rmx_mtu -
#if INET6
		    (isipv6 ?
		    sizeof(struct tcpip6hdr) :
#endif /* INET6 */
		    sizeof(struct tcpiphdr)
#if INET6
		    )
#endif /* INET6 */
		    ;

		if (offered)
			mss = min(mss, offered);
		/*
		 * XXX - The above conditional probably violates the TCP
		 * spec.  The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
		if (tp->t_maxopd <= mss)
			return;
		tp->t_maxopd = mss;

		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
			mss -= TCPOLEN_TSTAMP_APPA;
		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
			mss -= TCPOLEN_CC_APPA;
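		/*
		 * Round the MSS down to a multiple of the mbuf cluster
		 * size so that full segments fill whole clusters; the
		 * mask form is valid only when MCLBYTES is a power of two.
		 */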
#if	(MCLBYTES & (MCLBYTES - 1)) == 0
		if (mss > MCLBYTES)
			mss &= ~(MCLBYTES-1);
#else
		if (mss > MCLBYTES)
			mss = mss / MCLBYTES * MCLBYTES;
#endif
		if (so->so_snd.sb_hiwat < mss)
			mss = so->so_snd.sb_hiwat;

		tp->t_maxseg = mss;

		tcpstat.tcps_mturesent++;
		tp->t_rtt = 0;
		tp->snd_nxt = tp->snd_una;
		tcp_output(tp);
	}
}

/*
 * Look-up the routing entry to the peer of this inpcb.  If no route
 * is found and it cannot be allocated, then return NULL.  This routine
 * is called by TCP routines that access the rmx structure and by tcp_mss
 * to get the interface MTU.
 */
struct rtentry *
tcp_rtlookup(inp)
	struct inpcb *inp;
{
	struct route *ro;
	struct rtentry *rt;

	ro = &inp->inp_route;
	rt = ro->ro_rt;
	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
		/* No route yet, so try to acquire one */
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(ro->ro_dst);
			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
			    inp->inp_faddr;
			rtalloc(ro);
			rt = ro->ro_rt;
		}
	}
	return rt;
}

#if INET6
struct rtentry *
tcp_rtlookup6(inp)
	struct inpcb *inp;
{
	struct route_in6 *ro6;
	struct rtentry *rt;

	ro6 = &inp->in6p_route;
	rt = ro6->ro_rt;
	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
		/* No route yet, so try to acquire one */
		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
			ro6->ro_dst.sin6_family = AF_INET6;
			ro6->ro_dst.sin6_len = sizeof(ro6->ro_dst);
			ro6->ro_dst.sin6_addr = inp->in6p_faddr;
			rtalloc((struct route *)ro6);
			rt = ro6->ro_rt;
		}
	}
	return rt;
}
#endif /* INET6 */

#if IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(tp, isipv6)
	struct tcpcb *tp;
#if INET6
	int isipv6;
#endif /* INET6 */
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#if INET6
	struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
	struct tcphdr *th;

	if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
		return 0;
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (!m)
		return 0;

#if INET6
	if (isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len = sizeof(struct tcpip6hdr);
		bcopy((caddr_t)&tp->t_template->tt_i6, (caddr_t)ip6,
		    sizeof(struct ip6_hdr));
		bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
		    sizeof(struct tcphdr));
	} else {
#endif /* INET6 */
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
		bcopy((caddr_t)&tp->t_template->tt_i, (caddr_t)ip,
		    sizeof(struct ip));
		bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
		    sizeof(struct tcphdr));
#if INET6
	}
#endif /* INET6 */

#if INET6
	if (isipv6)
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	else
#endif /* INET6 */
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);

	m_free(m);
	return hdrsiz;
}
#endif /*IPSEC*/

/*
 * Return a pointer to the cached information about the remote host.
 * The cached information is stored in the protocol specific part of
 * the route metrics.
 */
struct rmxp_tao *
tcp_gettaocache(inp)
	struct inpcb *inp;
{
#if INET6
	int isipv6 = (inp->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */
	struct rtentry *rt;

#if INET6
	if (isipv6)
		rt = tcp_rtlookup6(inp);
	else
#endif /* INET6 */
	rt = tcp_rtlookup(inp);

	/* Make sure this is a host route and is up. */
	if (rt == NULL ||
	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
		return NULL;

	return rmx_taop(rt->rt_rmx);
}

/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there
 * is nothing left over in the cache.
 */
static void
tcp_cleartaocache()
{
}