2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
20 * @APPLE_LICENSE_HEADER_END@
23 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
24 * The Regents of the University of California. All rights reserved.
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 * must display the following acknowledgement:
36 * This product includes software developed by the University of
37 * California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 * may be used to endorse or promote products derived from this software
40 * without specific prior written permission.
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
54 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
58 #include "opt_compat.h"
59 #include "opt_tcpdebug.h"
62 #include <sys/param.h>
63 #include <sys/systm.h>
64 #include <sys/kernel.h>
65 #include <sys/sysctl.h>
66 #include <sys/malloc.h>
68 #include <sys/domain.h>
69 #include <sys/socket.h>
70 #include <sys/socketvar.h>
71 #include <sys/protosw.h>
72 #include <sys/syslog.h>
76 #include <vm/vm_zone.h>
79 #include <net/route.h>
83 #include <netinet/in.h>
84 #include <netinet/in_systm.h>
85 #include <netinet/ip.h>
86 #include <netinet/in_pcb.h>
87 #include <netinet/in_var.h>
88 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet6/ip6_var.h>
92 #include <netinet6/in6_pcb.h>
94 #include <netinet/tcp.h>
95 #include <netinet/tcp_fsm.h>
96 #include <netinet/tcp_seq.h>
97 #include <netinet/tcp_timer.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcpip.h>
101 #include <netinet/tcp_debug.h>
103 #include <netinet6/ip6protosw.h>
106 #include <netinet6/ipsec.h>
109 #include <sys/kdebug.h>
111 #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
114 int tcp_mssdflt
= TCP_MSS
;
115 SYSCTL_INT(_net_inet_tcp
, TCPCTL_MSSDFLT
, mssdflt
,
116 CTLFLAG_RW
, &tcp_mssdflt
, 0, "");
118 int tcp_v6mssdflt
= TCP6_MSS
;
119 SYSCTL_INT(_net_inet_tcp
, TCPCTL_V6MSSDFLT
, v6mssdflt
,
120 CTLFLAG_RW
, &tcp_v6mssdflt
, 0, "");
122 static int tcp_rttdflt
= TCPTV_SRTTDFLT
/ PR_SLOWHZ
;
123 SYSCTL_INT(_net_inet_tcp
, TCPCTL_RTTDFLT
, rttdflt
,
124 CTLFLAG_RW
, &tcp_rttdflt
, 0, "");
126 static int tcp_do_rfc1323
= 1;
127 SYSCTL_INT(_net_inet_tcp
, TCPCTL_DO_RFC1323
, rfc1323
,
128 CTLFLAG_RW
, &tcp_do_rfc1323
, 0, "");
130 static int tcp_do_rfc1644
= 0;
131 SYSCTL_INT(_net_inet_tcp
, TCPCTL_DO_RFC1644
, rfc1644
,
132 CTLFLAG_RW
, &tcp_do_rfc1644
, 0, "");
134 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, pcbcount
, CTLFLAG_RD
, &tcbinfo
.ipi_count
,
135 0, "Number of active PCBs");
137 static void tcp_cleartaocache
__P((void));
138 static void tcp_notify
__P((struct inpcb
*, int));
139 extern u_long current_active_connections
;
145 * Target size of TCP PCB hash tables. Must be a power of two.
147 * Note that this can be overridden by the kernel environment
148 * variable net.inet.tcp.tcbhashsize
151 #define TCBHASHSIZE 4096
155 * This is the actual shape of what we allocate using the zone
156 * allocator. Doing it this way allows us to protect both structures
157 * using the same generation count, and also eliminates the overhead
158 * of allocating tcpcbs separately. By hiding the structure here,
159 * we avoid changing most of the rest of the code (although it needs
160 * to be changed, eventually, for greater efficiency).
163 #define ALIGNM1 (ALIGNMENT - 1)
167 char align
[(sizeof(struct inpcb
) + ALIGNM1
) & ~ALIGNM1
];
174 static struct tcpcb dummy_tcb
;
177 extern struct inpcbhead time_wait_slots
[];
178 extern int cur_tw_slot
;
179 extern u_long
*delack_bitmask
;
182 int get_inpcb_str_size()
184 return sizeof(struct inpcb
);
188 int get_tcp_str_size()
190 return sizeof(struct tcpcb
);
193 int tcp_freeq
__P((struct tcpcb
*tp
));
206 tcp_iss
= random(); /* wrong, but better than a constant */
210 tcbinfo
.listhead
= &tcb
;
211 if (!(getenv_int("net.inet.tcp.tcbhashsize", &hashsize
)))
212 hashsize
= TCBHASHSIZE
;
213 if (!powerof2(hashsize
)) {
214 printf("WARNING: TCB hash size not a power of 2\n");
215 hashsize
= 512; /* safe default */
217 tcbinfo
.hashsize
= hashsize
;
218 tcbinfo
.hashbase
= hashinit(hashsize
, M_PCB
, &tcbinfo
.hashmask
);
219 tcbinfo
.porthashbase
= hashinit(hashsize
, M_PCB
,
220 &tcbinfo
.porthashmask
);
222 tcbinfo
.ipi_zone
= (void *) zinit("tcpcb", sizeof(struct inp_tp
), maxsockets
,
225 str_size
= (vm_size_t
) sizeof(struct inp_tp
);
226 tcbinfo
.ipi_zone
= (void *) zinit(str_size
, 120000*str_size
, 8192, "inpcb_zone");
229 #define TCP_LGHDR (sizeof(struct tcpip6hdr))
231 #define TCP_LGHDR (sizeof(struct tcpiphdr))
233 if (max_protohdr
< TCP_LGHDR
)
234 max_protohdr
= TCP_LGHDR
;
235 if ((max_linkhdr
+ TCP_LGHDR
) > MHLEN
)
238 tcbinfo
.last_pcb
= 0;
239 dummy_tcb
.t_state
= TCP_NSTATES
;
240 dummy_tcb
.t_flags
= 0;
241 tcbinfo
.dummy_cb
= (caddr_t
) &dummy_tcb
;
242 in_pcb_nat_init(&tcbinfo
, AF_INET
, IPPROTO_TCP
, SOCK_STREAM
);
244 delack_bitmask
= _MALLOC((4 * hashsize
)/32, M_PCB
, M_NOWAIT
);
245 if (delack_bitmask
== 0)
246 panic("Delack Memory");
248 for (i
=0; i
< (tcbinfo
.hashsize
/ 32); i
++)
249 delack_bitmask
[i
] = 0;
251 for (i
=0; i
< N_TIME_WAIT_SLOTS
; i
++) {
252 LIST_INIT(&time_wait_slots
[i
]);
258 * Create template to be used to send tcp packets on a connection.
259 * Call after host entry created, allocates an mbuf and fills
260 * in a skeletal tcp/ip header, minimizing the amount of work
261 * necessary when the connection is used.
267 register struct inpcb
*inp
= tp
->t_inpcb
;
268 register struct mbuf
*m
;
269 register struct tcptemp
*n
;
271 if ((n
= tp
->t_template
) == 0) {
272 m
= m_get(M_DONTWAIT
, MT_HEADER
);
275 m
->m_len
= sizeof (struct tcptemp
);
276 n
= mtod(m
, struct tcptemp
*);
278 bzero(n
->tt_x1
, sizeof(n
->tt_x1
));
279 n
->tt_pr
= IPPROTO_TCP
;
280 n
->tt_len
= htons(sizeof (struct tcpiphdr
) - sizeof (struct ip
));
281 n
->tt_src
= inp
->inp_laddr
;
282 n
->tt_dst
= inp
->inp_faddr
;
283 n
->tt_sport
= inp
->inp_lport
;
284 n
->tt_dport
= inp
->inp_fport
;
294 n
->tt_flow
= inp
->inp_flow
& IPV6_FLOWINFO_MASK
;
295 if (ip6_auto_flowlabel
) {
296 n
->tt_flow
&= ~IPV6_FLOWLABEL_MASK
;
297 n
->tt_flow
|= (htonl(ip6_flow_seq
++) & IPV6_FLOWLABEL_MASK
);
299 n
->tt_vfc
|= IPV6_VERSION
;
300 n
->tt_pr6
= IPPROTO_TCP
;
301 n
->tt_len6
= n
->tt_len
;
302 n
->tt_src6
= inp
->in6p_laddr
;
303 n
->tt_dst6
= inp
->in6p_faddr
;
309 * Send a single message to the TCP at address specified by
310 * the given TCP/IP header. If m == 0, then we make a copy
311 * of the tcpiphdr at ti and send directly to the addressed host.
312 * This is used to force keep alive messages out using the TCP
313 * template for a connection tp->t_template. If flags are given
314 * then we send a message back to the TCP which originated the
315 * segment ti, and discard the mbuf containing it and any other
318 * In any case the ack and sequence number of the transmitted
319 * segment are as specified by the parameters.
321 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
324 tcp_respond(tp
, iph
, th
, m
, ack
, seq
, flags
, isipv6
)
327 register struct tcphdr
*th
;
328 register struct mbuf
*m
;
337 struct route
*ro
= 0;
340 struct tcpiphdr
*ti
= iph
;
343 struct route_in6
*ro6
= 0;
344 struct route_in6 sro6
;
345 struct ip6_hdr
*ip6
= iph
;
346 struct tcpip6hdr
*ti6
= iph
;
350 if (!(flags
& TH_RST
))
351 win
= sbspace(&tp
->t_inpcb
->inp_socket
->so_rcv
);
354 ro6
= &tp
->t_inpcb
->in6p_route
;
357 ro
= &tp
->t_inpcb
->inp_route
;
362 bzero(ro6
, sizeof *ro6
);
366 bzero(ro
, sizeof *ro
);
372 m
= m_gethdr(M_DONTWAIT
, MT_HEADER
);
380 m
->m_data
+= max_linkhdr
;
383 ti6
= mtod(m
, struct tcpip6hdr
*);
384 bcopy((caddr_t
)ip6
, (caddr_t
)&ti6
->ti6_i
,
385 sizeof(struct ip6_hdr
));
390 ti
= mtod(m
, struct tcpiphdr
*);
391 bcopy((caddr_t
)ip
, (caddr_t
)&ti
->ti_i
, sizeof(struct ip
));
392 ip
= (struct ip
*)&ti
->ti_i
;
397 bcopy((caddr_t
)th
, (caddr_t
)nth
, sizeof(struct tcphdr
));
402 m
->m_data
= (caddr_t
)ti
;
403 /* m_len is set later */
405 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
411 ip6
->ip6_dst
= ip6
->ip6_src
;
413 nth
= (struct tcphdr
*)(ip6
+ 1);
416 * this is the case if an extension header
417 * exists between the IPv6 header and the
420 nth
->th_sport
= th
->th_sport
;
421 nth
->th_dport
= th
->th_dport
;
425 xchg(ti
->ti_dst
.s_addr
, ti
->ti_src
.s_addr
, n_long
);
430 xchg(nth
->th_dport
, nth
->th_sport
, n_short
);
433 nth
->th_seq
= htonl(seq
);
434 nth
->th_ack
= htonl(ack
);
436 nth
->th_off
= sizeof (struct tcphdr
) >> 2;
437 nth
->th_flags
= flags
;
439 nth
->th_win
= htons((u_short
) (win
>> tp
->rcv_scale
));
441 nth
->th_win
= htons((u_short
)win
);
444 tlen
+= sizeof (struct tcphdr
);
447 m
->m_len
= tlen
+ sizeof(struct ip6_hdr
);
448 m
->m_pkthdr
.len
= tlen
+ sizeof(struct ip6_hdr
);
449 m
->m_pkthdr
.rcvif
= (struct ifnet
*) 0;
450 ip6
->ip6_plen
= htons((u_short
)tlen
);
451 ip6
->ip6_nxt
= IPPROTO_TCP
;
452 ip6
->ip6_hlim
= in6_selecthlim(tp
? tp
->t_inpcb
: NULL
,
456 nth
->th_sum
= in6_cksum(m
, IPPROTO_TCP
,
457 sizeof(struct ip6_hdr
), tlen
);
458 ip6
->ip6_flow
&= ~IPV6_FLOWLABEL_MASK
;
459 if (ip6_auto_flowlabel
) {
461 (htonl(ip6_flow_seq
++) & IPV6_FLOWLABEL_MASK
);
465 ti
->ti_len
= htons((u_short
)(tlen
));
466 m
->m_len
= tlen
+ sizeof(struct ip
);
467 m
->m_pkthdr
.len
= tlen
+ sizeof(struct ip
);
468 m
->m_pkthdr
.rcvif
= (struct ifnet
*) 0;
469 bzero(ti
->ti_x1
, sizeof(ti
->ti_x1
));
470 nth
->th_sum
= in_cksum(m
, tlen
+ sizeof(struct ip
));
471 ip
->ip_len
= tlen
+ sizeof (struct ip
);
472 ip
->ip_ttl
= ip_defttl
;
477 if (tp
== NULL
|| (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
))
478 tcp_trace(TA_OUTPUT
, 0, tp
,
480 isipv6
? (void *)ip6
:
486 ipsec_setsocket(m
, tp
? tp
->t_inpcb
->inp_socket
: NULL
);
490 (void)ip6_output(m
, NULL
, ro6
, 0, NULL
, NULL
);
491 if (ro6
== &sro6
&& ro6
->ro_rt
)
495 (void)ip_output(m
, NULL
, ro
, 0, NULL
);
496 if (ro
== &sro
&& ro
->ro_rt
) {
505 * Create a new TCP control block, making an
506 * empty reassembly queue and hooking it to the argument
507 * protocol control block. The `inp' parameter must have
508 * come from the zone allocator set up in tcp_init().
515 register struct tcpcb
*tp
;
516 register struct socket
*so
= inp
->inp_socket
;
518 int isipv6
= (inp
->inp_vflag
& INP_IPV6
) != 0;
522 if (so
->cached_in_sock_layer
== 0) {
523 it
= (struct inp_tp
*)inp
;
527 tp
= (struct tcpcb
*) inp
->inp_saved_ppcb
;
529 bzero((char *) tp
, sizeof(struct tcpcb
));
530 tp
->segq
.lh_first
= NULL
;
531 tp
->t_maxseg
= tp
->t_maxopd
=
533 isipv6
? tcp_v6mssdflt
:
539 tp
->t_flags
= (TF_REQ_SCALE
|TF_REQ_TSTMP
);
541 tp
->t_flags
|= TF_REQ_CC
;
542 tp
->t_inpcb
= inp
; /* XXX */
544 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
545 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
546 * reasonable initial retransmit time.
548 tp
->t_srtt
= TCPTV_SRTTBASE
;
549 tp
->t_rttvar
= ((TCPTV_RTOBASE
- TCPTV_SRTTBASE
) << TCP_RTTVAR_SHIFT
) / 4;
550 tp
->t_rttmin
= TCPTV_MIN
;
551 tp
->t_rxtcur
= TCPTV_RTOBASE
;
552 tp
->snd_cwnd
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
553 tp
->snd_ssthresh
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
555 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
556 * because the socket may be bound to an IPv6 wildcard address,
557 * which may match an IPv4-mapped IPv6 address.
558 * XXX: is there a better approach?
560 inp
->inp_ip_ttl
= ip_defttl
;
561 inp
->inp_ppcb
= (caddr_t
)tp
;
562 return (tp
); /* XXX */
566 * Drop a TCP connection, reporting
567 * the specified error. If connection is synchronized,
568 * then send a RST to peer.
572 register struct tcpcb
*tp
;
575 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
579 case TCPS_ESTABLISHED
:
580 case TCPS_FIN_WAIT_1
:
582 case TCPS_CLOSE_WAIT
:
584 current_active_connections
--;
588 if (TCPS_HAVERCVDSYN(tp
->t_state
)) {
589 tp
->t_state
= TCPS_CLOSED
;
590 (void) tcp_output(tp
);
591 tcpstat
.tcps_drops
++;
593 tcpstat
.tcps_conndrops
++;
594 if (errno
== ETIMEDOUT
&& tp
->t_softerror
)
595 errno
= tp
->t_softerror
;
596 so
->so_error
= errno
;
597 return (tcp_close(tp
));
601 * Close a TCP control block:
602 * discard all space held by the tcp
603 * discard internet protocol block
604 * wake up any sleepers
608 register struct tcpcb
*tp
;
610 register struct mbuf
*q
;
611 register struct mbuf
*nq
;
612 struct inpcb
*inp
= tp
->t_inpcb
;
613 struct socket
*so
= inp
->inp_socket
;
615 int isipv6
= INP_CHECK_SOCKAF(so
, AF_INET6
);
617 register struct rtentry
*rt
;
621 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE
| DBG_FUNC_START
, tp
,0,0,0,0);
624 case TCPS_ESTABLISHED
:
625 case TCPS_FIN_WAIT_1
:
627 case TCPS_CLOSE_WAIT
:
629 current_active_connections
--;
635 * If we got enough samples through the srtt filter,
636 * save the rtt and rttvar in the routing entry.
637 * 'Enough' is arbitrarily defined as the 16 samples.
638 * 16 samples is enough for the srtt filter to converge
639 * to within 5% of the correct value; fewer samples and
640 * we could save a very bogus rtt.
642 * Don't update the default route's characteristics and don't
643 * update anything that the user "locked".
645 if (tp
->t_rttupdated
>= 16) {
646 register u_long i
= 0;
649 struct sockaddr_in6
*sin6
;
651 if ((rt
= inp
->in6p_route
.ro_rt
) == NULL
)
653 sin6
= (struct sockaddr_in6
*)rt_key(rt
);
654 if (IN6_IS_ADDR_UNSPECIFIED(&sin6
->sin6_addr
))
659 if ((rt
= inp
->inp_route
.ro_rt
) == NULL
||
660 ((struct sockaddr_in
*)rt_key(rt
))->sin_addr
.s_addr
664 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTT
) == 0) {
666 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTT_SCALE
));
667 if (rt
->rt_rmx
.rmx_rtt
&& i
)
669 * filter this update to half the old & half
670 * the new values, converting scale.
671 * See route.h and tcp_var.h for a
672 * description of the scaling constants.
675 (rt
->rt_rmx
.rmx_rtt
+ i
) / 2;
677 rt
->rt_rmx
.rmx_rtt
= i
;
678 tcpstat
.tcps_cachedrtt
++;
680 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTTVAR
) == 0) {
682 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTTVAR_SCALE
));
683 if (rt
->rt_rmx
.rmx_rttvar
&& i
)
684 rt
->rt_rmx
.rmx_rttvar
=
685 (rt
->rt_rmx
.rmx_rttvar
+ i
) / 2;
687 rt
->rt_rmx
.rmx_rttvar
= i
;
688 tcpstat
.tcps_cachedrttvar
++;
691 * The old comment here said:
692 * update the pipelimit (ssthresh) if it has been updated
693 * already or if a pipesize was specified & the threshhold
694 * got below half the pipesize. I.e., wait for bad news
695 * before we start updating, then update on both good
698 * But we want to save the ssthresh even if no pipesize is
699 * specified explicitly in the route, because such
700 * connections still have an implicit pipesize specified
701 * by the global tcp_sendspace. In the absence of a reliable
702 * way to calculate the pipesize, it will have to do.
704 i
= tp
->snd_ssthresh
;
705 if (rt
->rt_rmx
.rmx_sendpipe
!= 0)
706 dosavessthresh
= (i
< rt
->rt_rmx
.rmx_sendpipe
/ 2);
708 dosavessthresh
= (i
< so
->so_snd
.sb_hiwat
/ 2);
709 if (((rt
->rt_rmx
.rmx_locks
& RTV_SSTHRESH
) == 0 &&
710 i
!= 0 && rt
->rt_rmx
.rmx_ssthresh
!= 0)
713 * convert the limit from user data bytes to
714 * packets then to packet data bytes.
716 i
= (i
+ tp
->t_maxseg
/ 2) / tp
->t_maxseg
;
719 i
*= (u_long
)(tp
->t_maxseg
+
721 isipv6
? sizeof (struct tcpip6hdr
) :
723 sizeof (struct tcpiphdr
));
724 if (rt
->rt_rmx
.rmx_ssthresh
)
725 rt
->rt_rmx
.rmx_ssthresh
=
726 (rt
->rt_rmx
.rmx_ssthresh
+ i
) / 2;
728 rt
->rt_rmx
.rmx_ssthresh
= i
;
729 tcpstat
.tcps_cachedssthresh
++;
733 /* free the reassembly queue, if any */
734 (void) tcp_freeq(tp
);
737 (void) m_free(dtom(tp
->t_template
));
739 if (so
->cached_in_sock_layer
)
740 inp
->inp_saved_ppcb
= (caddr_t
) tp
;
742 inp
->inp_ppcb
= NULL
;
743 soisdisconnected(so
);
750 tcpstat
.tcps_closed
++;
751 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE
| DBG_FUNC_END
, tcpstat
.tcps_closed
,0,0,0,0);
752 return ((struct tcpcb
*)0);
759 register struct ipqent
*qe
;
762 while ((qe
= tp
->segq
.lh_first
) != NULL
) {
763 LIST_REMOVE(qe
, ipqe_q
);
778 * Notify a tcp user of an asynchronous error;
779 * store error as soft error, but wake up user
780 * (for now, won't do anything until can select for soft error).
783 tcp_notify(inp
, error
)
787 register struct tcpcb
*tp
= (struct tcpcb
*)inp
->inp_ppcb
;
788 register struct socket
*so
= inp
->inp_socket
;
791 * Ignore some errors if we are hooked up.
792 * If connection hasn't completed, has retransmitted several times,
793 * and receives a second error, give up now. This is better
794 * than waiting a long time to establish a connection that
795 * can never complete.
797 if (tp
->t_state
== TCPS_ESTABLISHED
&&
798 (error
== EHOSTUNREACH
|| error
== ENETUNREACH
||
799 error
== EHOSTDOWN
)) {
801 } else if (tp
->t_state
< TCPS_ESTABLISHED
&& tp
->t_rxtshift
> 3 &&
803 so
->so_error
= error
;
805 tp
->t_softerror
= error
;
806 wakeup((caddr_t
) &so
->so_timeo
);
813 tcp_pcblist SYSCTL_HANDLER_ARGS
816 struct inpcb
*inp
, **inp_list
;
821 * The process of preparing the TCB list is too time-consuming and
822 * resource-intensive to repeat twice on every request.
824 if (req
->oldptr
== 0) {
825 n
= tcbinfo
.ipi_count
;
826 req
->oldidx
= 2 * (sizeof xig
)
827 + (n
+ n
/8) * sizeof(struct xtcpcb
);
831 if (req
->newptr
!= 0)
835 * OK, now we're committed to doing something.
838 gencnt
= tcbinfo
.ipi_gencnt
;
839 n
= tcbinfo
.ipi_count
;
842 xig
.xig_len
= sizeof xig
;
844 xig
.xig_gen
= gencnt
;
845 xig
.xig_sogen
= so_gencnt
;
846 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
850 inp_list
= _MALLOC(n
* sizeof *inp_list
, M_TEMP
, M_WAITOK
);
855 for (inp
= tcbinfo
.listhead
->lh_first
, i
= 0; inp
&& i
< n
;
856 inp
= inp
->inp_list
.le_next
) {
857 if (inp
->inp_gencnt
<= gencnt
)
864 for (i
= 0; i
< n
; i
++) {
866 if (inp
->inp_gencnt
<= gencnt
) {
868 xt
.xt_len
= sizeof xt
;
869 /* XXX should avoid extra copy */
870 bcopy(inp
, &xt
.xt_inp
, sizeof *inp
);
871 bcopy(inp
->inp_ppcb
, &xt
.xt_tp
, sizeof xt
.xt_tp
);
873 sotoxsocket(inp
->inp_socket
, &xt
.xt_socket
);
874 error
= SYSCTL_OUT(req
, &xt
, sizeof xt
);
879 * Give the user an updated idea of our state.
880 * If the generation differs from what we told
881 * her before, she knows that something happened
882 * while we were processing this request, and it
883 * might be necessary to retry.
886 xig
.xig_gen
= tcbinfo
.ipi_gencnt
;
887 xig
.xig_sogen
= so_gencnt
;
888 xig
.xig_count
= tcbinfo
.ipi_count
;
890 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
892 FREE(inp_list
, M_TEMP
);
897 SYSCTL_PROC(_net_inet_tcp
, TCPCTL_PCBLIST
, pcblist
, CTLFLAG_RD
, 0, 0,
898 tcp_pcblist
, "S,xtcpcb", "List of active TCP connections");
901 tcp_ctlinput(cmd
, sa
, vip
)
906 register struct ip
*ip
= vip
;
907 register struct tcphdr
*th
;
908 void (*notify
) __P((struct inpcb
*, int)) = tcp_notify
;
910 if (cmd
== PRC_QUENCH
)
912 else if (cmd
== PRC_MSGSIZE
)
913 notify
= tcp_mtudisc
;
914 else if (!PRC_IS_REDIRECT(cmd
) &&
915 ((unsigned)cmd
> PRC_NCMDS
|| inetctlerrmap
[cmd
] == 0))
918 th
= (struct tcphdr
*)((caddr_t
)ip
919 + (IP_VHL_HL(ip
->ip_vhl
) << 2));
920 in_pcbnotify(&tcb
, sa
, th
->th_dport
, ip
->ip_src
, th
->th_sport
,
923 in_pcbnotify(&tcb
, sa
, 0, zeroin_addr
, 0, cmd
, notify
);
928 tcp6_ctlinput(cmd
, sa
, d
)
933 register struct tcphdr
*thp
;
935 void (*notify
) __P((struct inpcb
*, int)) = tcp_notify
;
936 struct sockaddr_in6 sa6
;
941 if (sa
->sa_family
!= AF_INET6
||
942 sa
->sa_len
!= sizeof(struct sockaddr_in6
))
945 if (cmd
== PRC_QUENCH
)
947 else if (cmd
== PRC_MSGSIZE
)
948 notify
= tcp_mtudisc
;
949 else if (!PRC_IS_REDIRECT(cmd
) &&
950 ((unsigned)cmd
> PRC_NCMDS
|| inet6ctlerrmap
[cmd
] == 0))
953 /* if the parameter is from icmp6, decode it. */
955 struct ip6ctlparam
*ip6cp
= (struct ip6ctlparam
*)d
;
957 ip6
= ip6cp
->ip6c_ip6
;
958 off
= ip6cp
->ip6c_off
;
964 /* translate addresses into internal form */
965 sa6
= *(struct sockaddr_in6
*)sa
;
966 if (IN6_IS_ADDR_LINKLOCAL(&sa6
.sin6_addr
) && m
&& m
->m_pkthdr
.rcvif
)
967 sa6
.sin6_addr
.s6_addr16
[1] = htons(m
->m_pkthdr
.rcvif
->if_index
);
971 * XXX: We assume that when IPV6 is non NULL,
972 * M and OFF are valid.
976 /* translate addresses into internal form */
977 memcpy(&s
, &ip6
->ip6_src
, sizeof(s
));
978 if (IN6_IS_ADDR_LINKLOCAL(&s
))
979 s
.s6_addr16
[1] = htons(m
->m_pkthdr
.rcvif
->if_index
);
982 if (m
->m_len
< off
+ sizeof(*thp
)) {
984 * this should be rare case,
985 * so we compromise on this copy...
987 m_copydata(m
, off
, sizeof(th
), (caddr_t
)&th
);
990 thp
= (struct tcphdr
*)(mtod(m
, caddr_t
) + off
);
991 in6_pcbnotify(&tcb
, (struct sockaddr
*)&sa6
, thp
->th_dport
,
992 &s
, thp
->th_sport
, cmd
, notify
);
994 in6_pcbnotify(&tcb
, (struct sockaddr
*)&sa6
, 0, &zeroin6_addr
,
1000 * When a source quench is received, close congestion window
1001 * to one segment. We will gradually open it again as we proceed.
1004 tcp_quench(inp
, errno
)
1008 struct tcpcb
*tp
= intotcpcb(inp
);
1011 tp
->snd_cwnd
= tp
->t_maxseg
;
1015 * When `need fragmentation' ICMP is received, update our idea of the MSS
1016 * based on the new value in the route. Also nudge TCP to send something,
1017 * since we know the packet we just sent was dropped.
1018 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1021 tcp_mtudisc(inp
, errno
)
1025 struct tcpcb
*tp
= intotcpcb(inp
);
1027 struct rmxp_tao
*taop
;
1028 struct socket
*so
= inp
->inp_socket
;
1032 int isipv6
= (tp
->t_inpcb
->inp_vflag
& INP_IPV4
) == 0;
1038 rt
= tcp_rtlookup6(inp
);
1041 rt
= tcp_rtlookup(inp
);
1042 if (!rt
|| !rt
->rt_rmx
.rmx_mtu
) {
1043 tp
->t_maxopd
= tp
->t_maxseg
=
1045 isipv6
? tcp_v6mssdflt
:
1050 taop
= rmx_taop(rt
->rt_rmx
);
1051 offered
= taop
->tao_mssopt
;
1052 mss
= rt
->rt_rmx
.rmx_mtu
-
1055 sizeof(struct tcpip6hdr
) :
1057 sizeof(struct tcpiphdr
)
1064 mss
= min(mss
, offered
);
1066 * XXX - The above conditional probably violates the TCP
1067 * spec. The problem is that, since we don't know the
1068 * other end's MSS, we are supposed to use a conservative
1069 * default. But, if we do that, then MTU discovery will
1070 * never actually take place, because the conservative
1071 * default is much less than the MTUs typically seen
1072 * on the Internet today. For the moment, we'll sweep
1073 * this under the carpet.
1075 * The conservative default might not actually be a problem
1076 * if the only case this occurs is when sending an initial
1077 * SYN with options and data to a host we've never talked
1078 * to before. Then, they will reply with an MSS value which
1079 * will get recorded and the new parameters should get
1080 * recomputed. For Further Study.
1082 if (tp
->t_maxopd
<= mss
)
1086 if ((tp
->t_flags
& (TF_REQ_TSTMP
|TF_NOOPT
)) == TF_REQ_TSTMP
&&
1087 (tp
->t_flags
& TF_RCVD_TSTMP
) == TF_RCVD_TSTMP
)
1088 mss
-= TCPOLEN_TSTAMP_APPA
;
1089 if ((tp
->t_flags
& (TF_REQ_CC
|TF_NOOPT
)) == TF_REQ_CC
&&
1090 (tp
->t_flags
& TF_RCVD_CC
) == TF_RCVD_CC
)
1091 mss
-= TCPOLEN_CC_APPA
;
1092 #if (MCLBYTES & (MCLBYTES - 1)) == 0
1094 mss
&= ~(MCLBYTES
-1);
1097 mss
= mss
/ MCLBYTES
* MCLBYTES
;
1099 if (so
->so_snd
.sb_hiwat
< mss
)
1100 mss
= so
->so_snd
.sb_hiwat
;
1104 tcpstat
.tcps_mturesent
++;
1106 tp
->snd_nxt
= tp
->snd_una
;
1112 * Look-up the routing entry to the peer of this inpcb. If no route
1113 * is found and it cannot be allocated the return NULL. This routine
1114 * is called by TCP routines that access the rmx structure and by tcp_mss
1115 * to get the interface MTU.
1124 ro
= &inp
->inp_route
;
1126 if (rt
== NULL
|| !(rt
->rt_flags
& RTF_UP
)) {
1127 /* No route yet, so try to acquire one */
1128 if (inp
->inp_faddr
.s_addr
!= INADDR_ANY
) {
1129 ro
->ro_dst
.sa_family
= AF_INET
;
1130 ro
->ro_dst
.sa_len
= sizeof(ro
->ro_dst
);
1131 ((struct sockaddr_in
*) &ro
->ro_dst
)->sin_addr
=
1145 struct route_in6
*ro6
;
1148 ro6
= &inp
->in6p_route
;
1150 if (rt
== NULL
|| !(rt
->rt_flags
& RTF_UP
)) {
1151 /* No route yet, so try to acquire one */
1152 if (!IN6_IS_ADDR_UNSPECIFIED(&inp
->in6p_faddr
)) {
1153 ro6
->ro_dst
.sin6_family
= AF_INET6
;
1154 ro6
->ro_dst
.sin6_len
= sizeof(ro6
->ro_dst
);
1155 ro6
->ro_dst
.sin6_addr
= inp
->in6p_faddr
;
1156 rtalloc((struct route
*)ro6
);
1165 /* compute ESP/AH header size for TCP, including outer IP header. */
1167 ipsec_hdrsiz_tcp(tp
, isipv6
)
1178 struct ip6_hdr
*ip6
= NULL
;
1182 if (!tp
|| !tp
->t_template
|| !(inp
= tp
->t_inpcb
))
1184 MGETHDR(m
, M_DONTWAIT
, MT_DATA
);
1190 ip6
= mtod(m
, struct ip6_hdr
*);
1191 th
= (struct tcphdr
*)(ip6
+ 1);
1192 m
->m_pkthdr
.len
= m
->m_len
= sizeof(struct tcpip6hdr
);
1193 bcopy((caddr_t
)&tp
->t_template
->tt_i6
, (caddr_t
)ip6
,
1194 sizeof(struct ip6_hdr
));
1195 bcopy((caddr_t
)&tp
->t_template
->tt_t
, (caddr_t
)th
,
1196 sizeof(struct tcphdr
));
1199 ip
= mtod(m
, struct ip
*);
1200 th
= (struct tcphdr
*)(ip
+ 1);
1201 m
->m_pkthdr
.len
= m
->m_len
= sizeof(struct tcpiphdr
);
1202 bcopy((caddr_t
)&tp
->t_template
->tt_i
, (caddr_t
)ip
, sizeof(struct ip
));
1203 bcopy((caddr_t
)&tp
->t_template
->tt_t
, (caddr_t
)th
,
1204 sizeof(struct tcphdr
));
1211 hdrsiz
= ipsec6_hdrsiz(m
, IPSEC_DIR_OUTBOUND
, inp
);
1214 hdrsiz
= ipsec4_hdrsiz(m
, IPSEC_DIR_OUTBOUND
, inp
);
1222 * Return a pointer to the cached information about the remote host.
1223 * The cached information is stored in the protocol specific part of
1224 * the route metrics.
1227 tcp_gettaocache(inp
)
1231 int isipv6
= (inp
->inp_vflag
& INP_IPV4
) == 0;
1237 rt
= tcp_rtlookup6(inp
);
1240 rt
= tcp_rtlookup(inp
);
1242 /* Make sure this is a host route and is up. */
1244 (rt
->rt_flags
& (RTF_UP
|RTF_HOST
)) != (RTF_UP
|RTF_HOST
))
1247 return rmx_taop(rt
->rt_rmx
);
1251 * Clear all the TAO cache entries, called from tcp_init.
1254 * This routine is just an empty one, because we assume that the routing
1255 * routing tables are initialized at the same time when TCP, so there is
1256 * nothing in the cache left over.