2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
23 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
24 * The Regents of the University of California. All rights reserved.
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 * must display the following acknowledgement:
36 * This product includes software developed by the University of
37 * California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 * may be used to endorse or promote products derived from this software
40 * without specific prior written permission.
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
58 #include "opt_compat.h"
59 #include "opt_tcpdebug.h"
62 #include <sys/param.h>
63 #include <sys/systm.h>
64 #include <sys/kernel.h>
65 #include <sys/sysctl.h>
66 #include <sys/malloc.h>
68 #include <sys/domain.h>
69 #include <sys/socket.h>
70 #include <sys/socketvar.h>
71 #include <sys/protosw.h>
72 #include <sys/syslog.h>
76 #include <vm/vm_zone.h>
79 #include <net/route.h>
83 #include <netinet/in.h>
84 #include <netinet/in_systm.h>
85 #include <netinet/ip.h>
86 #include <netinet/in_pcb.h>
87 #include <netinet/in_var.h>
88 #include <netinet/ip_var.h>
90 #include <netinet/ip6.h>
91 #include <netinet6/ip6_var.h>
92 #include <netinet6/in6_pcb.h>
94 #include <netinet/tcp.h>
95 #include <netinet/tcp_fsm.h>
96 #include <netinet/tcp_seq.h>
97 #include <netinet/tcp_timer.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcpip.h>
101 #include <netinet/tcp_debug.h>
103 #include <netinet6/ip6protosw.h>
106 #include <netinet6/ipsec.h>
109 #include <sys/kdebug.h>
/* kdebug trace code for tcp_close(): DBG_NETTCP class, code ((5 << 8) | 2). */
#define DBG_FNC_TCP_CLOSE	NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
#ifndef offsetof	/* XXX */
/* Fallback: byte offset of `member' within `type', used when the
 * <stddef.h> definition is not already in scope. */
#define offsetof(type, member) ((size_t)(&((type *)0)->member))
#endif
117 int tcp_mssdflt
= TCP_MSS
;
118 SYSCTL_INT(_net_inet_tcp
, TCPCTL_MSSDFLT
, mssdflt
,
119 CTLFLAG_RW
, &tcp_mssdflt
, 0, "");
121 int tcp_v6mssdflt
= TCP6_MSS
;
122 SYSCTL_INT(_net_inet_tcp
, TCPCTL_V6MSSDFLT
, v6mssdflt
,
123 CTLFLAG_RW
, &tcp_v6mssdflt
, 0, "");
125 static int tcp_rttdflt
= TCPTV_SRTTDFLT
/ PR_SLOWHZ
;
126 SYSCTL_INT(_net_inet_tcp
, TCPCTL_RTTDFLT
, rttdflt
,
127 CTLFLAG_RW
, &tcp_rttdflt
, 0, "");
129 static int tcp_do_rfc1323
= 1;
130 SYSCTL_INT(_net_inet_tcp
, TCPCTL_DO_RFC1323
, rfc1323
,
131 CTLFLAG_RW
, &tcp_do_rfc1323
, 0, "");
133 static int tcp_do_rfc1644
= 0;
134 SYSCTL_INT(_net_inet_tcp
, TCPCTL_DO_RFC1644
, rfc1644
,
135 CTLFLAG_RW
, &tcp_do_rfc1644
, 0, "");
137 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, pcbcount
, CTLFLAG_RD
, &tcbinfo
.ipi_count
,
138 0, "Number of active PCBs");
140 static void tcp_cleartaocache
__P((void));
141 static void tcp_notify
__P((struct inpcb
*, int));
142 extern u_long current_active_connections
;
/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#define TCBHASHSIZE	4096
/*
 * This is the actual shape of what we allocate using the zone
 * allocator. Doing it this way allows us to protect both structures
 * using the same generation count, and also eliminates the overhead
 * of allocating tcpcbs separately. By hiding the structure here,
 * we avoid changing most of the rest of the code (although it needs
 * to be changed, eventually, for greater efficiency).
 */
/* Rounding mask; ALIGNMENT must be a power of two for (x + ALIGNM1) & ~ALIGNM1. */
#define ALIGNM1	(ALIGNMENT - 1)
170 char align
[(sizeof(struct inpcb
) + ALIGNM1
) & ~ALIGNM1
];
177 static struct tcpcb dummy_tcb
;
180 extern struct inpcbhead time_wait_slots
[];
181 extern int cur_tw_slot
;
182 extern u_long
*delack_bitmask
;
185 int get_inpcb_str_size()
187 return sizeof(struct inpcb
);
191 int get_tcp_str_size()
193 return sizeof(struct tcpcb
);
196 int tcp_freeq
__P((struct tcpcb
*tp
));
211 #endif /* TCP_COMPAT_42 */
215 tcbinfo
.listhead
= &tcb
;
216 if (!(getenv_int("net.inet.tcp.tcbhashsize", &hashsize
)))
217 hashsize
= TCBHASHSIZE
;
218 if (!powerof2(hashsize
)) {
219 printf("WARNING: TCB hash size not a power of 2\n");
220 hashsize
= 512; /* safe default */
222 tcbinfo
.hashsize
= hashsize
;
223 tcbinfo
.hashbase
= hashinit(hashsize
, M_PCB
, &tcbinfo
.hashmask
);
224 tcbinfo
.porthashbase
= hashinit(hashsize
, M_PCB
,
225 &tcbinfo
.porthashmask
);
227 tcbinfo
.ipi_zone
= (void *) zinit("tcpcb", sizeof(struct inp_tp
), maxsockets
,
230 str_size
= (vm_size_t
) sizeof(struct inp_tp
);
231 tcbinfo
.ipi_zone
= (void *) zinit(str_size
, 120000*str_size
, 8192, "inpcb_zone");
234 #define TCP_LGHDR (sizeof(struct tcpip6hdr))
236 #define TCP_LGHDR (sizeof(struct tcpiphdr))
238 if (max_protohdr
< TCP_LGHDR
)
239 max_protohdr
= TCP_LGHDR
;
240 if ((max_linkhdr
+ TCP_LGHDR
) > MHLEN
)
243 tcbinfo
.last_pcb
= 0;
244 dummy_tcb
.t_state
= TCP_NSTATES
;
245 dummy_tcb
.t_flags
= 0;
246 tcbinfo
.dummy_cb
= (caddr_t
) &dummy_tcb
;
247 in_pcb_nat_init(&tcbinfo
, AF_INET
, IPPROTO_TCP
, SOCK_STREAM
);
249 delack_bitmask
= _MALLOC((4 * hashsize
)/32, M_PCB
, M_WAITOK
);
250 if (delack_bitmask
== 0)
251 panic("Delack Memory");
253 for (i
=0; i
< (tcbinfo
.hashsize
/ 32); i
++)
254 delack_bitmask
[i
] = 0;
256 for (i
=0; i
< N_TIME_WAIT_SLOTS
; i
++) {
257 LIST_INIT(&time_wait_slots
[i
]);
/*
 * Create template to be used to send tcp packets on a connection.
 * Call after host entry created, allocates an mbuf and fills
 * in a skeletal tcp/ip header, minimizing the amount of work
 * necessary when the connection is used.
 */
272 register struct inpcb
*inp
= tp
->t_inpcb
;
273 register struct mbuf
*m
;
274 register struct tcptemp
*n
;
276 if ((n
= tp
->t_template
) == 0) {
277 m
= m_get(M_DONTWAIT
, MT_HEADER
);
280 m
->m_len
= sizeof (struct tcptemp
);
281 n
= mtod(m
, struct tcptemp
*);
283 bzero(n
->tt_x1
, sizeof(n
->tt_x1
));
284 n
->tt_pr
= IPPROTO_TCP
;
285 n
->tt_len
= htons(sizeof (struct tcpiphdr
) - sizeof (struct ip
));
286 n
->tt_src
= inp
->inp_laddr
;
287 n
->tt_dst
= inp
->inp_faddr
;
288 n
->tt_sport
= inp
->inp_lport
;
289 n
->tt_dport
= inp
->inp_fport
;
299 n
->tt_t
.th_sum
= in_pseudo(n
->tt_src
.s_addr
, n
->tt_dst
.s_addr
,
300 htons(sizeof(struct tcphdr
) + IPPROTO_TCP
));
303 n
->tt_flow
= inp
->inp_flow
& IPV6_FLOWINFO_MASK
;
304 if (ip6_auto_flowlabel
) {
305 n
->tt_flow
&= ~IPV6_FLOWLABEL_MASK
;
306 n
->tt_flow
|= (htonl(ip6_flow_seq
++) & IPV6_FLOWLABEL_MASK
);
308 n
->tt_vfc
|= IPV6_VERSION
;
309 n
->tt_pr6
= IPPROTO_TCP
;
310 n
->tt_len6
= n
->tt_len
;
311 n
->tt_src6
= inp
->in6p_laddr
;
312 n
->tt_dst6
= inp
->in6p_faddr
;
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header. If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection tp->t_template. If flags are given
 * then we send a message back to the TCP which originated the
 * segment ti, and discard the mbuf containing it and any other
 * attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
333 tcp_respond(tp
, iph
, th
, m
, ack
, seq
, flags
, isipv6
)
336 register struct tcphdr
*th
;
337 register struct mbuf
*m
;
346 struct route
*ro
= 0;
349 struct tcpiphdr
*ti
= iph
;
352 struct route_in6
*ro6
= 0;
353 struct route_in6 sro6
;
354 struct ip6_hdr
*ip6
= iph
;
355 struct tcpip6hdr
*ti6
= iph
;
359 if (!(flags
& TH_RST
))
360 win
= sbspace(&tp
->t_inpcb
->inp_socket
->so_rcv
);
363 ro6
= &tp
->t_inpcb
->in6p_route
;
366 ro
= &tp
->t_inpcb
->inp_route
;
371 bzero(ro6
, sizeof *ro6
);
375 bzero(ro
, sizeof *ro
);
381 m
= m_gethdr(M_DONTWAIT
, MT_HEADER
);
389 m
->m_data
+= max_linkhdr
;
392 ti6
= mtod(m
, struct tcpip6hdr
*);
393 bcopy((caddr_t
)ip6
, (caddr_t
)&ti6
->ti6_i
,
394 sizeof(struct ip6_hdr
));
399 ti
= mtod(m
, struct tcpiphdr
*);
400 bcopy((caddr_t
)ip
, (caddr_t
)&ti
->ti_i
, sizeof(struct ip
));
401 ip
= (struct ip
*)&ti
->ti_i
;
406 bcopy((caddr_t
)th
, (caddr_t
)nth
, sizeof(struct tcphdr
));
411 m
->m_data
= (caddr_t
)ti
;
412 /* m_len is set later */
/*
 * Swap two lvalues of the given type.  Wrapped in do { } while (0) so the
 * expansion is a single statement and stays correct in unbraced if/else
 * bodies.  NOTE: each argument is evaluated more than once.
 */
#define xchg(a,b,type) do { type t; t=a; a=b; b=t; } while (0)
420 ip6
->ip6_dst
= ip6
->ip6_src
;
422 nth
= (struct tcphdr
*)(ip6
+ 1);
425 * this is the case if an extension header
426 * exists between the IPv6 header and the
429 nth
->th_sport
= th
->th_sport
;
430 nth
->th_dport
= th
->th_dport
;
434 xchg(ti
->ti_dst
.s_addr
, ti
->ti_src
.s_addr
, n_long
);
439 xchg(nth
->th_dport
, nth
->th_sport
, n_short
);
442 nth
->th_seq
= htonl(seq
);
443 nth
->th_ack
= htonl(ack
);
445 nth
->th_off
= sizeof (struct tcphdr
) >> 2;
446 nth
->th_flags
= flags
;
448 nth
->th_win
= htons((u_short
) (win
>> tp
->rcv_scale
));
450 nth
->th_win
= htons((u_short
)win
);
452 tlen
+= sizeof (struct tcphdr
);
455 m
->m_len
= tlen
+ sizeof(struct ip6_hdr
);
456 m
->m_pkthdr
.len
= tlen
+ sizeof(struct ip6_hdr
);
457 m
->m_pkthdr
.rcvif
= (struct ifnet
*) 0;
458 ip6
->ip6_plen
= htons((u_short
)tlen
);
459 ip6
->ip6_nxt
= IPPROTO_TCP
;
460 ip6
->ip6_hlim
= in6_selecthlim(tp
? tp
->t_inpcb
: NULL
,
464 nth
->th_sum
= in6_cksum(m
, IPPROTO_TCP
,
465 sizeof(struct ip6_hdr
), tlen
);
466 ip6
->ip6_flow
&= ~IPV6_FLOWLABEL_MASK
;
467 if (ip6_auto_flowlabel
) {
469 (htonl(ip6_flow_seq
++) & IPV6_FLOWLABEL_MASK
);
473 ti
->ti_len
= htons((u_short
)(tlen
));
474 m
->m_len
= tlen
+ sizeof(struct ip
);
475 m
->m_pkthdr
.len
= tlen
+ sizeof(struct ip
);
476 m
->m_pkthdr
.rcvif
= (struct ifnet
*) 0;
477 nth
->th_sum
= in_pseudo(ip
->ip_src
.s_addr
, ip
->ip_dst
.s_addr
,
478 htons((u_short
)(tlen
+ IPPROTO_TCP
)));
479 m
->m_pkthdr
.csum_flags
= CSUM_TCP
;
480 m
->m_pkthdr
.csum_data
= offsetof(struct tcphdr
, th_sum
);
482 ip
->ip_len
= tlen
+ sizeof (struct ip
);
483 ip
->ip_ttl
= ip_defttl
;
488 if (tp
== NULL
|| (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
))
489 tcp_trace(TA_OUTPUT
, 0, tp
,
491 isipv6
? (void *)ip6
:
497 ipsec_setsocket(m
, tp
? tp
->t_inpcb
->inp_socket
: NULL
);
501 (void)ip6_output(m
, NULL
, ro6
, 0, NULL
, NULL
);
502 if (ro6
== &sro6
&& ro6
->ro_rt
)
506 (void)ip_output(m
, NULL
, ro
, 0, NULL
);
507 if (ro
== &sro
&& ro
->ro_rt
) {
/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block. The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
526 register struct tcpcb
*tp
;
527 register struct socket
*so
= inp
->inp_socket
;
529 int isipv6
= (inp
->inp_vflag
& INP_IPV6
) != 0;
533 if (so
->cached_in_sock_layer
== 0) {
534 it
= (struct inp_tp
*)inp
;
538 tp
= (struct tcpcb
*) inp
->inp_saved_ppcb
;
540 bzero((char *) tp
, sizeof(struct tcpcb
));
541 tp
->segq
.lh_first
= NULL
;
542 tp
->t_maxseg
= tp
->t_maxopd
=
544 isipv6
? tcp_v6mssdflt
:
550 tp
->t_flags
= (TF_REQ_SCALE
|TF_REQ_TSTMP
);
552 tp
->t_flags
|= TF_REQ_CC
;
553 tp
->t_inpcb
= inp
; /* XXX */
555 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
556 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
557 * reasonable initial retransmit time.
559 tp
->t_srtt
= TCPTV_SRTTBASE
;
560 tp
->t_rttvar
= ((TCPTV_RTOBASE
- TCPTV_SRTTBASE
) << TCP_RTTVAR_SHIFT
) / 4;
561 tp
->t_rttmin
= TCPTV_MIN
;
562 tp
->t_rxtcur
= TCPTV_RTOBASE
;
563 tp
->snd_cwnd
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
564 tp
->snd_ssthresh
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
566 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
567 * because the socket may be bound to an IPv6 wildcard address,
568 * which may match an IPv4-mapped IPv6 address.
569 * XXX: is there a better approach?
571 inp
->inp_ip_ttl
= ip_defttl
;
572 inp
->inp_ppcb
= (caddr_t
)tp
;
573 return (tp
); /* XXX */
/*
 * Drop a TCP connection, reporting
 * the specified error. If connection is synchronized,
 * then send a RST to peer.
 */
583 register struct tcpcb
*tp
;
586 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
590 case TCPS_ESTABLISHED
:
591 case TCPS_FIN_WAIT_1
:
593 case TCPS_CLOSE_WAIT
:
595 current_active_connections
--;
599 if (TCPS_HAVERCVDSYN(tp
->t_state
)) {
600 tp
->t_state
= TCPS_CLOSED
;
601 (void) tcp_output(tp
);
602 tcpstat
.tcps_drops
++;
604 tcpstat
.tcps_conndrops
++;
605 if (errno
== ETIMEDOUT
&& tp
->t_softerror
)
606 errno
= tp
->t_softerror
;
607 so
->so_error
= errno
;
608 return (tcp_close(tp
));
/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 */
619 register struct tcpcb
*tp
;
621 register struct mbuf
*q
;
622 register struct mbuf
*nq
;
623 struct inpcb
*inp
= tp
->t_inpcb
;
624 struct socket
*so
= inp
->inp_socket
;
626 int isipv6
= INP_CHECK_SOCKAF(so
, AF_INET6
);
628 register struct rtentry
*rt
;
632 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE
| DBG_FUNC_START
, tp
,0,0,0,0);
635 case TCPS_ESTABLISHED
:
636 case TCPS_FIN_WAIT_1
:
638 case TCPS_CLOSE_WAIT
:
640 current_active_connections
--;
646 * If we got enough samples through the srtt filter,
647 * save the rtt and rttvar in the routing entry.
648 * 'Enough' is arbitrarily defined as the 16 samples.
649 * 16 samples is enough for the srtt filter to converge
650 * to within 5% of the correct value; fewer samples and
651 * we could save a very bogus rtt.
653 * Don't update the default route's characteristics and don't
654 * update anything that the user "locked".
656 if (tp
->t_rttupdated
>= 16) {
657 register u_long i
= 0;
660 struct sockaddr_in6
*sin6
;
662 if ((rt
= inp
->in6p_route
.ro_rt
) == NULL
)
664 sin6
= (struct sockaddr_in6
*)rt_key(rt
);
665 if (IN6_IS_ADDR_UNSPECIFIED(&sin6
->sin6_addr
))
670 if ((rt
= inp
->inp_route
.ro_rt
) == NULL
||
671 ((struct sockaddr_in
*)rt_key(rt
))->sin_addr
.s_addr
675 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTT
) == 0) {
677 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTT_SCALE
));
678 if (rt
->rt_rmx
.rmx_rtt
&& i
)
680 * filter this update to half the old & half
681 * the new values, converting scale.
682 * See route.h and tcp_var.h for a
683 * description of the scaling constants.
686 (rt
->rt_rmx
.rmx_rtt
+ i
) / 2;
688 rt
->rt_rmx
.rmx_rtt
= i
;
689 tcpstat
.tcps_cachedrtt
++;
691 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTTVAR
) == 0) {
693 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTTVAR_SCALE
));
694 if (rt
->rt_rmx
.rmx_rttvar
&& i
)
695 rt
->rt_rmx
.rmx_rttvar
=
696 (rt
->rt_rmx
.rmx_rttvar
+ i
) / 2;
698 rt
->rt_rmx
.rmx_rttvar
= i
;
699 tcpstat
.tcps_cachedrttvar
++;
702 * The old comment here said:
703 * update the pipelimit (ssthresh) if it has been updated
704 * already or if a pipesize was specified & the threshhold
705 * got below half the pipesize. I.e., wait for bad news
706 * before we start updating, then update on both good
709 * But we want to save the ssthresh even if no pipesize is
710 * specified explicitly in the route, because such
711 * connections still have an implicit pipesize specified
712 * by the global tcp_sendspace. In the absence of a reliable
713 * way to calculate the pipesize, it will have to do.
715 i
= tp
->snd_ssthresh
;
716 if (rt
->rt_rmx
.rmx_sendpipe
!= 0)
717 dosavessthresh
= (i
< rt
->rt_rmx
.rmx_sendpipe
/ 2);
719 dosavessthresh
= (i
< so
->so_snd
.sb_hiwat
/ 2);
720 if (((rt
->rt_rmx
.rmx_locks
& RTV_SSTHRESH
) == 0 &&
721 i
!= 0 && rt
->rt_rmx
.rmx_ssthresh
!= 0)
724 * convert the limit from user data bytes to
725 * packets then to packet data bytes.
727 i
= (i
+ tp
->t_maxseg
/ 2) / tp
->t_maxseg
;
730 i
*= (u_long
)(tp
->t_maxseg
+
732 isipv6
? sizeof (struct tcpip6hdr
) :
734 sizeof (struct tcpiphdr
));
735 if (rt
->rt_rmx
.rmx_ssthresh
)
736 rt
->rt_rmx
.rmx_ssthresh
=
737 (rt
->rt_rmx
.rmx_ssthresh
+ i
) / 2;
739 rt
->rt_rmx
.rmx_ssthresh
= i
;
740 tcpstat
.tcps_cachedssthresh
++;
744 /* free the reassembly queue, if any */
745 (void) tcp_freeq(tp
);
748 (void) m_free(dtom(tp
->t_template
));
750 if (so
->cached_in_sock_layer
)
751 inp
->inp_saved_ppcb
= (caddr_t
) tp
;
753 inp
->inp_ppcb
= NULL
;
754 soisdisconnected(so
);
761 tcpstat
.tcps_closed
++;
762 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE
| DBG_FUNC_END
, tcpstat
.tcps_closed
,0,0,0,0);
763 return ((struct tcpcb
*)0);
770 register struct ipqent
*qe
;
773 while ((qe
= tp
->segq
.lh_first
) != NULL
) {
774 LIST_REMOVE(qe
, ipqe_q
);
/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 */
794 tcp_notify(inp
, error
)
798 register struct tcpcb
*tp
= (struct tcpcb
*)inp
->inp_ppcb
;
799 register struct socket
*so
= inp
->inp_socket
;
802 * Ignore some errors if we are hooked up.
803 * If connection hasn't completed, has retransmitted several times,
804 * and receives a second error, give up now. This is better
805 * than waiting a long time to establish a connection that
806 * can never complete.
808 if (tp
->t_state
== TCPS_ESTABLISHED
&&
809 (error
== EHOSTUNREACH
|| error
== ENETUNREACH
||
810 error
== EHOSTDOWN
)) {
812 } else if (tp
->t_state
< TCPS_ESTABLISHED
&& tp
->t_rxtshift
> 3 &&
814 so
->so_error
= error
;
816 tp
->t_softerror
= error
;
817 wakeup((caddr_t
) &so
->so_timeo
);
824 tcp_pcblist SYSCTL_HANDLER_ARGS
827 struct inpcb
*inp
, **inp_list
;
832 * The process of preparing the TCB list is too time-consuming and
833 * resource-intensive to repeat twice on every request.
835 if (req
->oldptr
== 0) {
836 n
= tcbinfo
.ipi_count
;
837 req
->oldidx
= 2 * (sizeof xig
)
838 + (n
+ n
/8) * sizeof(struct xtcpcb
);
842 if (req
->newptr
!= 0)
846 * OK, now we're committed to doing something.
849 gencnt
= tcbinfo
.ipi_gencnt
;
850 n
= tcbinfo
.ipi_count
;
853 xig
.xig_len
= sizeof xig
;
855 xig
.xig_gen
= gencnt
;
856 xig
.xig_sogen
= so_gencnt
;
857 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
861 * We are done if there is no pcb
866 inp_list
= _MALLOC(n
* sizeof *inp_list
, M_TEMP
, M_WAITOK
);
871 for (inp
= tcbinfo
.listhead
->lh_first
, i
= 0; inp
&& i
< n
;
872 inp
= inp
->inp_list
.le_next
) {
873 if (inp
->inp_gencnt
<= gencnt
)
880 for (i
= 0; i
< n
; i
++) {
882 if (inp
->inp_gencnt
<= gencnt
) {
884 xt
.xt_len
= sizeof xt
;
885 /* XXX should avoid extra copy */
886 bcopy(inp
, &xt
.xt_inp
, sizeof *inp
);
887 bcopy(inp
->inp_ppcb
, &xt
.xt_tp
, sizeof xt
.xt_tp
);
889 sotoxsocket(inp
->inp_socket
, &xt
.xt_socket
);
890 error
= SYSCTL_OUT(req
, &xt
, sizeof xt
);
895 * Give the user an updated idea of our state.
896 * If the generation differs from what we told
897 * her before, she knows that something happened
898 * while we were processing this request, and it
899 * might be necessary to retry.
902 xig
.xig_gen
= tcbinfo
.ipi_gencnt
;
903 xig
.xig_sogen
= so_gencnt
;
904 xig
.xig_count
= tcbinfo
.ipi_count
;
906 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
908 FREE(inp_list
, M_TEMP
);
913 SYSCTL_PROC(_net_inet_tcp
, TCPCTL_PCBLIST
, pcblist
, CTLFLAG_RD
, 0, 0,
914 tcp_pcblist
, "S,xtcpcb", "List of active TCP connections");
917 tcp_ctlinput(cmd
, sa
, vip
)
922 register struct ip
*ip
= vip
;
923 register struct tcphdr
*th
;
924 void (*notify
) __P((struct inpcb
*, int)) = tcp_notify
;
926 if (cmd
== PRC_QUENCH
)
928 else if (cmd
== PRC_MSGSIZE
)
929 notify
= tcp_mtudisc
;
930 else if (!PRC_IS_REDIRECT(cmd
) &&
931 ((unsigned)cmd
> PRC_NCMDS
|| inetctlerrmap
[cmd
] == 0))
934 th
= (struct tcphdr
*)((caddr_t
)ip
935 + (IP_VHL_HL(ip
->ip_vhl
) << 2));
936 in_pcbnotify(&tcb
, sa
, th
->th_dport
, ip
->ip_src
, th
->th_sport
,
939 in_pcbnotify(&tcb
, sa
, 0, zeroin_addr
, 0, cmd
, notify
);
944 tcp6_ctlinput(cmd
, sa
, d
)
949 register struct tcphdr
*thp
;
951 void (*notify
) __P((struct inpcb
*, int)) = tcp_notify
;
952 struct sockaddr_in6 sa6
;
957 if (sa
->sa_family
!= AF_INET6
||
958 sa
->sa_len
!= sizeof(struct sockaddr_in6
))
961 if (cmd
== PRC_QUENCH
)
963 else if (cmd
== PRC_MSGSIZE
)
964 notify
= tcp_mtudisc
;
965 else if (!PRC_IS_REDIRECT(cmd
) &&
966 ((unsigned)cmd
> PRC_NCMDS
|| inet6ctlerrmap
[cmd
] == 0))
969 /* if the parameter is from icmp6, decode it. */
971 struct ip6ctlparam
*ip6cp
= (struct ip6ctlparam
*)d
;
973 ip6
= ip6cp
->ip6c_ip6
;
974 off
= ip6cp
->ip6c_off
;
980 /* translate addresses into internal form */
981 sa6
= *(struct sockaddr_in6
*)sa
;
982 if (IN6_IS_ADDR_LINKLOCAL(&sa6
.sin6_addr
) && m
&& m
->m_pkthdr
.rcvif
)
983 sa6
.sin6_addr
.s6_addr16
[1] = htons(m
->m_pkthdr
.rcvif
->if_index
);
987 * XXX: We assume that when IPV6 is non NULL,
988 * M and OFF are valid.
992 /* translate addresses into internal form */
993 memcpy(&s
, &ip6
->ip6_src
, sizeof(s
));
994 if (IN6_IS_ADDR_LINKLOCAL(&s
))
995 s
.s6_addr16
[1] = htons(m
->m_pkthdr
.rcvif
->if_index
);
998 if (m
->m_len
< off
+ sizeof(*thp
)) {
1000 * this should be rare case,
1001 * so we compromise on this copy...
1003 m_copydata(m
, off
, sizeof(th
), (caddr_t
)&th
);
1006 thp
= (struct tcphdr
*)(mtod(m
, caddr_t
) + off
);
1007 in6_pcbnotify(&tcb
, (struct sockaddr
*)&sa6
, thp
->th_dport
,
1008 &s
, thp
->th_sport
, cmd
, notify
);
1010 in6_pcbnotify(&tcb
, (struct sockaddr
*)&sa6
, 0, &zeroin6_addr
,
1015 #define TCP_RNDISS_ROUNDS 16
1016 #define TCP_RNDISS_OUT 7200
1017 #define TCP_RNDISS_MAX 30000
1019 u_int8_t tcp_rndiss_sbox
[128];
1020 u_int16_t tcp_rndiss_msb
;
1021 u_int16_t tcp_rndiss_cnt
;
1022 long tcp_rndiss_reseed
;
1025 tcp_rndiss_encrypt(val
)
1028 u_int16_t sum
= 0, i
;
1030 for (i
= 0; i
< TCP_RNDISS_ROUNDS
; i
++) {
1032 val
^= ((u_int16_t
)tcp_rndiss_sbox
[(val
^sum
) & 0x7f]) << 7;
1033 val
= ((val
& 0xff) << 7) | (val
>> 8);
1042 struct timeval time
;
1044 getmicrotime(&time
);
1045 read_random(tcp_rndiss_sbox
, sizeof(tcp_rndiss_sbox
));
1047 tcp_rndiss_reseed
= time
.tv_sec
+ TCP_RNDISS_OUT
;
1048 tcp_rndiss_msb
= tcp_rndiss_msb
== 0x8000 ? 0 : 0x8000;
1056 struct timeval time
;
1058 getmicrotime(&time
);
1060 if (tcp_rndiss_cnt
>= TCP_RNDISS_MAX
||
1061 time
.tv_sec
> tcp_rndiss_reseed
)
1066 /* (tmp & 0x7fff) ensures a 32768 byte gap between ISS */
1067 return ((tcp_rndiss_encrypt(tcp_rndiss_cnt
++) | tcp_rndiss_msb
) <<16) |
1073 * When a source quench is received, close congestion window
1074 * to one segment. We will gradually open it again as we proceed.
1077 tcp_quench(inp
, errno
)
1081 struct tcpcb
*tp
= intotcpcb(inp
);
1084 tp
->snd_cwnd
= tp
->t_maxseg
;
1088 * When `need fragmentation' ICMP is received, update our idea of the MSS
1089 * based on the new value in the route. Also nudge TCP to send something,
1090 * since we know the packet we just sent was dropped.
1091 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1094 tcp_mtudisc(inp
, errno
)
1098 struct tcpcb
*tp
= intotcpcb(inp
);
1100 struct rmxp_tao
*taop
;
1101 struct socket
*so
= inp
->inp_socket
;
1105 int isipv6
= (tp
->t_inpcb
->inp_vflag
& INP_IPV4
) == 0;
1111 rt
= tcp_rtlookup6(inp
);
1114 rt
= tcp_rtlookup(inp
);
1115 if (!rt
|| !rt
->rt_rmx
.rmx_mtu
) {
1116 tp
->t_maxopd
= tp
->t_maxseg
=
1118 isipv6
? tcp_v6mssdflt
:
1123 taop
= rmx_taop(rt
->rt_rmx
);
1124 offered
= taop
->tao_mssopt
;
1125 mss
= rt
->rt_rmx
.rmx_mtu
-
1128 sizeof(struct tcpip6hdr
) :
1130 sizeof(struct tcpiphdr
)
1137 mss
= min(mss
, offered
);
1139 * XXX - The above conditional probably violates the TCP
1140 * spec. The problem is that, since we don't know the
1141 * other end's MSS, we are supposed to use a conservative
1142 * default. But, if we do that, then MTU discovery will
1143 * never actually take place, because the conservative
1144 * default is much less than the MTUs typically seen
1145 * on the Internet today. For the moment, we'll sweep
1146 * this under the carpet.
1148 * The conservative default might not actually be a problem
1149 * if the only case this occurs is when sending an initial
1150 * SYN with options and data to a host we've never talked
1151 * to before. Then, they will reply with an MSS value which
1152 * will get recorded and the new parameters should get
1153 * recomputed. For Further Study.
1155 if (tp
->t_maxopd
<= mss
)
1159 if ((tp
->t_flags
& (TF_REQ_TSTMP
|TF_NOOPT
)) == TF_REQ_TSTMP
&&
1160 (tp
->t_flags
& TF_RCVD_TSTMP
) == TF_RCVD_TSTMP
)
1161 mss
-= TCPOLEN_TSTAMP_APPA
;
1162 if ((tp
->t_flags
& (TF_REQ_CC
|TF_NOOPT
)) == TF_REQ_CC
&&
1163 (tp
->t_flags
& TF_RCVD_CC
) == TF_RCVD_CC
)
1164 mss
-= TCPOLEN_CC_APPA
;
1165 #if (MCLBYTES & (MCLBYTES - 1)) == 0
1167 mss
&= ~(MCLBYTES
-1);
1170 mss
= mss
/ MCLBYTES
* MCLBYTES
;
1172 if (so
->so_snd
.sb_hiwat
< mss
)
1173 mss
= so
->so_snd
.sb_hiwat
;
1177 tcpstat
.tcps_mturesent
++;
1179 tp
->snd_nxt
= tp
->snd_una
;
1185 * Look-up the routing entry to the peer of this inpcb. If no route
1186 * is found and it cannot be allocated the return NULL. This routine
1187 * is called by TCP routines that access the rmx structure and by tcp_mss
1188 * to get the interface MTU.
1197 ro
= &inp
->inp_route
;
1201 if (rt
== NULL
|| !(rt
->rt_flags
& RTF_UP
)) {
1202 /* No route yet, so try to acquire one */
1203 if (inp
->inp_faddr
.s_addr
!= INADDR_ANY
) {
1204 ro
->ro_dst
.sa_family
= AF_INET
;
1205 ro
->ro_dst
.sa_len
= sizeof(ro
->ro_dst
);
1206 ((struct sockaddr_in
*) &ro
->ro_dst
)->sin_addr
=
1220 struct route_in6
*ro6
;
1223 ro6
= &inp
->in6p_route
;
1225 if (rt
== NULL
|| !(rt
->rt_flags
& RTF_UP
)) {
1226 /* No route yet, so try to acquire one */
1227 if (!IN6_IS_ADDR_UNSPECIFIED(&inp
->in6p_faddr
)) {
1228 ro6
->ro_dst
.sin6_family
= AF_INET6
;
1229 ro6
->ro_dst
.sin6_len
= sizeof(ro6
->ro_dst
);
1230 ro6
->ro_dst
.sin6_addr
= inp
->in6p_faddr
;
1231 rtalloc((struct route
*)ro6
);
1240 /* compute ESP/AH header size for TCP, including outer IP header. */
1242 ipsec_hdrsiz_tcp(tp
, isipv6
)
1253 struct ip6_hdr
*ip6
= NULL
;
1257 if (!tp
|| !tp
->t_template
|| !(inp
= tp
->t_inpcb
))
1259 MGETHDR(m
, M_DONTWAIT
, MT_DATA
);
1265 ip6
= mtod(m
, struct ip6_hdr
*);
1266 th
= (struct tcphdr
*)(ip6
+ 1);
1267 m
->m_pkthdr
.len
= m
->m_len
= sizeof(struct tcpip6hdr
);
1268 bcopy((caddr_t
)&tp
->t_template
->tt_i6
, (caddr_t
)ip6
,
1269 sizeof(struct ip6_hdr
));
1270 bcopy((caddr_t
)&tp
->t_template
->tt_t
, (caddr_t
)th
,
1271 sizeof(struct tcphdr
));
1274 ip
= mtod(m
, struct ip
*);
1275 th
= (struct tcphdr
*)(ip
+ 1);
1276 m
->m_pkthdr
.len
= m
->m_len
= sizeof(struct tcpiphdr
);
1277 bcopy((caddr_t
)&tp
->t_template
->tt_i
, (caddr_t
)ip
, sizeof(struct ip
));
1278 bcopy((caddr_t
)&tp
->t_template
->tt_t
, (caddr_t
)th
,
1279 sizeof(struct tcphdr
));
1286 hdrsiz
= ipsec6_hdrsiz(m
, IPSEC_DIR_OUTBOUND
, inp
);
1289 hdrsiz
= ipsec4_hdrsiz(m
, IPSEC_DIR_OUTBOUND
, inp
);
1297 * Return a pointer to the cached information about the remote host.
1298 * The cached information is stored in the protocol specific part of
1299 * the route metrics.
1302 tcp_gettaocache(inp
)
1306 int isipv6
= (inp
->inp_vflag
& INP_IPV4
) == 0;
1312 rt
= tcp_rtlookup6(inp
);
1315 rt
= tcp_rtlookup(inp
);
1317 /* Make sure this is a host route and is up. */
1319 (rt
->rt_flags
& (RTF_UP
|RTF_HOST
)) != (RTF_UP
|RTF_HOST
))
1322 return rmx_taop(rt
->rt_rmx
);
1326 * Clear all the TAO cache entries, called from tcp_init.
1329 * This routine is just an empty one, because we assume that the routing
1330 * routing tables are initialized at the same time when TCP, so there is
1331 * nothing in the cache left over.