[apple/xnu.git] / bsd / netinet6 / in6_rmx.c

/*	$KAME: in6_rmx.c,v 1.6 2000/03/25 07:23:45 sumikawa Exp $	*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright 1994, 1995 Massachusetts Institute of Technology
 *
 * Permission to use, copy, modify, and distribute this software and
 * its documentation for any purpose and without fee is hereby
 * granted, provided that both the above copyright notice and this
 * permission notice appear in all copies, that both the above
 * copyright notice and this permission notice appear in all
 * supporting documentation, and that the name of M.I.T. not be used
 * in advertising or publicity pertaining to distribution of the
 * software without specific, written prior permission.  M.I.T. makes
 * no representations about the suitability of this software for any
 * purpose.  It is provided "as is" without express or implied
 * warranty.
 *
 * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
 * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
 * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * This code does two things necessary for the enhanced TCP metrics to
 * function in a useful manner:
 *  1) It marks all non-host routes as `cloning', thus ensuring that
 *     every actual reference to such a route actually gets turned
 *     into a reference to a host route to the specific destination
 *     requested.
 *  2) When such routes lose all their references, it arranges for them
 *     to be deleted in some random collection of circumstances, so that
 *     a large quantity of stale routing data is not kept in kernel memory
 *     indefinitely.  See in6_rtqtimo() below for the exact mechanism.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <kern/queue.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>

#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#if defined(__APPLE__)
#include <netinet/ip_var.h>
#endif
#include <netinet/in_var.h>

#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>

#include <netinet/icmp6.h>

#if !defined(__APPLE__)
#include <netinet6/tcp6.h>
#include <netinet6/tcp6_seq.h>
#include <netinet6/tcp6_timer.h>
#include <netinet6/tcp6_var.h>
#else
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#endif

/*
 * Outside of __APPLE__ builds, alias the names used in this file to
 * tcp6_sendspace, tcp6_recvspace, time.tv_sec and hzto respectively.
 */
#if !defined(__APPLE__)
#define tcp_sendspace tcp6_sendspace
#define tcp_recvspace tcp6_recvspace
#define time_second time.tv_sec
#define tvtohz hzto
#endif

/* Prototype for the routing-tree initializer defined at the end of this file. */
extern int	in6_inithead __P((void **head, int off));

/*
 * Flag placed on a route by in6_clsroute() and cleared by in6_matroute();
 * in6_rtqkill() only considers routes that carry it.
 */
#define RTPRF_OURS		RTF_PROTO3	/* set on routes we manage */

/*
 * Do what we need to do when inserting a route.
 */
static struct radix_node *
in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
	    struct radix_node *treenodes)
{
	struct rtentry *rt = (struct rtentry *)treenodes;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt);
	struct radix_node *ret;

	/*
	 * For IPv6, all unicast non-host routes are automatically cloning.
	 */
	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
		rt->rt_flags |= RTF_MULTICAST;

	if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
		rt->rt_flags |= RTF_PRCLONING;
	}

	/*
	 * A little bit of help for both IPv6 output and input:
	 *   For local addresses, we make sure that RTF_LOCAL is set,
	 *   with the thought that this might one day be used to speed up
	 *   ip_input().
	 *
	 * We also mark routes to multicast addresses as such, because
	 * it's easy to do and might be useful (but this is much more
	 * dubious since it's so easy to inspect the address).  (This
	 * is done above.)
	 *
	 * XXX
	 * should elaborate the code.
	 */
	if (rt->rt_flags & RTF_HOST) {
		if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr)
					->sin6_addr,
				       &sin6->sin6_addr)) {
			rt->rt_flags |= RTF_LOCAL;
		}
	}

	/*
	 * We also specify a send and receive pipe size for every
	 * route added, to help TCP a bit.  TCP doesn't actually
	 * want a true pipe size, which would be prohibitive in memory
	 * costs and is hard to compute anyway; it simply uses these
	 * values to size its buffers.  So, we fill them in with the
	 * same values that TCP would have used anyway, and allow the
	 * installing program or the link layer to override these values
	 * as it sees fit.  This will hopefully allow TCP more
	 * opportunities to save its ssthresh value.
	 */
	if (!rt->rt_rmx.rmx_sendpipe && !(rt->rt_rmx.rmx_locks & RTV_SPIPE))
		rt->rt_rmx.rmx_sendpipe = tcp_sendspace;

	if (!rt->rt_rmx.rmx_recvpipe && !(rt->rt_rmx.rmx_locks & RTV_RPIPE))
		rt->rt_rmx.rmx_recvpipe = tcp_recvspace;

	if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
	    && rt->rt_ifp)
		rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;

	ret = rn_addroute(v_arg, n_arg, head, treenodes);
	if (ret == NULL && rt->rt_flags & RTF_HOST) {
		struct rtentry *rt2;
		/*
		 * We are trying to add a host route, but can't.
		 * Find out if it is because of an
		 * ARP entry and delete it if so.
		 */
		rt2 = rtalloc1((struct sockaddr *)sin6, 0,
				RTF_CLONING | RTF_PRCLONING);
		if (rt2) {
			if (rt2->rt_flags & RTF_LLINFO &&
				rt2->rt_flags & RTF_HOST &&
				rt2->rt_gateway &&
				rt2->rt_gateway->sa_family == AF_LINK) {
				rtrequest(RTM_DELETE,
					  (struct sockaddr *)rt_key(rt2),
					  rt2->rt_gateway,
					  rt_mask(rt2), rt2->rt_flags, 0);
				ret = rn_addroute(v_arg, n_arg, head,
					treenodes);
			}
			RTFREE(rt2);
		}
	} else if (ret == NULL && rt->rt_flags & RTF_CLONING) {
		struct rtentry *rt2;
		/*
		 * We are trying to add a net route, but can't.
		 * The following case should be allowed, so we'll make a
		 * special check for this:
		 *	Two IPv6 addresses with the same prefix is assigned
		 *	to a single interrface.
		 *	# ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1)
		 *	# ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2)
		 *	In this case, (*1) and (*2) want to add the same
		 *	net route entry, 3ffe:0501:: -> if0.
		 *	This case should not raise an error.
		 */
		rt2 = rtalloc1((struct sockaddr *)sin6, 0,
				RTF_CLONING | RTF_PRCLONING);
		if (rt2) {
			if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY))
					== RTF_CLONING
			 && rt2->rt_gateway
			 && rt2->rt_gateway->sa_family == AF_LINK
			 && rt2->rt_ifp == rt->rt_ifp) {
				ret = rt2->rt_nodes;
			}
			RTFREE(rt2);
		}
	}
	return ret;
}

/*
 * This code is the inverse of in6_clsroute: on first reference, if we
 * were managing the route, stop doing so and set the expiration timer
 * back off again.
 */
static struct radix_node *
in6_matroute(void *v_arg, struct radix_node_head *head)
{
	struct radix_node *rn = rn_match(v_arg, head);
	struct rtentry *rt = (struct rtentry *)rn;

	if (rt && rt->rt_refcnt == 0) { /* this is first reference */
		if (rt->rt_flags & RTPRF_OURS) {
			rt->rt_flags &= ~RTPRF_OURS;
			rt->rt_rmx.rmx_expire = 0;
		}
	}
	return rn;
}

/*
 * Number of seconds added to time_second to form the expiration time of a
 * route handed over to us by in6_clsroute().  A value of 0 makes
 * in6_clsroute() delete the route immediately.  in6_rtqtimo() may shrink
 * this value when too many managed routes remain.
 */
static int rtq_reallyold = 60*60;
	/* one hour is ``really old'' */

/* Lower bound that in6_rtqtimo() enforces when it shrinks rtq_reallyold. */
static int rtq_minreallyold = 10;
	/* never automatically crank down to less */

/*
 * Threshold on (found - killed) managed routes above which in6_rtqtimo()
 * considers shrinking rtq_reallyold.
 */
static int rtq_toomany = 128;
	/* 128 cached routes is ``too many'' */
				   

/*
 * Close hook (installed as rnh_close): on last reference drop, mark the
 * route as belonging to us so that it can be timed out.
 *
 * Only routes that are up, are host routes without RTF_LLINFO, were
 * cloned (RTF_WASCLONED) and are not already flagged RTPRF_OURS are
 * affected; every other route is left untouched.
 */
static void
in6_clsroute(struct radix_node *rn, struct radix_node_head *head)
{
	struct rtentry *rt = (struct rtentry *)rn;

	if ((rt->rt_flags & RTF_UP) == 0 ||	/* prophylactic measures */
	    (rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST ||
	    (rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED)
		return;

	/*
	 * As requested by David Greenman:
	 * If rtq_reallyold is 0, just delete the route without
	 * waiting for a timeout cycle to kill it.
	 */
	if (rtq_reallyold == 0) {
		rtrequest(RTM_DELETE,
			  (struct sockaddr *)rt_key(rt),
			  rt->rt_gateway, rt_mask(rt),
			  rt->rt_flags, 0);
		return;
	}

	/* Take ownership and arm the expiration time. */
	rt->rt_flags |= RTPRF_OURS;
	rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
}

/*
 * Argument block passed through rnh_walktree() to in6_rtqkill().
 */
struct rtqk_arg {
	struct radix_node_head *rnh;	/* tree being walked (set by callers) */
	int mode;			/* not referenced by the code in this file */
	int updating;			/* nonzero: clamp expiry to rtq_reallyold */
	int draining;			/* nonzero: delete regardless of expiry */
	int killed;			/* count of routes successfully deleted */
	int found;			/* count of RTPRF_OURS routes visited */
	time_t nextstop;		/* earliest remaining expiry seen so far */
};

/*
 * Tree-walk callback that gets rid of old routes.
 *
 * Only routes flagged RTPRF_OURS are considered.  When draining, every such
 * route is deleted even if its timeout has not expired yet.  When updating,
 * a route that is kept has its expiration clamped so that it is never more
 * than rtq_reallyold seconds in the future.  The earliest expiration among
 * the kept routes is folded into ap->nextstop.
 *
 * Always returns 0.
 */
static int
in6_rtqkill(struct radix_node *rn, void *rock)
{
	struct rtentry *rt = (struct rtentry *)rn;
	struct rtqk_arg *ap = rock;
	int err;

	if (!(rt->rt_flags & RTPRF_OURS))
		return 0;

	ap->found++;

	if (!ap->draining && rt->rt_rmx.rmx_expire > time_second) {
		/* Not due yet: optionally clamp, then record the next stop. */
		if (ap->updating
		   && (rt->rt_rmx.rmx_expire - time_second
		       > rtq_reallyold)) {
			rt->rt_rmx.rmx_expire = time_second
				+ rtq_reallyold;
		}
		ap->nextstop = lmin(ap->nextstop,
				    rt->rt_rmx.rmx_expire);
		return 0;
	}

	/* A route we manage must not be referenced when it is deleted. */
	if (rt->rt_refcnt > 0)
		panic("rtqkill route really not free");

	err = rtrequest(RTM_DELETE,
			(struct sockaddr *)rt_key(rt),
			rt->rt_gateway, rt_mask(rt),
			rt->rt_flags, 0);
	if (err)
		log(LOG_WARNING, "in6_rtqkill: error %d", err);
	else
		ap->killed++;

	return 0;
}

/*
 * Default number of seconds between scans for expired routes.  The
 * expansion is parenthesized so that the macro behaves as a single value
 * when it is used inside a larger expression.
 */
#define RTQ_TIMEOUT	(60*10)	/* run no less than once every ten minutes */
static int rtq_timeout = RTQ_TIMEOUT;

/*
 * in6_rtqtimo() is defined further down; declare it here so the call
 * below refers to the static function rather than to an implicit
 * declaration.
 */
static void in6_rtqtimo(void *rock);

/*
 * Timer callback wrapper for in6_rtqtimo().
 *
 * On __APPLE__ builds this takes the network funnel, runs in6_rtqtimo()
 * with the same argument, and then drops the funnel again.  On other
 * builds the body is empty.
 */
static void
in6_rtqtimo_funneled(void *rock)
{
#ifdef __APPLE__
	(void) thread_funnel_set(network_flock, TRUE);
	in6_rtqtimo(rock);
	(void) thread_funnel_set(network_flock, FALSE);
#endif
}

/*
 * Periodic scan of the routing tree for expired routes.
 *
 * Walks the tree once with in6_rtqkill().  If more than rtq_toomany
 * managed routes survive, rtq_reallyold is reduced to two thirds of its
 * value (but not below rtq_minreallyold) and the tree is walked a second
 * time in "updating" mode; this adjustment happens at most once per
 * rtq_timeout seconds.  Finally the scan is rescheduled through
 * in6_rtqtimo_funneled().
 *
 * rock is the radix_node_head of the tree to scan.
 */
static void
in6_rtqtimo(void *rock)
{
	static time_t last_adjusted_timeout = 0;
	struct radix_node_head *rnh = rock;
	struct rtqk_arg arg;
	struct timeval atv;
	int leftover;
	int s;

	/* First pass: plain expiry, neither draining nor updating. */
	arg.rnh = rnh;
	arg.draining = 0;
	arg.updating = 0;
	arg.killed = 0;
	arg.found = 0;
	arg.nextstop = time_second + rtq_timeout;

	s = splnet();
	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
	splx(s);

	/*
	 * Attempt to be somewhat dynamic about this:
	 * If there are ``too many'' routes sitting around taking up space,
	 * then crank down the timeout, and see if we can't make some more
	 * go away.  However, we make sure that we will never adjust more
	 * than once in rtq_timeout seconds, to keep from cranking down too
	 * hard.
	 */
	leftover = arg.found - arg.killed;
	if (leftover > rtq_toomany
	    && (time_second - last_adjusted_timeout >= rtq_timeout)
	    && rtq_reallyold > rtq_minreallyold) {
		rtq_reallyold = (2 * rtq_reallyold) / 3;
		if (rtq_reallyold < rtq_minreallyold)
			rtq_reallyold = rtq_minreallyold;

		last_adjusted_timeout = time_second;
#if DIAGNOSTIC
		log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d",
		    rtq_reallyold);
#endif
		/* Second pass: clamp the survivors to the new lifetime. */
		arg.updating = 1;
		arg.killed = 0;
		arg.found = 0;
		s = splnet();
		rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
		splx(s);
	}

	/*
	 * NOTE(review): tv_sec is loaded with the absolute time of the next
	 * stop; confirm that tvtohz() on this platform expects an absolute
	 * time rather than an interval.
	 */
	atv.tv_sec = arg.nextstop;
	atv.tv_usec = 0;
	timeout(in6_rtqtimo_funneled, rock, tvtohz(&atv));
}

/*
 * Age old PMTUs.
 *
 * Argument block passed through rnh_walktree() to in6_mtuexpire().
 */
struct mtuex_arg {
	struct radix_node_head *rnh;	/* tree being walked (set by in6_mtutimo) */
	time_t nextstop;		/* earliest pending expiry seen so far */
};

static int
in6_mtuexpire(struct radix_node *rn, void *rock)
{
	struct rtentry *rt = (struct rtentry *)rn;
	struct mtuex_arg *ap = rock;

	/* sanity */
	if (!rt)
		panic("rt == NULL in in6_mtuexpire");

	if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) {
		if (rt->rt_rmx.rmx_expire <= time_second) {
			rt->rt_flags |= RTF_PROBEMTU;
		} else {
			ap->nextstop = lmin(ap->nextstop,
					rt->rt_rmx.rmx_expire);
		}
	}

	return 0;
}

/* Seconds added to time_second as the default next stop in in6_mtutimo(). */
#define	MTUTIMO_DEFAULT	(60*1)

/*
 * in6_mtutimo() is defined further down; declare it here so the call
 * below refers to the static function rather than to an implicit
 * declaration.
 */
static void in6_mtutimo(void *rock);

/*
 * Timer callback wrapper for in6_mtutimo().
 *
 * On __APPLE__ builds this takes the network funnel, runs in6_mtutimo()
 * with the same argument, and then drops the funnel again.  On other
 * builds the body is empty.
 */
static void
in6_mtutimo_funneled(void *rock)
{
#ifdef __APPLE__
	(void) thread_funnel_set(network_flock, TRUE);
	in6_mtutimo(rock);
	(void) thread_funnel_set(network_flock, FALSE);
#endif
}

/*
 * Periodic scan that ages path-MTU information.
 *
 * Walks the tree with in6_mtuexpire(), which flags expired routes with
 * RTF_PROBEMTU and reports the earliest pending expiration, then
 * reschedules itself through in6_mtutimo_funneled().
 *
 * rock is the radix_node_head of the tree to scan.
 */
static void
in6_mtutimo(void *rock)
{
	struct radix_node_head *rnh = rock;
	struct mtuex_arg arg;
	struct timeval atv;
	int s;

	arg.rnh = rnh;
	arg.nextstop = time_second + MTUTIMO_DEFAULT;
	s = splnet();
	rnh->rnh_walktree(rnh, in6_mtuexpire, &arg);
	splx(s);

	atv.tv_usec = 0;
	atv.tv_sec = arg.nextstop;
	if (atv.tv_sec < time_second) {
		printf("invalid mtu expiration time on routing table\n");
		arg.nextstop = time_second + 30;	/*last resort*/
		/*
		 * atv is what gets handed to tvtohz() below, so the
		 * fallback has to be stored there as well; updating only
		 * arg.nextstop would leave the stale value in use.
		 */
		atv.tv_sec = arg.nextstop;
	}
	timeout(in6_mtutimo_funneled, rock, tvtohz(&atv));
}

/*
 * Compiled out.  If enabled, this would walk the AF_INET6 routing table
 * with arg.draining set, which makes in6_rtqkill() delete every route
 * flagged RTPRF_OURS regardless of its expiration time.
 */
#if 0
void
in6_rtqdrain()
{
	struct radix_node_head *rnh = rt_tables[AF_INET6];
	struct rtqk_arg arg;
	int s;
	arg.found = arg.killed = 0;
	arg.rnh = rnh;
	arg.nextstop = 0;
	arg.draining = 1;	/* delete even routes that have not expired */
	arg.updating = 0;
	s = splnet();
	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
	splx(s);
}
#endif

/*
 * Initialize our routing tree.
 *
 * Runs the generic rn_inithead() and, only when head is the slot of the
 * real AF_INET6 routing table, installs the IPv6 add/match/close hooks
 * and starts the two periodic scans.
 *
 * Returns 0 if rn_inithead() fails, 1 otherwise.
 */
int
in6_inithead(void **head, int off)
{
	struct radix_node_head *rnh;

	if (rn_inithead(head, off) == 0)
		return 0;

	/* only do this for the real routing table */
	if (head == (void **)&rt_tables[AF_INET6]) {	/* BOGUS! */
		rnh = *head;
		rnh->rnh_addaddr = in6_addroute;
		rnh->rnh_matchaddr = in6_matroute;
		rnh->rnh_close = in6_clsroute;
		in6_rtqtimo(rnh);	/* kick off timeout first time */
		in6_mtutimo(rnh);	/* kick off timeout first time */
	}

	return 1;
}
Commit	Line	Data
1c79356b A	1	/* $KAME: in6_rmx.c,v 1.6 2000/03/25 07:23:45 sumikawa Exp $ */
	2
	3	/*
	4	* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
	5	* All rights reserved.
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	* 1. Redistributions of source code must retain the above copyright
	11	* notice, this list of conditions and the following disclaimer.
	12	* 2. Redistributions in binary form must reproduce the above copyright
	13	* notice, this list of conditions and the following disclaimer in the
	14	* documentation and/or other materials provided with the distribution.
	15	* 3. Neither the name of the project nor the names of its contributors
	16	* may be used to endorse or promote products derived from this software
	17	* without specific prior written permission.
	18	*
	19	* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
	20	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	21	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	22	* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
	23	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	24	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	25	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	26	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	27	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	28	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	29	* SUCH DAMAGE.
	30	*/
	31
	32	/*
	33	* Copyright 1994, 1995 Massachusetts Institute of Technology
	34	*
	35	* Permission to use, copy, modify, and distribute this software and
	36	* its documentation for any purpose and without fee is hereby
	37	* granted, provided that both the above copyright notice and this
	38	* permission notice appear in all copies, that both the above
	39	* copyright notice and this permission notice appear in all
	40	* supporting documentation, and that the name of M.I.T. not be used
	41	* in advertising or publicity pertaining to distribution of the
	42	* software without specific, written prior permission. M.I.T. makes
	43	* no representations about the suitability of this software for any
	44	* purpose. It is provided "as is" without express or implied
	45	* warranty.
	46	*
	47	* THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
	48	* ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
	49	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
	50	* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
	51	* SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	52	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	53	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
	54	* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	55	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	56	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	57	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	58	* SUCH DAMAGE.
	59	*
	60	*/
	61
	62	/*
	63	* This code does two things necessary for the enhanced TCP metrics to
	64	* function in a useful manner:
65	* 1) It marks all non-host routes as `cloning', thus ensuring that
66	* every actual reference to such a route actually gets turned
67	* into a reference to a host route to the specific destination
68	* requested.
69	* 2) When such routes lose all their references, it arranges for them
70	* to be deleted in some random collection of circumstances, so that
71	* a large quantity of stale routing data is not kept in kernel memory
72	* indefinitely. See in6_rtqtimo() below for the exact mechanism.
73	*/
74
75	#include <sys/param.h>
76	#include <sys/systm.h>
77	#include <sys/kernel.h>
78	#include <sys/sysctl.h>
79	#include <kern/queue.h>
80	#include <sys/socket.h>
81	#include <sys/socketvar.h>
82	#include <sys/mbuf.h>
83	#include <sys/syslog.h>
84
85	#include <net/if.h>
86	#include <net/route.h>
87	#include <netinet/in.h>
88	#if defined(__APPLE__)
89	#include <netinet/ip_var.h>
90	#endif
91	#include <netinet/in_var.h>
92
93	#include <netinet/ip6.h>
94	#include <netinet6/ip6_var.h>
95
96	#include <netinet/icmp6.h>
97
98	#if !defined(__APPLE__)
99	#include <netinet6/tcp6.h>
100	#include <netinet6/tcp6_seq.h>
101	#include <netinet6/tcp6_timer.h>
102	#include <netinet6/tcp6_var.h>
103	#else
104	#include <netinet/tcp.h>
105	#include <netinet/tcp_seq.h>
106	#include <netinet/tcp_timer.h>
107	#include <netinet/tcp_var.h>
108	#endif
109
110	#if !defined(__APPLE__)
111	#define tcp_sendspace tcp6_sendspace
112	#define tcp_recvspace tcp6_recvspace
113	#define time_second time.tv_sec
114	#define tvtohz hzto
115	#endif
116
117	extern int in6_inithead __P((void **head, int off));
118
119	#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */
120
121	/*
122	* Do what we need to do when inserting a route.
123	*/
124	static struct radix_node *
125	in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
126	struct radix_node *treenodes)
127	{
128	struct rtentry *rt = (struct rtentry *)treenodes;
129	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt);
130	struct radix_node *ret;
131
132	/*
133	* For IPv6, all unicast non-host routes are automatically cloning.
134	*/
135	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
136	rt->rt_flags \|= RTF_MULTICAST;
137
138	if (!(rt->rt_flags & (RTF_HOST \| RTF_CLONING \| RTF_MULTICAST))) {
139	rt->rt_flags \|= RTF_PRCLONING;
140	}
141
142	/*
143	* A little bit of help for both IPv6 output and input:
144	* For local addresses, we make sure that RTF_LOCAL is set,
145	* with the thought that this might one day be used to speed up
146	* ip_input().
147	*
148	* We also mark routes to multicast addresses as such, because
149	* it's easy to do and might be useful (but this is much more
150	* dubious since it's so easy to inspect the address). (This
151	* is done above.)
152	*
153	* XXX
154	* should elaborate the code.
155	*/
156	if (rt->rt_flags & RTF_HOST) {
157	if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr)
158	->sin6_addr,
159	&sin6->sin6_addr)) {
160	rt->rt_flags \|= RTF_LOCAL;
161	}
162	}
163
164	/*
165	* We also specify a send and receive pipe size for every
166	* route added, to help TCP a bit. TCP doesn't actually
167	* want a true pipe size, which would be prohibitive in memory
168	* costs and is hard to compute anyway; it simply uses these
169	* values to size its buffers. So, we fill them in with the
170	* same values that TCP would have used anyway, and allow the
171	* installing program or the link layer to override these values
172	* as it sees fit. This will hopefully allow TCP more
173	* opportunities to save its ssthresh value.
174	*/
175	if (!rt->rt_rmx.rmx_sendpipe && !(rt->rt_rmx.rmx_locks & RTV_SPIPE))
176	rt->rt_rmx.rmx_sendpipe = tcp_sendspace;
177
178	if (!rt->rt_rmx.rmx_recvpipe && !(rt->rt_rmx.rmx_locks & RTV_RPIPE))
179	rt->rt_rmx.rmx_recvpipe = tcp_recvspace;
180
181	if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
182	&& rt->rt_ifp)
183	rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
184
185	ret = rn_addroute(v_arg, n_arg, head, treenodes);
186	if (ret == NULL && rt->rt_flags & RTF_HOST) {
187	struct rtentry *rt2;
188	/*
189	* We are trying to add a host route, but can't.
190	* Find out if it is because of an
191	* ARP entry and delete it if so.
192	*/
193	rt2 = rtalloc1((struct sockaddr *)sin6, 0,
194	RTF_CLONING \| RTF_PRCLONING);
195	if (rt2) {
196	if (rt2->rt_flags & RTF_LLINFO &&
197	rt2->rt_flags & RTF_HOST &&
198	rt2->rt_gateway &&
199	rt2->rt_gateway->sa_family == AF_LINK) {
200	rtrequest(RTM_DELETE,
201	(struct sockaddr *)rt_key(rt2),
202	rt2->rt_gateway,
203	rt_mask(rt2), rt2->rt_flags, 0);
204	ret = rn_addroute(v_arg, n_arg, head,
205	treenodes);
206	}
207	RTFREE(rt2);
208	}
209	} else if (ret == NULL && rt->rt_flags & RTF_CLONING) {
210	struct rtentry *rt2;
211	/*
212	* We are trying to add a net route, but can't.
213	* The following case should be allowed, so we'll make a
214	* special check for this:
215	* Two IPv6 addresses with the same prefix is assigned
216	* to a single interrface.
217	* # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1)
218	* # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2)
219	* In this case, (*1) and (*2) want to add the same
220	* net route entry, 3ffe:0501:: -> if0.
221	* This case should not raise an error.
222	*/
223	rt2 = rtalloc1((struct sockaddr *)sin6, 0,
224	RTF_CLONING \| RTF_PRCLONING);
225	if (rt2) {
226	if ((rt2->rt_flags & (RTF_CLONING\|RTF_HOST\|RTF_GATEWAY))
227	== RTF_CLONING
228	&& rt2->rt_gateway
229	&& rt2->rt_gateway->sa_family == AF_LINK
230	&& rt2->rt_ifp == rt->rt_ifp) {
231	ret = rt2->rt_nodes;
232	}
233	RTFREE(rt2);
234	}
235	}
236	return ret;
237	}
238
239	/*
240	* This code is the inverse of in6_clsroute: on first reference, if we
241	* were managing the route, stop doing so and set the expiration timer
242	* back off again.
243	*/
244	static struct radix_node *
245	in6_matroute(void *v_arg, struct radix_node_head *head)
246	{
247	struct radix_node *rn = rn_match(v_arg, head);
248	struct rtentry *rt = (struct rtentry *)rn;
249
250	if (rt && rt->rt_refcnt == 0) { /* this is first reference */
251	if (rt->rt_flags & RTPRF_OURS) {
252	rt->rt_flags &= ~RTPRF_OURS;
253	rt->rt_rmx.rmx_expire = 0;
254	}
255	}
256	return rn;
257	}
258
259	static int rtq_reallyold = 60*60;
260	/* one hour is ``really old'' */
261
262	static int rtq_minreallyold = 10;
263	/* never automatically crank down to less */
264
265	static int rtq_toomany = 128;
266	/* 128 cached routes is ``too many'' */
267
268
269	/*
270	* On last reference drop, mark the route as belong to us so that it can be
271	* timed out.
272	*/
273	static void
274	in6_clsroute(struct radix_node *rn, struct radix_node_head *head)
275	{
276	struct rtentry *rt = (struct rtentry *)rn;
277
278	if (!(rt->rt_flags & RTF_UP))
279	return; /* prophylactic measures */
280
281	if ((rt->rt_flags & (RTF_LLINFO \| RTF_HOST)) != RTF_HOST)
282	return;
283
284	if ((rt->rt_flags & (RTF_WASCLONED \| RTPRF_OURS))
285	!= RTF_WASCLONED)
286	return;
287
288	/*
289	* As requested by David Greenman:
290	* If rtq_reallyold is 0, just delete the route without
291	* waiting for a timeout cycle to kill it.
292	*/
293	if (rtq_reallyold != 0) {
294	rt->rt_flags \|= RTPRF_OURS;
295	rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
296	} else {
297	rtrequest(RTM_DELETE,
298	(struct sockaddr *)rt_key(rt),
299	rt->rt_gateway, rt_mask(rt),
300	rt->rt_flags, 0);
301	}
302	}
303
304	struct rtqk_arg {
305	struct radix_node_head *rnh;
306	int mode;
307	int updating;
308	int draining;
309	int killed;
310	int found;
311	time_t nextstop;
312	};
313
314	/*
315	* Get rid of old routes. When draining, this deletes everything, even when
316	* the timeout is not expired yet. When updating, this makes sure that
317	* nothing has a timeout longer than the current value of rtq_reallyold.
318	*/
319	static int
320	in6_rtqkill(struct radix_node *rn, void *rock)
321	{
322	struct rtqk_arg *ap = rock;
323	struct rtentry *rt = (struct rtentry *)rn;
324	int err;
325
326	if (rt->rt_flags & RTPRF_OURS) {
327	ap->found++;
328
329	if (ap->draining \|\| rt->rt_rmx.rmx_expire <= time_second) {
330	if (rt->rt_refcnt > 0)
331	panic("rtqkill route really not free");
332
333	err = rtrequest(RTM_DELETE,
334	(struct sockaddr *)rt_key(rt),
335	rt->rt_gateway, rt_mask(rt),
336	rt->rt_flags, 0);
337	if (err) {
338	log(LOG_WARNING, "in6_rtqkill: error %d", err);
339	} else {
340	ap->killed++;
341	}
342	} else {
343	if (ap->updating
344	&& (rt->rt_rmx.rmx_expire - time_second
345	> rtq_reallyold)) {
346	rt->rt_rmx.rmx_expire = time_second
347	+ rtq_reallyold;
348	}
349	ap->nextstop = lmin(ap->nextstop,
350	rt->rt_rmx.rmx_expire);
351	}
352	}
353
354	return 0;
355	}
356
357	#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */
358	static int rtq_timeout = RTQ_TIMEOUT;
359
0b4e3aa0 A	360	static void
	361	in6_rtqtimo_funneled(void *rock)
	362	{
	363	#ifdef __APPLE__
	364	boolean_t funnel_state;
	365	funnel_state = thread_funnel_set(network_flock, TRUE);
	366	in6_rtqtimo(rock);
	367	#endif
	368	#ifdef __APPLE__
	369	(void) thread_funnel_set(network_flock, FALSE);
	370	#endif
	371	}
	372
1c79356b A	373	static void
	374	in6_rtqtimo(void *rock)
	375	{
	376	struct radix_node_head *rnh = rock;
	377	struct rtqk_arg arg;
	378	struct timeval atv;
	379	static time_t last_adjusted_timeout = 0;
	380	int s;
1c79356b A	381
	382	arg.found = arg.killed = 0;
	383	arg.rnh = rnh;
	384	arg.nextstop = time_second + rtq_timeout;
	385	arg.draining = arg.updating = 0;
	386	s = splnet();
	387	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
	388	splx(s);
	389
	390	/*
	391	* Attempt to be somewhat dynamic about this:
	392	* If there are ``too many'' routes sitting around taking up space,
	393	* then crank down the timeout, and see if we can't make some more
	394	* go away. However, we make sure that we will never adjust more
	395	* than once in rtq_timeout seconds, to keep from cranking down too
	396	* hard.
	397	*/
	398	if ((arg.found - arg.killed > rtq_toomany)
	399	&& (time_second - last_adjusted_timeout >= rtq_timeout)
	400	&& rtq_reallyold > rtq_minreallyold) {
	401	rtq_reallyold = 2*rtq_reallyold / 3;
	402	if (rtq_reallyold < rtq_minreallyold) {
	403	rtq_reallyold = rtq_minreallyold;
	404	}
	405
	406	last_adjusted_timeout = time_second;
	407	#if DIAGNOSTIC
	408	log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d",
	409	rtq_reallyold);
	410	#endif
	411	arg.found = arg.killed = 0;
	412	arg.updating = 1;
	413	s = splnet();
	414	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
	415	splx(s);
	416	}
	417
	418	atv.tv_usec = 0;
	419	atv.tv_sec = arg.nextstop;
0b4e3aa0	420	timeout(in6_rtqtimo_funneled, rock, tvtohz(&atv));
1c79356b A	421	}
	422
	423	/*
	424	* Age old PMTUs.
	425	*/
	426	struct mtuex_arg {
	427	struct radix_node_head *rnh;
	428	time_t nextstop;
	429	};
	430
	431	static int
	432	in6_mtuexpire(struct radix_node *rn, void *rock)
	433	{
	434	struct rtentry *rt = (struct rtentry *)rn;
	435	struct mtuex_arg *ap = rock;
	436
	437	/* sanity */
	438	if (!rt)
	439	panic("rt == NULL in in6_mtuexpire");
	440
	441	if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) {
	442	if (rt->rt_rmx.rmx_expire <= time_second) {
	443	rt->rt_flags \|= RTF_PROBEMTU;
	444	} else {
	445	ap->nextstop = lmin(ap->nextstop,
	446	rt->rt_rmx.rmx_expire);
	447	}
	448	}
	449
	450	return 0;
	451	}
	452
	453	#define MTUTIMO_DEFAULT (60*1)
	454
0b4e3aa0 A	455	static void
	456	in6_mtutimo_funneled(void *rock)
	457	{
	458	#ifdef __APPLE__
	459	boolean_t funnel_state;
	460	funnel_state = thread_funnel_set(network_flock, TRUE);
	461	in6_mtutimo(rock);
	462	#endif
	463	#ifdef __APPLE__
	464	(void) thread_funnel_set(network_flock, FALSE);
	465	#endif
	466	}
	467
1c79356b A	468	static void
	469	in6_mtutimo(void *rock)
	470	{
	471	struct radix_node_head *rnh = rock;
	472	struct mtuex_arg arg;
	473	struct timeval atv;
	474	int s;
1c79356b A	475
	476	arg.rnh = rnh;
	477	arg.nextstop = time_second + MTUTIMO_DEFAULT;
	478	s = splnet();
	479	rnh->rnh_walktree(rnh, in6_mtuexpire, &arg);
	480	splx(s);
	481
	482	atv.tv_usec = 0;
	483	atv.tv_sec = arg.nextstop;
	484	if (atv.tv_sec < time_second) {
	485	printf("invalid mtu expiration time on routing table\n");
	486	arg.nextstop = time_second + 30; /*last resort*/
	487	}
0b4e3aa0	488	timeout(in6_mtutimo_funneled, rock, tvtohz(&atv));
1c79356b A	489	}
	490
	491	#if 0
	492	void
	493	in6_rtqdrain()
	494	{
	495	struct radix_node_head *rnh = rt_tables[AF_INET6];
	496	struct rtqk_arg arg;
	497	int s;
	498	arg.found = arg.killed = 0;
	499	arg.rnh = rnh;
	500	arg.nextstop = 0;
	501	arg.draining = 1;
	502	arg.updating = 0;
	503	s = splnet();
	504	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
	505	splx(s);
	506	}
	507	#endif
	508
	509	/*
	510	* Initialize our routing tree.
	511	*/
	512	int
	513	in6_inithead(void **head, int off)
	514	{
	515	struct radix_node_head *rnh;
	516
	517	if (!rn_inithead(head, off))
	518	return 0;
	519
	520	if (head != (void **)&rt_tables[AF_INET6]) /* BOGUS! */
	521	return 1; /* only do this for the real routing table */
	522
	523	rnh = *head;
	524	rnh->rnh_addaddr = in6_addroute;
	525	rnh->rnh_matchaddr = in6_matroute;
	526	rnh->rnh_close = in6_clsroute;
	527	in6_rtqtimo(rnh); /* kick off timeout first time */
	528	in6_mtutimo(rnh); /* kick off timeout first time */
	529	return 1;
	530	}