]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet6/in6_rmx.c
3a5eba5ad80c61e45da959db62c670d06376b19d
[apple/xnu.git] / bsd / netinet6 / in6_rmx.c
1 /* $KAME: in6_rmx.c,v 1.6 2000/03/25 07:23:45 sumikawa Exp $ */
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright 1994, 1995 Massachusetts Institute of Technology
34 *
35 * Permission to use, copy, modify, and distribute this software and
36 * its documentation for any purpose and without fee is hereby
37 * granted, provided that both the above copyright notice and this
38 * permission notice appear in all copies, that both the above
39 * copyright notice and this permission notice appear in all
40 * supporting documentation, and that the name of M.I.T. not be used
41 * in advertising or publicity pertaining to distribution of the
42 * software without specific, written prior permission. M.I.T. makes
43 * no representations about the suitability of this software for any
44 * purpose. It is provided "as is" without express or implied
45 * warranty.
46 *
47 * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
48 * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
49 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
50 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
51 * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
52 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
53 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
54 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
55 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
56 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
57 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 */
61
62 /*
63 * This code does two things necessary for the enhanced TCP metrics to
64 * function in a useful manner:
65 * 1) It marks all non-host routes as `cloning', thus ensuring that
66 * every actual reference to such a route actually gets turned
67 * into a reference to a host route to the specific destination
68 * requested.
69 * 2) When such routes lose all their references, it arranges for them
70 * to be deleted in some random collection of circumstances, so that
71 * a large quantity of stale routing data is not kept in kernel memory
72 * indefinitely. See in6_rtqtimo() below for the exact mechanism.
73 */
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/kernel.h>
78 #include <sys/sysctl.h>
79 #include <kern/queue.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/mbuf.h>
83 #include <sys/syslog.h>
84
85 #include <net/if.h>
86 #include <net/route.h>
87 #include <netinet/in.h>
88 #if defined(__APPLE__)
89 #include <netinet/ip_var.h>
90 #endif
91 #include <netinet/in_var.h>
92
93 #include <netinet/ip6.h>
94 #include <netinet6/ip6_var.h>
95
96 #include <netinet/icmp6.h>
97
98 #if !defined(__APPLE__)
99 #include <netinet6/tcp6.h>
100 #include <netinet6/tcp6_seq.h>
101 #include <netinet6/tcp6_timer.h>
102 #include <netinet6/tcp6_var.h>
103 #else
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_seq.h>
106 #include <netinet/tcp_timer.h>
107 #include <netinet/tcp_var.h>
108 #endif
109
110 #if !defined(__APPLE__)
111 #define tcp_sendspace tcp6_sendspace
112 #define tcp_recvspace tcp6_recvspace
113 #define time_second time.tv_sec
114 #define tvtohz hzto
115 #endif
116
117 extern int in6_inithead __P((void **head, int off));
118
119 #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */
120
121 /*
122 * Do what we need to do when inserting a route.
123 */
124 static struct radix_node *
125 in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
126 struct radix_node *treenodes)
127 {
128 struct rtentry *rt = (struct rtentry *)treenodes;
129 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt);
130 struct radix_node *ret;
131
132 /*
133 * For IPv6, all unicast non-host routes are automatically cloning.
134 */
135 if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
136 rt->rt_flags |= RTF_MULTICAST;
137
138 if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
139 rt->rt_flags |= RTF_PRCLONING;
140 }
141
142 /*
143 * A little bit of help for both IPv6 output and input:
144 * For local addresses, we make sure that RTF_LOCAL is set,
145 * with the thought that this might one day be used to speed up
146 * ip_input().
147 *
148 * We also mark routes to multicast addresses as such, because
149 * it's easy to do and might be useful (but this is much more
150 * dubious since it's so easy to inspect the address). (This
151 * is done above.)
152 *
153 * XXX
154 * should elaborate the code.
155 */
156 if (rt->rt_flags & RTF_HOST) {
157 if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr)
158 ->sin6_addr,
159 &sin6->sin6_addr)) {
160 rt->rt_flags |= RTF_LOCAL;
161 }
162 }
163
164 /*
165 * We also specify a send and receive pipe size for every
166 * route added, to help TCP a bit. TCP doesn't actually
167 * want a true pipe size, which would be prohibitive in memory
168 * costs and is hard to compute anyway; it simply uses these
169 * values to size its buffers. So, we fill them in with the
170 * same values that TCP would have used anyway, and allow the
171 * installing program or the link layer to override these values
172 * as it sees fit. This will hopefully allow TCP more
173 * opportunities to save its ssthresh value.
174 */
175 if (!rt->rt_rmx.rmx_sendpipe && !(rt->rt_rmx.rmx_locks & RTV_SPIPE))
176 rt->rt_rmx.rmx_sendpipe = tcp_sendspace;
177
178 if (!rt->rt_rmx.rmx_recvpipe && !(rt->rt_rmx.rmx_locks & RTV_RPIPE))
179 rt->rt_rmx.rmx_recvpipe = tcp_recvspace;
180
181 if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
182 && rt->rt_ifp)
183 rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
184
185 ret = rn_addroute(v_arg, n_arg, head, treenodes);
186 if (ret == NULL && rt->rt_flags & RTF_HOST) {
187 struct rtentry *rt2;
188 /*
189 * We are trying to add a host route, but can't.
190 * Find out if it is because of an
191 * ARP entry and delete it if so.
192 */
193 rt2 = rtalloc1((struct sockaddr *)sin6, 0,
194 RTF_CLONING | RTF_PRCLONING);
195 if (rt2) {
196 if (rt2->rt_flags & RTF_LLINFO &&
197 rt2->rt_flags & RTF_HOST &&
198 rt2->rt_gateway &&
199 rt2->rt_gateway->sa_family == AF_LINK) {
200 rtrequest(RTM_DELETE,
201 (struct sockaddr *)rt_key(rt2),
202 rt2->rt_gateway,
203 rt_mask(rt2), rt2->rt_flags, 0);
204 ret = rn_addroute(v_arg, n_arg, head,
205 treenodes);
206 }
207 RTFREE(rt2);
208 }
209 } else if (ret == NULL && rt->rt_flags & RTF_CLONING) {
210 struct rtentry *rt2;
211 /*
212 * We are trying to add a net route, but can't.
213 * The following case should be allowed, so we'll make a
214 * special check for this:
215 * Two IPv6 addresses with the same prefix is assigned
216 * to a single interrface.
217 * # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1)
218 * # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2)
219 * In this case, (*1) and (*2) want to add the same
220 * net route entry, 3ffe:0501:: -> if0.
221 * This case should not raise an error.
222 */
223 rt2 = rtalloc1((struct sockaddr *)sin6, 0,
224 RTF_CLONING | RTF_PRCLONING);
225 if (rt2) {
226 if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY))
227 == RTF_CLONING
228 && rt2->rt_gateway
229 && rt2->rt_gateway->sa_family == AF_LINK
230 && rt2->rt_ifp == rt->rt_ifp) {
231 ret = rt2->rt_nodes;
232 }
233 RTFREE(rt2);
234 }
235 }
236 return ret;
237 }
238
239 /*
240 * This code is the inverse of in6_clsroute: on first reference, if we
241 * were managing the route, stop doing so and set the expiration timer
242 * back off again.
243 */
244 static struct radix_node *
245 in6_matroute(void *v_arg, struct radix_node_head *head)
246 {
247 struct radix_node *rn = rn_match(v_arg, head);
248 struct rtentry *rt = (struct rtentry *)rn;
249
250 if (rt && rt->rt_refcnt == 0) { /* this is first reference */
251 if (rt->rt_flags & RTPRF_OURS) {
252 rt->rt_flags &= ~RTPRF_OURS;
253 rt->rt_rmx.rmx_expire = 0;
254 }
255 }
256 return rn;
257 }
258
/* Lifetime, in seconds, given to an unreferenced route: one hour is ``really old''. */
static int rtq_reallyold = 60 * 60;

/* The automatic adjustment never cranks rtq_reallyold down to less than this. */
static int rtq_minreallyold = 10;

/* 128 cached routes is ``too many''. */
static int rtq_toomany = 128;
267
268
269 /*
270 * On last reference drop, mark the route as belong to us so that it can be
271 * timed out.
272 */
273 static void
274 in6_clsroute(struct radix_node *rn, struct radix_node_head *head)
275 {
276 struct rtentry *rt = (struct rtentry *)rn;
277
278 if (!(rt->rt_flags & RTF_UP))
279 return; /* prophylactic measures */
280
281 if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
282 return;
283
284 if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS))
285 != RTF_WASCLONED)
286 return;
287
288 /*
289 * As requested by David Greenman:
290 * If rtq_reallyold is 0, just delete the route without
291 * waiting for a timeout cycle to kill it.
292 */
293 if (rtq_reallyold != 0) {
294 rt->rt_flags |= RTPRF_OURS;
295 rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
296 } else {
297 rtrequest(RTM_DELETE,
298 (struct sockaddr *)rt_key(rt),
299 rt->rt_gateway, rt_mask(rt),
300 rt->rt_flags, 0);
301 }
302 }
303
/*
 * State shared between in6_rtqkill() and the code that starts the tree walk.
 */
struct rtqk_arg {
	struct radix_node_head *rnh;	/* tree being walked */
	int mode;			/* not referenced by the code visible here */
	int updating;			/* nonzero: clamp expiry to rtq_reallyold */
	int draining;			/* nonzero: delete even if not yet expired */
	int killed;			/* number of routes deleted */
	int found;			/* number of RTPRF_OURS routes seen */
	time_t nextstop;		/* earliest expiry among surviving routes */
};
313
314 /*
315 * Get rid of old routes. When draining, this deletes everything, even when
316 * the timeout is not expired yet. When updating, this makes sure that
317 * nothing has a timeout longer than the current value of rtq_reallyold.
318 */
319 static int
320 in6_rtqkill(struct radix_node *rn, void *rock)
321 {
322 struct rtqk_arg *ap = rock;
323 struct rtentry *rt = (struct rtentry *)rn;
324 int err;
325
326 if (rt->rt_flags & RTPRF_OURS) {
327 ap->found++;
328
329 if (ap->draining || rt->rt_rmx.rmx_expire <= time_second) {
330 if (rt->rt_refcnt > 0)
331 panic("rtqkill route really not free");
332
333 err = rtrequest(RTM_DELETE,
334 (struct sockaddr *)rt_key(rt),
335 rt->rt_gateway, rt_mask(rt),
336 rt->rt_flags, 0);
337 if (err) {
338 log(LOG_WARNING, "in6_rtqkill: error %d", err);
339 } else {
340 ap->killed++;
341 }
342 } else {
343 if (ap->updating
344 && (rt->rt_rmx.rmx_expire - time_second
345 > rtq_reallyold)) {
346 rt->rt_rmx.rmx_expire = time_second
347 + rtq_reallyold;
348 }
349 ap->nextstop = lmin(ap->nextstop,
350 rt->rt_rmx.rmx_expire);
351 }
352 }
353
354 return 0;
355 }
356
/*
 * Run the route-aging pass no less than once every ten minutes (value in
 * seconds).  Parenthesized so the macro expands safely inside larger
 * expressions.
 */
#define RTQ_TIMEOUT	(60 * 10)
static int rtq_timeout = RTQ_TIMEOUT;
359
/* Defined below; declared here so the wrapper does not rely on an implicit declaration. */
static void in6_rtqtimo(void *rock);

/*
 * Timer entry point for route aging.  On __APPLE__ builds the call to
 * in6_rtqtimo() is bracketed by thread_funnel_set(network_flock, TRUE) and
 * thread_funnel_set(network_flock, FALSE); on other builds it is a plain
 * call, so the periodic timer keeps running there as well.
 *
 * rock	the radix_node_head passed through timeout()
 */
static void
in6_rtqtimo_funneled(void *rock)
{
#ifdef __APPLE__
	(void) thread_funnel_set(network_flock, TRUE);
#endif
	in6_rtqtimo(rock);
#ifdef __APPLE__
	(void) thread_funnel_set(network_flock, FALSE);
#endif
}
372
373 static void
374 in6_rtqtimo(void *rock)
375 {
376 struct radix_node_head *rnh = rock;
377 struct rtqk_arg arg;
378 struct timeval atv;
379 static time_t last_adjusted_timeout = 0;
380 int s;
381
382 arg.found = arg.killed = 0;
383 arg.rnh = rnh;
384 arg.nextstop = time_second + rtq_timeout;
385 arg.draining = arg.updating = 0;
386 s = splnet();
387 rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
388 splx(s);
389
390 /*
391 * Attempt to be somewhat dynamic about this:
392 * If there are ``too many'' routes sitting around taking up space,
393 * then crank down the timeout, and see if we can't make some more
394 * go away. However, we make sure that we will never adjust more
395 * than once in rtq_timeout seconds, to keep from cranking down too
396 * hard.
397 */
398 if ((arg.found - arg.killed > rtq_toomany)
399 && (time_second - last_adjusted_timeout >= rtq_timeout)
400 && rtq_reallyold > rtq_minreallyold) {
401 rtq_reallyold = 2*rtq_reallyold / 3;
402 if (rtq_reallyold < rtq_minreallyold) {
403 rtq_reallyold = rtq_minreallyold;
404 }
405
406 last_adjusted_timeout = time_second;
407 #if DIAGNOSTIC
408 log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d",
409 rtq_reallyold);
410 #endif
411 arg.found = arg.killed = 0;
412 arg.updating = 1;
413 s = splnet();
414 rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
415 splx(s);
416 }
417
418 atv.tv_usec = 0;
419 atv.tv_sec = arg.nextstop;
420 timeout(in6_rtqtimo_funneled, rock, tvtohz(&atv));
421 }
422
/*
 * Age old PMTUs.
 *
 * Walker state for in6_mtuexpire().
 */
struct mtuex_arg {
	struct radix_node_head *rnh;	/* tree being walked */
	time_t nextstop;		/* earliest expiry still pending */
};
430
431 static int
432 in6_mtuexpire(struct radix_node *rn, void *rock)
433 {
434 struct rtentry *rt = (struct rtentry *)rn;
435 struct mtuex_arg *ap = rock;
436
437 /* sanity */
438 if (!rt)
439 panic("rt == NULL in in6_mtuexpire");
440
441 if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) {
442 if (rt->rt_rmx.rmx_expire <= time_second) {
443 rt->rt_flags |= RTF_PROBEMTU;
444 } else {
445 ap->nextstop = lmin(ap->nextstop,
446 rt->rt_rmx.rmx_expire);
447 }
448 }
449
450 return 0;
451 }
452
/* Default interval, in seconds, until the next PMTU aging pass. */
#define MTUTIMO_DEFAULT	(60 * 1)
454
/* Defined below; declared here so the wrapper does not rely on an implicit declaration. */
static void in6_mtutimo(void *rock);

/*
 * Timer entry point for PMTU aging.  On __APPLE__ builds the call to
 * in6_mtutimo() is bracketed by thread_funnel_set(network_flock, TRUE) and
 * thread_funnel_set(network_flock, FALSE); on other builds it is a plain
 * call, so the periodic timer keeps running there as well.
 *
 * rock	the radix_node_head passed through timeout()
 */
static void
in6_mtutimo_funneled(void *rock)
{
#ifdef __APPLE__
	(void) thread_funnel_set(network_flock, TRUE);
#endif
	in6_mtutimo(rock);
#ifdef __APPLE__
	(void) thread_funnel_set(network_flock, FALSE);
#endif
}
467
468 static void
469 in6_mtutimo(void *rock)
470 {
471 struct radix_node_head *rnh = rock;
472 struct mtuex_arg arg;
473 struct timeval atv;
474 int s;
475
476 arg.rnh = rnh;
477 arg.nextstop = time_second + MTUTIMO_DEFAULT;
478 s = splnet();
479 rnh->rnh_walktree(rnh, in6_mtuexpire, &arg);
480 splx(s);
481
482 atv.tv_usec = 0;
483 atv.tv_sec = arg.nextstop;
484 if (atv.tv_sec < time_second) {
485 printf("invalid mtu expiration time on routing table\n");
486 arg.nextstop = time_second + 30; /*last resort*/
487 }
488 timeout(in6_mtutimo_funneled, rock, tvtohz(&atv));
489 }
490
#if 0
/*
 * Currently compiled out.  Deletes every route we manage, expired or not,
 * by walking the AF_INET6 tree with in6_rtqkill() in draining mode.
 */
void
in6_rtqdrain()
{
	struct radix_node_head *rnh = rt_tables[AF_INET6];
	struct rtqk_arg arg;
	int s;

	arg.rnh = rnh;
	arg.found = arg.killed = 0;
	arg.nextstop = 0;
	arg.updating = 0;
	arg.draining = 1;

	s = splnet();
	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
	splx(s);
}
#endif
508
509 /*
510 * Initialize our routing tree.
511 */
512 int
513 in6_inithead(void **head, int off)
514 {
515 struct radix_node_head *rnh;
516
517 if (!rn_inithead(head, off))
518 return 0;
519
520 if (head != (void **)&rt_tables[AF_INET6]) /* BOGUS! */
521 return 1; /* only do this for the real routing table */
522
523 rnh = *head;
524 rnh->rnh_addaddr = in6_addroute;
525 rnh->rnh_matchaddr = in6_matroute;
526 rnh->rnh_close = in6_clsroute;
527 in6_rtqtimo(rnh); /* kick off timeout first time */
528 in6_mtutimo(rnh); /* kick off timeout first time */
529 return 1;
530 }