bsd/netinet6/in6_rmx.c

   1 /*      $KAME: in6_rmx.c,v 1.6 2000/03/25 07:23:45 sumikawa Exp $       */
   2
   3 /*
   4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the project nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  */
  31
  32 /*
  33  * Copyright 1994, 1995 Massachusetts Institute of Technology
  34  *
  35  * Permission to use, copy, modify, and distribute this software and
  36  * its documentation for any purpose and without fee is hereby
  37  * granted, provided that both the above copyright notice and this
  38  * permission notice appear in all copies, that both the above
  39  * copyright notice and this permission notice appear in all
  40  * supporting documentation, and that the name of M.I.T. not be used
  41  * in advertising or publicity pertaining to distribution of the
  42  * software without specific, written prior permission.  M.I.T. makes
  43  * no representations about the suitability of this software for any
  44  * purpose.  It is provided "as is" without express or implied
  45  * warranty.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  48  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  49  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  50  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  51  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  52  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  53  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  54  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  55  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  56  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  57  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  */
  61
  62 /*
  63  * This code does two things necessary for the enhanced TCP metrics to
  64  * function in a useful manner:
  65  *  1) It marks all non-host routes as `cloning', thus ensuring that
  66  *     every actual reference to such a route actually gets turned
  67  *     into a reference to a host route to the specific destination
  68  *     requested.
  69  *  2) When such routes lose all their references, it arranges for them
  70  *     to be deleted in some random collection of circumstances, so that
  71  *     a large quantity of stale routing data is not kept in kernel memory
  72  *     indefinitely.  See in6_rtqtimo() below for the exact mechanism.
  73  */
  74
  75 #include <sys/param.h>
  76 #include <sys/systm.h>
  77 #include <sys/kernel.h>
  78 #include <sys/sysctl.h>
  79 #include <kern/queue.h>
  80 #include <sys/socket.h>
  81 #include <sys/socketvar.h>
  82 #include <sys/mbuf.h>
  83 #include <sys/syslog.h>
  84
  85 #include <net/if.h>
  86 #include <net/route.h>
  87 #include <netinet/in.h>
  88 #if defined(__APPLE__)
  89 #include <netinet/ip_var.h>
  90 #endif
  91 #include <netinet/in_var.h>
  92
  93 #include <netinet/ip6.h>
  94 #include <netinet6/ip6_var.h>
  95
  96 #include <netinet/icmp6.h>
  97
  98 #if !defined(__APPLE__)
  99 #include <netinet6/tcp6.h>
 100 #include <netinet6/tcp6_seq.h>
 101 #include <netinet6/tcp6_timer.h>
 102 #include <netinet6/tcp6_var.h>
 103 #else
 104 #include <netinet/tcp.h>
 105 #include <netinet/tcp_seq.h>
 106 #include <netinet/tcp_timer.h>
 107 #include <netinet/tcp_var.h>
 108 #endif
 109
 110 #if !defined(__APPLE__)
 111 #define tcp_sendspace tcp6_sendspace
 112 #define tcp_recvspace tcp6_recvspace
 113 #define time_second time.tv_sec
 114 #define tvtohz hzto
 115 #endif
 116
 117 extern int      in6_inithead __P((void **head, int off));
 118
 119 #define RTPRF_OURS              RTF_PROTO3      /* set on routes we manage */
 120
 121 /*
 122  * Do what we need to do when inserting a route.
 123  */
 124 static struct radix_node *
 125 in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
 126             struct radix_node *treenodes)
 127 {
 128         struct rtentry *rt = (struct rtentry *)treenodes;
 129         struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt);
 130         struct radix_node *ret;
 131
 132         /*
 133          * For IPv6, all unicast non-host routes are automatically cloning.
 134          */
 135         if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 136                 rt->rt_flags |= RTF_MULTICAST;
 137
 138         if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
 139                 rt->rt_flags |= RTF_PRCLONING;
 140         }
 141
 142         /*
 143          * A little bit of help for both IPv6 output and input:
 144          *   For local addresses, we make sure that RTF_LOCAL is set,
 145          *   with the thought that this might one day be used to speed up
 146          *   ip_input().
 147          *
 148          * We also mark routes to multicast addresses as such, because
 149          * it's easy to do and might be useful (but this is much more
 150          * dubious since it's so easy to inspect the address).  (This
 151          * is done above.)
 152          *
 153          * XXX
 154          * should elaborate the code.
 155          */
 156         if (rt->rt_flags & RTF_HOST) {
 157                 if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr)
 158                                         ->sin6_addr,
 159                                        &sin6->sin6_addr)) {
 160                         rt->rt_flags |= RTF_LOCAL;
 161                 }
 162         }
 163
 164         /*
 165          * We also specify a send and receive pipe size for every
 166          * route added, to help TCP a bit.  TCP doesn't actually
 167          * want a true pipe size, which would be prohibitive in memory
 168          * costs and is hard to compute anyway; it simply uses these
 169          * values to size its buffers.  So, we fill them in with the
 170          * same values that TCP would have used anyway, and allow the
 171          * installing program or the link layer to override these values
 172          * as it sees fit.  This will hopefully allow TCP more
 173          * opportunities to save its ssthresh value.
 174          */
 175         if (!rt->rt_rmx.rmx_sendpipe && !(rt->rt_rmx.rmx_locks & RTV_SPIPE))
 176                 rt->rt_rmx.rmx_sendpipe = tcp_sendspace;
 177
 178         if (!rt->rt_rmx.rmx_recvpipe && !(rt->rt_rmx.rmx_locks & RTV_RPIPE))
 179                 rt->rt_rmx.rmx_recvpipe = tcp_recvspace;
 180
 181         if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
 182             && rt->rt_ifp)
 183                 rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
 184
 185         ret = rn_addroute(v_arg, n_arg, head, treenodes);
 186         if (ret == NULL && rt->rt_flags & RTF_HOST) {
 187                 struct rtentry *rt2;
 188                 /*
 189                  * We are trying to add a host route, but can't.
 190                  * Find out if it is because of an
 191                  * ARP entry and delete it if so.
 192                  */
 193                 rt2 = rtalloc1((struct sockaddr *)sin6, 0,
 194                                 RTF_CLONING | RTF_PRCLONING);
 195                 if (rt2) {
 196                         if (rt2->rt_flags & RTF_LLINFO &&
 197                                 rt2->rt_flags & RTF_HOST &&
 198                                 rt2->rt_gateway &&
 199                                 rt2->rt_gateway->sa_family == AF_LINK) {
 200                                 rtrequest(RTM_DELETE,
 201                                           (struct sockaddr *)rt_key(rt2),
 202                                           rt2->rt_gateway,
 203                                           rt_mask(rt2), rt2->rt_flags, 0);
 204                                 ret = rn_addroute(v_arg, n_arg, head,
 205                                         treenodes);
 206                         }
 207                         RTFREE(rt2);
 208                 }
 209         } else if (ret == NULL && rt->rt_flags & RTF_CLONING) {
 210                 struct rtentry *rt2;
 211                 /*
 212                  * We are trying to add a net route, but can't.
 213                  * The following case should be allowed, so we'll make a
 214                  * special check for this:
 215                  *      Two IPv6 addresses with the same prefix is assigned
 216                  *      to a single interrface.
 217                  *      # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1)
 218                  *      # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2)
 219                  *      In this case, (*1) and (*2) want to add the same
 220                  *      net route entry, 3ffe:0501:: -> if0.
 221                  *      This case should not raise an error.
 222                  */
 223                 rt2 = rtalloc1((struct sockaddr *)sin6, 0,
 224                                 RTF_CLONING | RTF_PRCLONING);
 225                 if (rt2) {
 226                         if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY))
 227                                         == RTF_CLONING
 228                          && rt2->rt_gateway
 229                          && rt2->rt_gateway->sa_family == AF_LINK
 230                          && rt2->rt_ifp == rt->rt_ifp) {
 231                                 ret = rt2->rt_nodes;
 232                         }
 233                         RTFREE(rt2);
 234                 }
 235         }
 236         return ret;
 237 }
 238
 239 /*
 240  * This code is the inverse of in6_clsroute: on first reference, if we
 241  * were managing the route, stop doing so and set the expiration timer
 242  * back off again.
 243  */
 244 static struct radix_node *
 245 in6_matroute(void *v_arg, struct radix_node_head *head)
 246 {
 247         struct radix_node *rn = rn_match(v_arg, head);
 248         struct rtentry *rt = (struct rtentry *)rn;
 249
 250         if (rt && rt->rt_refcnt == 0) { /* this is first reference */
 251                 if (rt->rt_flags & RTPRF_OURS) {
 252                         rt->rt_flags &= ~RTPRF_OURS;
 253                         rt->rt_rmx.rmx_expire = 0;
 254                 }
 255         }
 256         return rn;
 257 }
 258
 259 static int rtq_reallyold = 60*60;
 260         /* one hour is ``really old'' */
 261
 262 static int rtq_minreallyold = 10;
 263         /* never automatically crank down to less */
 264
 265 static int rtq_toomany = 128;
 266         /* 128 cached routes is ``too many'' */
 267
 268
 269 /*
 270  * On last reference drop, mark the route as belong to us so that it can be
 271  * timed out.
 272  */
 273 static void
 274 in6_clsroute(struct radix_node *rn, struct radix_node_head *head)
 275 {
 276         struct rtentry *rt = (struct rtentry *)rn;
 277
 278         if (!(rt->rt_flags & RTF_UP))
 279                 return;         /* prophylactic measures */
 280
 281         if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
 282                 return;
 283
 284         if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS))
 285            != RTF_WASCLONED)
 286                 return;
 287
 288         /*
 289          * As requested by David Greenman:
 290          * If rtq_reallyold is 0, just delete the route without
 291          * waiting for a timeout cycle to kill it.
 292          */
 293         if (rtq_reallyold != 0) {
 294                 rt->rt_flags |= RTPRF_OURS;
 295                 rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
 296         } else {
 297                 rtrequest(RTM_DELETE,
 298                           (struct sockaddr *)rt_key(rt),
 299                           rt->rt_gateway, rt_mask(rt),
 300                           rt->rt_flags, 0);
 301         }
 302 }
 303
 304 struct rtqk_arg {
 305         struct radix_node_head *rnh;
 306         int mode;
 307         int updating;
 308         int draining;
 309         int killed;
 310         int found;
 311         time_t nextstop;
 312 };
 313
 314 /*
 315  * Get rid of old routes.  When draining, this deletes everything, even when
 316  * the timeout is not expired yet.  When updating, this makes sure that
 317  * nothing has a timeout longer than the current value of rtq_reallyold.
 318  */
 319 static int
 320 in6_rtqkill(struct radix_node *rn, void *rock)
 321 {
 322         struct rtqk_arg *ap = rock;
 323         struct rtentry *rt = (struct rtentry *)rn;
 324         int err;
 325
 326         if (rt->rt_flags & RTPRF_OURS) {
 327                 ap->found++;
 328
 329                 if (ap->draining || rt->rt_rmx.rmx_expire <= time_second) {
 330                         if (rt->rt_refcnt > 0)
 331                                 panic("rtqkill route really not free");
 332
 333                         err = rtrequest(RTM_DELETE,
 334                                         (struct sockaddr *)rt_key(rt),
 335                                         rt->rt_gateway, rt_mask(rt),
 336                                         rt->rt_flags, 0);
 337                         if (err) {
 338                                 log(LOG_WARNING, "in6_rtqkill: error %d", err);
 339                         } else {
 340                                 ap->killed++;
 341                         }
 342                 } else {
 343                         if (ap->updating
 344                            && (rt->rt_rmx.rmx_expire - time_second
 345                                > rtq_reallyold)) {
 346                                 rt->rt_rmx.rmx_expire = time_second
 347                                         + rtq_reallyold;
 348                         }
 349                         ap->nextstop = lmin(ap->nextstop,
 350                                             rt->rt_rmx.rmx_expire);
 351                 }
 352         }
 353
 354         return 0;
 355 }
 356
 357 #define RTQ_TIMEOUT     60*10   /* run no less than once every ten minutes */
 358 static int rtq_timeout = RTQ_TIMEOUT;
 359
 360 static void
 361 in6_rtqtimo_funneled(void *rock)
 362 {
 363 #ifdef __APPLE__
 364         boolean_t   funnel_state;
 365         funnel_state = thread_funnel_set(network_flock, TRUE);
 366         in6_rtqtimo(rock);
 367 #endif
 368 #ifdef __APPLE__
 369         (void) thread_funnel_set(network_flock, FALSE);
 370 #endif
 371 }
 372
 373 static void
 374 in6_rtqtimo(void *rock)
 375 {
 376         struct radix_node_head *rnh = rock;
 377         struct rtqk_arg arg;
 378         struct timeval atv;
 379         static time_t last_adjusted_timeout = 0;
 380         int s;
 381
 382         arg.found = arg.killed = 0;
 383         arg.rnh = rnh;
 384         arg.nextstop = time_second + rtq_timeout;
 385         arg.draining = arg.updating = 0;
 386         s = splnet();
 387         rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
 388         splx(s);
 389
 390         /*
 391          * Attempt to be somewhat dynamic about this:
 392          * If there are ``too many'' routes sitting around taking up space,
 393          * then crank down the timeout, and see if we can't make some more
 394          * go away.  However, we make sure that we will never adjust more
 395          * than once in rtq_timeout seconds, to keep from cranking down too
 396          * hard.
 397          */
 398         if ((arg.found - arg.killed > rtq_toomany)
 399            && (time_second - last_adjusted_timeout >= rtq_timeout)
 400            && rtq_reallyold > rtq_minreallyold) {
 401                 rtq_reallyold = 2*rtq_reallyold / 3;
 402                 if (rtq_reallyold < rtq_minreallyold) {
 403                         rtq_reallyold = rtq_minreallyold;
 404                 }
 405
 406                 last_adjusted_timeout = time_second;
 407 #if DIAGNOSTIC
 408                 log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d",
 409                     rtq_reallyold);
 410 #endif
 411                 arg.found = arg.killed = 0;
 412                 arg.updating = 1;
 413                 s = splnet();
 414                 rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
 415                 splx(s);
 416         }
 417
 418         atv.tv_usec = 0;
 419         atv.tv_sec = arg.nextstop;
 420         timeout(in6_rtqtimo_funneled, rock, tvtohz(&atv));
 421 }
 422
 423 /*
 424  * Age old PMTUs.
 425  */
 426 struct mtuex_arg {
 427         struct radix_node_head *rnh;
 428         time_t nextstop;
 429 };
 430
 431 static int
 432 in6_mtuexpire(struct radix_node *rn, void *rock)
 433 {
 434         struct rtentry *rt = (struct rtentry *)rn;
 435         struct mtuex_arg *ap = rock;
 436
 437         /* sanity */
 438         if (!rt)
 439                 panic("rt == NULL in in6_mtuexpire");
 440
 441         if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) {
 442                 if (rt->rt_rmx.rmx_expire <= time_second) {
 443                         rt->rt_flags |= RTF_PROBEMTU;
 444                 } else {
 445                         ap->nextstop = lmin(ap->nextstop,
 446                                         rt->rt_rmx.rmx_expire);
 447                 }
 448         }
 449
 450         return 0;
 451 }
 452
 453 #define MTUTIMO_DEFAULT (60*1)
 454
 455 static void
 456 in6_mtutimo_funneled(void *rock)
 457 {
 458 #ifdef __APPLE__
 459         boolean_t   funnel_state;
 460         funnel_state = thread_funnel_set(network_flock, TRUE);
 461         in6_mtutimo(rock);
 462 #endif
 463 #ifdef __APPLE__
 464         (void) thread_funnel_set(network_flock, FALSE);
 465 #endif
 466 }
 467
 468 static void
 469 in6_mtutimo(void *rock)
 470 {
 471         struct radix_node_head *rnh = rock;
 472         struct mtuex_arg arg;
 473         struct timeval atv;
 474         int s;
 475
 476         arg.rnh = rnh;
 477         arg.nextstop = time_second + MTUTIMO_DEFAULT;
 478         s = splnet();
 479         rnh->rnh_walktree(rnh, in6_mtuexpire, &arg);
 480         splx(s);
 481
 482         atv.tv_usec = 0;
 483         atv.tv_sec = arg.nextstop;
 484         if (atv.tv_sec < time_second) {
 485                 printf("invalid mtu expiration time on routing table\n");
 486                 arg.nextstop = time_second + 30;        /*last resort*/
 487         }
 488         timeout(in6_mtutimo_funneled, rock, tvtohz(&atv));
 489 }
 490
 491 #if 0
 492 void
 493 in6_rtqdrain()
 494 {
 495         struct radix_node_head *rnh = rt_tables[AF_INET6];
 496         struct rtqk_arg arg;
 497         int s;
 498         arg.found = arg.killed = 0;
 499         arg.rnh = rnh;
 500         arg.nextstop = 0;
 501         arg.draining = 1;
 502         arg.updating = 0;
 503         s = splnet();
 504         rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
 505         splx(s);
 506 }
 507 #endif
 508
 509 /*
 510  * Initialize our routing tree.
 511  */
 512 int
 513 in6_inithead(void **head, int off)
 514 {
 515         struct radix_node_head *rnh;
 516
 517         if (!rn_inithead(head, off))
 518                 return 0;
 519
 520         if (head != (void **)&rt_tables[AF_INET6]) /* BOGUS! */
 521                 return 1;       /* only do this for the real routing table */
 522
 523         rnh = *head;
 524         rnh->rnh_addaddr = in6_addroute;
 525         rnh->rnh_matchaddr = in6_matroute;
 526         rnh->rnh_close = in6_clsroute;
 527         in6_rtqtimo(rnh);       /* kick off timeout first time */
 528         in6_mtutimo(rnh);       /* kick off timeout first time */
 529         return 1;
 530 }