xnu-6153.61.1.tar.gz

[apple/xnu.git] / bsd / netinet / in_rmx.c
diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c

index b3143e11d1cb1ef9a4e82e29f8000a635cccf556..0740088709b76b2d6eb0177b8b6e725d5b6476c1 100644 (file)
--- a/bsd/netinet/in_rmx.c
+++ b/bsd/netinet/in_rmx.c
@@ -1,17 +1,20 @@
  /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
- * 
   * This file contains Original Code and/or Modifications of Original Code
   * as defined in and that are subject to the Apple Public Source License
   * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
   * The Original Code and all software distributed under the License are
   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -19,8 +22,8 @@
   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   * Please see the License for the specific language governing rights and
   * limitations under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   */
  /*
   * Copyright 1994, 1995 Massachusetts Institute of Technology
@@ -50,7 +53,6 @@
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   *
- * $FreeBSD: src/sys/netinet/in_rmx.c,v 1.37.2.1 2001/05/14 08:23:49 ru Exp $
   */
  
  /*
@@ -72,39 +74,67 @@
  #include <sys/sysctl.h>
  #include <sys/socket.h>
  #include <sys/mbuf.h>
+#include <sys/protosw.h>
  #include <sys/syslog.h>
+#include <sys/mcache.h>
+#include <kern/locks.h>
  
  #include <net/if.h>
  #include <net/route.h>
  #include <netinet/in.h>
  #include <netinet/in_var.h>
+#include <netinet/in_arp.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet6/nd6.h>
+
+extern int tvtohz(struct timeval *);
  
-extern int     in_inithead __P((void **head, int off));
+static int in_rtqtimo_run;              /* in_rtqtimo is scheduled to run */
+static void in_rtqtimo(void *);
+static void in_sched_rtqtimo(struct timeval *);
  
-#ifdef __APPLE__
-static void in_rtqtimo(void *rock);
-#endif
+static struct radix_node *in_addroute(void *, void *, struct radix_node_head *,
+    struct radix_node *);
+static struct radix_node *in_deleteroute(void *, void *,
+    struct radix_node_head *);
+static struct radix_node *in_matroute(void *, struct radix_node_head *);
+static struct radix_node *in_matroute_args(void *, struct radix_node_head *,
+    rn_matchf_t *f, void *);
+static void in_clsroute(struct radix_node *, struct radix_node_head *);
+static int in_rtqkill(struct radix_node *, void *);
  
-#define RTPRF_OURS             RTF_PROTO3      /* set on routes we manage */
+static int in_ifadownkill(struct radix_node *, void *);
  
  /*
   * Do what we need to do when inserting a route.
   */
  static struct radix_node *
  in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
-           struct radix_node *treenodes)
+    struct radix_node *treenodes)
  {
         struct rtentry *rt = (struct rtentry *)treenodes;
-       struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
+       struct sockaddr_in *sin = (struct sockaddr_in *)(void *)rt_key(rt);
         struct radix_node *ret;
+       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+       uint32_t flags = rt->rt_flags;
+       boolean_t verbose = (rt_verbose > 1);
+
+       LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
+       RT_LOCK_ASSERT_HELD(rt);
+
+       if (verbose) {
+               rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf));
+       }
  
         /*
          * For IP, all unicast non-host routes are automatically cloning.
          */
-       if(IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+       if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
                 rt->rt_flags |= RTF_MULTICAST;
+       }
  
-       if(!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
+       if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
                 rt->rt_flags |= RTF_PRCLONING;
         }
  
@@ -128,104 +158,210 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
                 if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
                         rt->rt_flags |= RTF_BROADCAST;
                 } else {
-#define satosin(sa) ((struct sockaddr_in *)sa)
-                       if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr
-                           == sin->sin_addr.s_addr)
+                       /* Become a regular mutex */
+                       RT_CONVERT_LOCK(rt);
+                       IFA_LOCK_SPIN(rt->rt_ifa);
+                       if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
+                           sin->sin_addr.s_addr) {
                                 rt->rt_flags |= RTF_LOCAL;
-#undef satosin
+                       }
+                       IFA_UNLOCK(rt->rt_ifa);
                 }
         }
  
-       if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) 
-           && rt->rt_ifp)
+       if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
+           rt->rt_ifp) {
                 rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
+               if (INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
+                       rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
+                       /* Further adjust the size for CLAT46 expansion */
+                       rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+               }
+       }
  
         ret = rn_addroute(v_arg, n_arg, head, treenodes);
-       if (ret == NULL && rt->rt_flags & RTF_HOST) {
+       if (ret == NULL && (rt->rt_flags & RTF_HOST)) {
                 struct rtentry *rt2;
                 /*
                  * We are trying to add a host route, but can't.
                  * Find out if it is because of an
                  * ARP entry and delete it if so.
                  */
-               rt2 = rtalloc1((struct sockaddr *)sin, 0,
-                               RTF_CLONING | RTF_PRCLONING);
-               if (rt2) {
-                       if (rt2->rt_flags & RTF_LLINFO &&
-                               rt2->rt_flags & RTF_HOST &&
-                               rt2->rt_gateway &&
-                               rt2->rt_gateway->sa_family == AF_LINK) {
-                               rtrequest(RTM_DELETE,
-                                         (struct sockaddr *)rt_key(rt2),
-                                         rt2->rt_gateway,
-                                         rt_mask(rt2), rt2->rt_flags, 0);
+               rt2 = rtalloc1_scoped_locked(rt_key(rt), 0,
+                   RTF_CLONING | RTF_PRCLONING, sin_get_ifscope(rt_key(rt)));
+               if (rt2 != NULL) {
+                       char dbufc[MAX_IPv4_STR_LEN];
+
+                       RT_LOCK(rt2);
+                       if (verbose) {
+                               rt_str(rt2, dbufc, sizeof(dbufc), NULL, 0);
+                       }
+
+                       if ((rt2->rt_flags & RTF_LLINFO) &&
+                           (rt2->rt_flags & RTF_HOST) &&
+                           rt2->rt_gateway != NULL &&
+                           rt2->rt_gateway->sa_family == AF_LINK) {
+                               if (verbose) {
+                                       log(LOG_DEBUG, "%s: unable to insert "
+                                           "route to %s;%s, flags=%b, due to "
+                                           "existing ARP route %s->%s "
+                                           "flags=%b, attempting to delete\n",
+                                           __func__, dbuf,
+                                           (rt->rt_ifp != NULL) ?
+                                           rt->rt_ifp->if_xname : "",
+                                           rt->rt_flags, RTF_BITS, dbufc,
+                                           (rt2->rt_ifp != NULL) ?
+                                           rt2->rt_ifp->if_xname : "",
+                                           rt2->rt_flags, RTF_BITS);
+                               }
+                               /*
+                                * Safe to drop rt_lock and use rt_key,
+                                * rt_gateway, since holding rnh_lock here
+                                * prevents another thread from calling
+                                * rt_setgate() on this route.
+                                */
+                               RT_UNLOCK(rt2);
+                               (void) rtrequest_locked(RTM_DELETE, rt_key(rt2),
+                                   rt2->rt_gateway, rt_mask(rt2),
+                                   rt2->rt_flags, NULL);
                                 ret = rn_addroute(v_arg, n_arg, head,
-                                       treenodes);
+                                   treenodes);
+                       } else {
+                               RT_UNLOCK(rt2);
                         }
-                       rtfree(rt2);
+                       rtfree_locked(rt2);
                 }
         }
+
+       if (!verbose) {
+               goto done;
+       }
+
+       if (ret != NULL) {
+               if (flags != rt->rt_flags) {
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s inserted, "
+                           "oflags=%b, flags=%b\n", __func__,
+                           dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                           rt->rt_ifp->if_xname : "", flags, RTF_BITS,
+                           rt->rt_flags, RTF_BITS);
+               } else {
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s inserted, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
+       } else {
+               log(LOG_DEBUG, "%s: unable to insert route to %s->%s->%s, "
+                   "flags=%b, already exists\n", __func__, dbuf, gbuf,
+                   (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                   rt->rt_flags, RTF_BITS);
+       }
+done:
         return ret;
  }
  
+static struct radix_node *
+in_deleteroute(void *v_arg, void *netmask_arg, struct radix_node_head *head)
+{
+       struct radix_node *rn;          /* result of rn_delete() */
+
+       LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
+       rn = rn_delete(v_arg, netmask_arg, head);
+       if (rt_verbose > 1 && rn != NULL) {     /* verbose logging only */
+               char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+               struct rtentry *rt = (struct rtentry *)rn;
+
+               RT_LOCK(rt);    /* hold rt_lock while formatting the route */
+               rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf));
+               log(LOG_DEBUG, "%s: route to %s->%s->%s deleted, "
+                   "flags=%b\n", __func__, dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                   rt->rt_ifp->if_xname : "", rt->rt_flags, RTF_BITS);
+               RT_UNLOCK(rt);
+       }
+       return rn;      /* whatever rn_delete() returned, logged or not */
+}
+
+/*
+ * Validate (unexpire) an expiring AF_INET route; rn is returned unchanged.
+ */
+struct radix_node *
+in_validate(struct radix_node *rn)
+{
+       struct rtentry *rt = (struct rtentry *)rn;
+
+       RT_LOCK_ASSERT_HELD(rt);        /* caller must hold rt_lock */
+
+       /* Is this the first reference (rt_refcnt still zero)? */
+       if (rt->rt_refcnt == 0) {
+               if (rt_verbose > 2) {
+                       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+
+                       rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf));
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s validated, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
+
+               /*
+                * It's one of ours; unexpire it.  If the timer is already
+                * scheduled, let it run later as it won't re-arm itself
+                * if there's nothing to do.
+                */
+               if (rt->rt_flags & RTPRF_OURS) {
+                       rt->rt_flags &= ~RTPRF_OURS;
+                       rt_setexpire(rt, 0);
+               }
+       }
+       return rn;
+}
+
+/*
+ * Wrapper for in_matroute_args() that passes NULL leaf-matching parameters.
+ */
+static struct radix_node *
+in_matroute(void *v_arg, struct radix_node_head *head)
+{
+       return in_matroute_args(v_arg, head, NULL, NULL);
+}
+
  /*
   * This code is the inverse of in_clsroute: on first reference, if we
   * were managing the route, stop doing so and set the expiration timer
   * back off again.
   */
  static struct radix_node *
-in_matroute(void *v_arg, struct radix_node_head *head)
+in_matroute_args(void *v_arg, struct radix_node_head *head,
+    rn_matchf_t *f, void *w)
  {
-       struct radix_node *rn = rn_match(v_arg, head);
-       struct rtentry *rt = (struct rtentry *)rn;
+       struct radix_node *rn = rn_match_args(v_arg, head, f, w);
  
-       if(rt && rt->rt_refcnt == 0) { /* this is first reference */
-               if(rt->rt_flags & RTPRF_OURS) {
-                       rt->rt_flags &= ~RTPRF_OURS;
-                       rt->rt_rmx.rmx_expire = 0;
-               }
+       if (rn != NULL) {       /* a route matched */
+               RT_LOCK_SPIN((struct rtentry *)rn);
+               in_validate(rn);        /* asserts rt_lock is held */
+               RT_UNLOCK((struct rtentry *)rn);
          }
          return rn;
  }
  
-static int rtq_reallyold = 60*60;
-       /* one hour is ``really old'' */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, 
-    &rtq_reallyold , 0, 
+/* one hour is ``really old'' */
+static uint32_t rtq_reallyold = 60 * 60;
+SYSCTL_UINT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_reallyold, 0,
      "Default expiration time on dynamically learned routes");
-                                  
-static int rtq_minreallyold = 10;
-       /* never automatically crank down to less */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, 
-    &rtq_minreallyold , 0, 
+
+/* never automatically crank down to less */
+static uint32_t rtq_minreallyold = 10;
+SYSCTL_UINT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_minreallyold, 0,
      "Minimum time to attempt to hold onto dynamically learned routes");
-                                  
-static int rtq_toomany = 128;
-       /* 128 cached routes is ``too many'' */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, 
-    &rtq_toomany , 0, "Upper limit on dynamically learned routes");
-
-#ifdef __APPLE__
-/* XXX LD11JUL02 Special case for AOL 5.1.2 connectivity issue to AirPort BS (Radar 2969954)
- * AOL is adding a circular route ("10.0.1.1/32 10.0.1.1") when establishing its ppp tunnel
- * to the AP BaseStation by removing the default gateway and replacing it with their tunnel entry point.
- * There is no apparent reason to add this route as there is a valid 10.0.1.1/24 route to the BS.
- * That circular route was ignored on previous version of MacOS X because of a routing bug
- * corrected with the merge to FreeBSD4.4 (a route generated from an RTF_CLONING route had the RTF_WASCLONED
- * flag set but did not have a reference to the parent route) and that entry was left in the RT. This workaround is
- * made in order to provide binary compatibility with AOL. 
- * If we catch a process adding a circular route with a /32 from the routing socket, we error it out instead of
- * confusing the routing table with a wrong route to the previous default gateway
- * If for some reason a circular route is needed, turn this sysctl (net.inet.ip.check_route_selfref) to zero.
- */
-int check_routeselfref = 1;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW, 
-    &check_routeselfref , 0, "");
-#endif
  
-__private_extern__ int use_routegenid = 1;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW, 
-    &use_routegenid , 0, "");
+/* 128 cached routes is ``too many'' */
+static uint32_t rtq_toomany = 128;
+SYSCTL_UINT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache,
+    CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_toomany, 0,
+    "Upper limit on dynamically learned routes");
  
  /*
   * On last reference drop, mark the route as belong to us so that it can be
@@ -234,41 +370,101 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW,
  static void
  in_clsroute(struct radix_node *rn, struct radix_node_head *head)
  {
+#pragma unused(head)
+       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
         struct rtentry *rt = (struct rtentry *)rn;
+       boolean_t verbose = (rt_verbose > 1);
  
-       if(!(rt->rt_flags & RTF_UP))
-               return;         /* prophylactic measures */
+       LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
+       RT_LOCK_ASSERT_HELD(rt);
  
-       if((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
+       if (!(rt->rt_flags & RTF_UP)) {
+               return;         /* prophylactic measures */
+       }
+       if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) {
                 return;
+       }
  
-       if((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS))
-          != RTF_WASCLONED)
+       if (rt->rt_flags & RTPRF_OURS) {
                 return;
+       }
+
+       if (!(rt->rt_flags & (RTF_WASCLONED | RTF_DYNAMIC))) {
+               return;
+       }
+
+       if (verbose) {
+               rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf));
+       }
  
         /*
-        * As requested by David Greenman:
-        * If rtq_reallyold is 0, just delete the route without
-        * waiting for a timeout cycle to kill it.
+        * Delete the route immediately if RTF_DELCLONE is set or
+        * if route caching is disabled (rtq_reallyold set to 0).
+        * Otherwise, let it expire and be deleted by in_rtqkill().
          */
-       if(rtq_reallyold != 0) {
-               rt->rt_flags |= RTPRF_OURS;
-               rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
+       if ((rt->rt_flags & RTF_DELCLONE) || rtq_reallyold == 0) {
+               int err;
+
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: deleting route to %s->%s->%s, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
+               /*
+                * Delete the route from the radix tree but since we are
+                * called when the route's reference count is 0, don't
+                * deallocate it until we return from this routine by
+                * telling rtrequest that we're interested in it.
+                * Safe to drop rt_lock and use rt_key, rt_gateway since
+                * holding rnh_lock here prevents another thread from
+                * calling rt_setgate() on this route.
+                */
+               RT_UNLOCK(rt);
+               err = rtrequest_locked(RTM_DELETE, rt_key(rt),
+                   rt->rt_gateway, rt_mask(rt), rt->rt_flags, &rt);
+               if (err == 0) {
+                       /* Now let the caller free it */
+                       RT_LOCK(rt);
+                       RT_REMREF_LOCKED(rt);
+               } else {
+                       RT_LOCK(rt);
+                       if (!verbose) {
+                               rt_str(rt, dbuf, sizeof(dbuf),
+                                   gbuf, sizeof(gbuf));
+                       }
+                       log(LOG_ERR, "%s: error deleting route to "
+                           "%s->%s->%s, flags=%b, err=%d\n", __func__,
+                           dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                           rt->rt_ifp->if_xname : "", rt->rt_flags,
+                           RTF_BITS, err);
+               }
         } else {
-               rtrequest(RTM_DELETE,
-                         (struct sockaddr *)rt_key(rt),
-                         rt->rt_gateway, rt_mask(rt),
-                         rt->rt_flags, 0);
+               uint64_t timenow;
+
+               timenow = net_uptime();
+               rt->rt_flags |= RTPRF_OURS;
+               rt_setexpire(rt, timenow + rtq_reallyold);
+
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s invalidated, "
+                           "flags=%b, expire=T+%u\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS, rt->rt_expire - timenow);
+               }
+
+               /* We have at least one entry; arm the timer if not already */
+               in_sched_rtqtimo(NULL);
         }
  }
  
  struct rtqk_arg {
         struct radix_node_head *rnh;
-       int draining;
-       int killed;
-       int found;
         int updating;
-       time_t nextstop;
+       int draining;           /* delete RTPRF_OURS routes even if unexpired */
+       uint32_t killed;        /* routes in_rtqkill() deleted successfully */
+       uint32_t found;         /* RTPRF_OURS routes seen by in_rtqkill() */
+       uint64_t nextstop;      /* lowest rt_expire among routes left alone */
  };
  
  /*
@@ -281,69 +477,127 @@ in_rtqkill(struct radix_node *rn, void *rock)
  {
         struct rtqk_arg *ap = rock;
         struct rtentry *rt = (struct rtentry *)rn;
+       boolean_t verbose = (rt_verbose > 1);
+       uint64_t timenow;
         int err;
  
-       if(rt->rt_flags & RTPRF_OURS) {
-               ap->found++;
+       timenow = net_uptime();
+       LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
+       RT_LOCK(rt);
+       if (rt->rt_flags & RTPRF_OURS) {
+               char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+
+               if (verbose) {
+                       rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf));
+               }
  
-               if(ap->draining || rt->rt_rmx.rmx_expire <= time_second) {
-                       if(rt->rt_refcnt > 0)
-                               panic("rtqkill route really not free");
+               ap->found++;
+               VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0);
+               VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0);
+               if (ap->draining || rt->rt_expire <= timenow) {
+                       if (rt->rt_refcnt > 0) {
+                               panic("%s: route %p marked with RTPRF_OURS "
+                                   "with non-zero refcnt (%u)", __func__,
+                                   rt, rt->rt_refcnt);
+                               /* NOTREACHED */
+                       }
  
-                       err = rtrequest(RTM_DELETE,
-                                       (struct sockaddr *)rt_key(rt),
-                                       rt->rt_gateway, rt_mask(rt),
-                                       rt->rt_flags, 0);
-                       if(err) {
-                               log(LOG_WARNING, "in_rtqkill: error %d\n", err);
+                       if (verbose) {
+                               log(LOG_DEBUG, "%s: deleting route to "
+                                   "%s->%s->%s, flags=%b, draining=%d\n",
+                                   __func__, dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                                   rt->rt_ifp->if_xname : "", rt->rt_flags,
+                                   RTF_BITS, ap->draining);
+                       }
+                       RT_ADDREF_LOCKED(rt);   /* for us to free below */
+                       /*
+                        * Delete this route since we're done with it;
+                        * the route may be freed afterwards, so we
+                        * can no longer refer to 'rt' upon returning
+                        * from rtrequest().  Safe to drop rt_lock and
+                        * use rt_key, rt_gateway since holding rnh_lock
+                        * here prevents another thread from calling
+                        * rt_setgate() on this route.
+                        */
+                       RT_UNLOCK(rt);
+                       err = rtrequest_locked(RTM_DELETE, rt_key(rt),
+                           rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
+                       if (err != 0) {
+                               RT_LOCK(rt);
+                               if (!verbose) {
+                                       rt_str(rt, dbuf, sizeof(dbuf),
+                                           gbuf, sizeof(gbuf));
+                               }
+                               log(LOG_ERR, "%s: error deleting route to "
+                                   "%s->%s->%s, flags=%b, err=%d\n", __func__,
+                                   dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                                   rt->rt_ifp->if_xname : "", rt->rt_flags,
+                                   RTF_BITS, err);
+                               RT_UNLOCK(rt);
                         } else {
                                 ap->killed++;
                         }
+                       rtfree_locked(rt);
                 } else {
-                       if(ap->updating
-                          && (rt->rt_rmx.rmx_expire - time_second
-                              > rtq_reallyold)) {
-                               rt->rt_rmx.rmx_expire = time_second
-                                       + rtq_reallyold;
+                       uint64_t expire = (rt->rt_expire - timenow);
+
+                       if (ap->updating && expire > rtq_reallyold) {
+                               rt_setexpire(rt, timenow + rtq_reallyold);
+                               if (verbose) {
+                                       log(LOG_DEBUG, "%s: route to "
+                                           "%s->%s->%s, flags=%b, adjusted "
+                                           "expire=T+%u (was T+%u)\n",
+                                           __func__, dbuf, gbuf,
+                                           (rt->rt_ifp != NULL) ?
+                                           rt->rt_ifp->if_xname : "",
+                                           rt->rt_flags, RTF_BITS,
+                                           (rt->rt_expire - timenow), expire);
+                               }
                         }
-                       ap->nextstop = lmin(ap->nextstop,
-                                           rt->rt_rmx.rmx_expire);
+                       ap->nextstop = lmin(ap->nextstop, rt->rt_expire);
+                       RT_UNLOCK(rt);
                 }
+       } else {
+               RT_UNLOCK(rt);
         }
  
         return 0;
  }
  
-static void
-in_rtqtimo_funnel(void *rock)
-{
-       boolean_t       funnel_state;
-
-       funnel_state = thread_funnel_set(network_flock, TRUE);
-        in_rtqtimo(rock);
-       (void) thread_funnel_set(network_flock, FALSE);
-
-}
-#define RTQ_TIMEOUT    60*10   /* run no less than once every ten minutes */
+#define RTQ_TIMEOUT     (60*10) /* run no less than once every ten minutes */
  static int rtq_timeout = RTQ_TIMEOUT;
  
  static void
-in_rtqtimo(void *rock)
+in_rtqtimo(void *targ)
  {
-       struct radix_node_head *rnh = rock;
+#pragma unused(targ)
+       struct radix_node_head *rnh;
         struct rtqk_arg arg;
         struct timeval atv;
-       static time_t last_adjusted_timeout = 0;
-       int s;
-
-       arg.found = arg.killed = 0;
+       static uint64_t last_adjusted_timeout = 0;
+       boolean_t verbose = (rt_verbose > 1);
+       uint64_t timenow;
+       uint32_t ours;
+
+       lck_mtx_lock(rnh_lock);
+       rnh = rt_tables[AF_INET];
+       VERIFY(rnh != NULL);
+
+       /* Get the timestamp after we acquire the lock for better accuracy */
+       timenow = net_uptime();
+       if (verbose) {
+               log(LOG_DEBUG, "%s: initial nextstop is T+%u seconds\n",
+                   __func__, rtq_timeout);
+       }
+       bzero(&arg, sizeof(arg));
         arg.rnh = rnh;
-       arg.nextstop = time_second + rtq_timeout;
-       arg.draining = arg.updating = 0;
-       s = splnet();
+       arg.nextstop = timenow + rtq_timeout;
         rnh->rnh_walktree(rnh, in_rtqkill, &arg);
-       splx(s);
-
+       if (verbose) {
+               log(LOG_DEBUG, "%s: found %u, killed %u\n", __func__,
+                   arg.found, arg.killed);
+       }
         /*
          * Attempt to be somewhat dynamic about this:
          * If there are ``too many'' routes sitting around taking up space,
@@ -352,45 +606,78 @@ in_rtqtimo(void *rock)
          * than once in rtq_timeout seconds, to keep from cranking down too
          * hard.
          */
-       if((arg.found - arg.killed > rtq_toomany)
-          && (time_second - last_adjusted_timeout >= rtq_timeout)
-          && rtq_reallyold > rtq_minreallyold) {
-               rtq_reallyold = 2*rtq_reallyold / 3;
-               if(rtq_reallyold < rtq_minreallyold) {
+       ours = (arg.found - arg.killed);
+       if (ours > rtq_toomany &&
+           ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout) &&
+           rtq_reallyold > rtq_minreallyold) {
+               rtq_reallyold = 2 * rtq_reallyold / 3;
+               if (rtq_reallyold < rtq_minreallyold) {
                         rtq_reallyold = rtq_minreallyold;
                 }
  
-               last_adjusted_timeout = time_second;
-#if DIAGNOSTIC
-               log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
-                   rtq_reallyold);
-#endif
+               last_adjusted_timeout = timenow;
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: adjusted rtq_reallyold to %d "
+                           "seconds\n", __func__, rtq_reallyold);
+               }
                 arg.found = arg.killed = 0;
                 arg.updating = 1;
-               s = splnet();
                 rnh->rnh_walktree(rnh, in_rtqkill, &arg);
-               splx(s);
         }
  
         atv.tv_usec = 0;
-       atv.tv_sec = arg.nextstop - time_second;
-       timeout(in_rtqtimo_funnel, rock, tvtohz(&atv));
+       atv.tv_sec = arg.nextstop - timenow;
+       /* re-arm the timer only if there's work to do */
+       in_rtqtimo_run = 0;
+       if (ours > 0) {
+               in_sched_rtqtimo(&atv);
+       } else if (verbose) {
+               log(LOG_DEBUG, "%s: not rescheduling timer\n", __func__);
+       }
+       lck_mtx_unlock(rnh_lock);
+}
+
+static void
+in_sched_rtqtimo(struct timeval *atv)
+{
+       LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
+       if (!in_rtqtimo_run) {  /* skip if a run is already scheduled */
+               struct timeval tv;
+
+               if (atv == NULL) {      /* default: rtq_timeout / 10, min 1s */
+                       tv.tv_usec = 0;
+                       tv.tv_sec = MAX(rtq_timeout / 10, 1);
+                       atv = &tv;
+               }
+               if (rt_verbose > 1) {
+                       log(LOG_DEBUG, "%s: timer scheduled in "
+                           "T+%llus.%lluu\n", __func__,
+                           (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec);
+               }
+               in_rtqtimo_run = 1;     /* cleared again by in_rtqtimo() */
+               timeout(in_rtqtimo, NULL, tvtohz(atv));
+       }
+}
  
  void
  in_rtqdrain(void)
  {
-       struct radix_node_head *rnh = rt_tables[AF_INET];
+       struct radix_node_head *rnh;
         struct rtqk_arg arg;
-       int s;
-       arg.found = arg.killed = 0;
+
+       if (rt_verbose > 1) {
+               log(LOG_DEBUG, "%s: draining routes\n", __func__);
+       }
+
+       lck_mtx_lock(rnh_lock); /* in_rtqkill() asserts rnh_lock is owned */
+       rnh = rt_tables[AF_INET];
+       VERIFY(rnh != NULL);
+       bzero(&arg, sizeof(arg));       /* counters, updating, nextstop = 0 */
         arg.rnh = rnh;
-       arg.nextstop = 0;
         arg.draining = 1;
-       arg.updating = 0;
-       s = splnet();
         rnh->rnh_walktree(rnh, in_rtqkill, &arg);
-       splx(s);
+       lck_mtx_unlock(rnh_lock);
  }
  
  /*
@@ -401,26 +688,31 @@ in_inithead(void **head, int off)
  {
         struct radix_node_head *rnh;
  
-#ifdef __APPLE__
-       if (*head)
-               return 1;
-#endif
+       /* If called from route_init(), make sure it is exactly once */
+       VERIFY(head != (void **)&rt_tables[AF_INET] || *head == NULL);  /* real table must be unset */
  
-       if(!rn_inithead(head, off))
+       if (!rn_inithead(head, off)) {  /* rn_inithead() returned 0: report 0 too */
                 return 0;
+       }
  
-       if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */
-               return 1;       /* only do this for the real routing table */
-
+       /*
+        * We can get here from nfs_subs.c as well, in which case this
+        * won't be for the real routing table and thus we're done;
+        * this also takes care of the case when we're called more than
+        * once from anywhere but route_init().
+        */
+       if (head != (void **)&rt_tables[AF_INET]) {
+               return 1;     /* only do this for the real routing table */
+       }
         rnh = *head;
         rnh->rnh_addaddr = in_addroute;
+       rnh->rnh_deladdr = in_deleteroute;      /* in_* hooks go on the real table only */
         rnh->rnh_matchaddr = in_matroute;
+       rnh->rnh_matchaddr_args = in_matroute_args;
         rnh->rnh_close = in_clsroute;
-       in_rtqtimo(rnh);        /* kick off timeout first time */
         return 1;
  }
  
-\f
  /*
   * This zaps old routes when the interface goes down or interface
   * address is deleted.  In the latter case, it deletes static routes
@@ -439,26 +731,56 @@ struct in_ifadown_arg {
  static int
  in_ifadownkill(struct radix_node *rn, void *xap)
  {
+       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];    /* filled by rt_str() */
         struct in_ifadown_arg *ap = xap;
         struct rtentry *rt = (struct rtentry *)rn;
+       boolean_t verbose = (rt_verbose != 0);
         int err;
  
+       LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);
+       /* Delete rt if it uses ap->ifa and (ap->del is set or rt is not RTF_STATIC). */
+       RT_LOCK(rt);
         if (rt->rt_ifa == ap->ifa &&
             (ap->del || !(rt->rt_flags & RTF_STATIC))) {
+               rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf));     /* done even if !verbose */
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: deleting route to %s->%s->%s, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
+               RT_ADDREF_LOCKED(rt);   /* for us to free below */
                 /*
                  * We need to disable the automatic prune that happens
                  * in this case in rtrequest() because it will blow
                  * away the pointers that rn_walktree() needs in order
                  * continue our descent.  We will end up deleting all
                  * the routes that rtrequest() would have in any case,
-                * so that behavior is not needed there.
+                * so that behavior is not needed there.  Safe to drop
+                * rt_lock and use rt_key, rt_gateway, since holding
+                * rnh_lock here prevents another thread from calling
+                * rt_setgate() on this route.
                  */
                 rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING);
-               err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt),
-                               rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
-               if (err) {
-                       log(LOG_WARNING, "in_ifadownkill: error %d\n", err);
+               RT_UNLOCK(rt);
+               err = rtrequest_locked(RTM_DELETE, rt_key(rt),
+                   rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
+               if (err != 0) {
+                       RT_LOCK(rt);    /* re-lock rt while reading it for the log */
+                       if (!verbose) {
+                               rt_str(rt, dbuf, sizeof(dbuf),
+                                   gbuf, sizeof(gbuf));
+                       }       /* NOTE(review): rt_str() already ran above; redundant? */
+                       log(LOG_ERR, "%s: error deleting route to "
+                           "%s->%s->%s, flags=%b, err=%d\n", __func__,
+                           dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                           rt->rt_ifp->if_xname : "", rt->rt_flags,
+                           RTF_BITS, err);
+                       RT_UNLOCK(rt);
                 }
+               rtfree_locked(rt);      /* drops the RT_ADDREF_LOCKED reference */
+       } else {
+               RT_UNLOCK(rt);  /* no match: leave this route alone */
         }
         return 0;
  }
@@ -469,13 +791,26 @@ in_ifadown(struct ifaddr *ifa, int delete)
         struct in_ifadown_arg arg;
         struct radix_node_head *rnh;
  
-       if (ifa->ifa_addr->sa_family != AF_INET)
+       LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED);  /* caller holds rnh_lock */
+
+       /*
+        * Holding rnh_lock here prevents the possibility of
+        * ifa from changing (e.g. in_ifinit), so it is safe
+        * to access its ifa_addr without locking.
+        */
+       if (ifa->ifa_addr->sa_family != AF_INET) {      /* non-IPv4: return 1, no walk */
                 return 1;
+       }
+
+       /* trigger route cache reevaluation */
+       routegenid_inet_update();
  
         arg.rnh = rnh = rt_tables[AF_INET];
         arg.ifa = ifa;
         arg.del = delete;
         rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
+       IFA_LOCK_SPIN(ifa);     /* ifa_flags is modified under the ifa lock */
         ifa->ifa_flags &= ~IFA_ROUTE;
+       IFA_UNLOCK(ifa);
         return 0;
  }