]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/netinet/in_rmx.c
xnu-3789.1.32.tar.gz
[apple/xnu.git] / bsd / netinet / in_rmx.c
index 2d0c2735d10d7fc46b8a703161e6c09c0e43e72c..fa4b580dc6c797229741f71f335c6822019136e7 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
@@ -53,7 +53,6 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/netinet/in_rmx.c,v 1.37.2.1 2001/05/14 08:23:49 ru Exp $
  */
 
 /*
@@ -78,7 +77,7 @@
 #include <sys/protosw.h>
 #include <sys/syslog.h>
 #include <sys/mcache.h>
-#include <kern/lock.h>
+#include <kern/locks.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <netinet/in_arp.h>
 
 extern int tvtohz(struct timeval *);
-extern int     in_inithead(void **head, int off);
 
-#ifdef __APPLE__
-static void in_rtqtimo(void *rock);
-#endif
+static int in_rtqtimo_run;             /* in_rtqtimo is scheduled to run */
+static void in_rtqtimo(void *);
+static void in_sched_rtqtimo(struct timeval *);
 
+static struct radix_node *in_addroute(void *, void *, struct radix_node_head *,
+    struct radix_node *);
+static struct radix_node *in_deleteroute(void *, void *,
+    struct radix_node_head *);
+static struct radix_node *in_matroute(void *, struct radix_node_head *);
 static struct radix_node *in_matroute_args(void *, struct radix_node_head *,
     rn_matchf_t *f, void *);
+static void in_clsroute(struct radix_node *, struct radix_node_head *);
+static int in_rtqkill(struct radix_node *, void *);
+
+static int in_ifadownkill(struct radix_node *, void *);
 
-#define RTPRF_OURS             RTF_PROTO3      /* set on routes we manage */
+#define        RTPRF_OURS              RTF_PROTO3      /* set on routes we manage */
 
 /*
  * Do what we need to do when inserting a route.
  */
 static struct radix_node *
 in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
-           struct radix_node *treenodes)
+    struct radix_node *treenodes)
 {
        struct rtentry *rt = (struct rtentry *)treenodes;
-       struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
+       struct sockaddr_in *sin = (struct sockaddr_in *)(void *)rt_key(rt);
        struct radix_node *ret;
+       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+       uint32_t flags = rt->rt_flags;
+       boolean_t verbose = (rt_verbose > 1);
 
        lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
        RT_LOCK_ASSERT_HELD(rt);
 
+       if (verbose)
+               rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf));
+
        /*
         * For IP, all unicast non-host routes are automatically cloning.
         */
        if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
                rt->rt_flags |= RTF_MULTICAST;
 
-       if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
+       if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST)))
                rt->rt_flags |= RTF_PRCLONING;
-       }
 
        /*
         * A little bit of help for both IP output and input:
@@ -145,21 +157,19 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
                        /* Become a regular mutex */
                        RT_CONVERT_LOCK(rt);
                        IFA_LOCK_SPIN(rt->rt_ifa);
-#define satosin(sa) ((struct sockaddr_in *)sa)
-                       if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr
-                           == sin->sin_addr.s_addr)
+                       if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
+                           sin->sin_addr.s_addr)
                                rt->rt_flags |= RTF_LOCAL;
-#undef satosin
                        IFA_UNLOCK(rt->rt_ifa);
                }
        }
 
-       if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) 
-           && rt->rt_ifp)
+       if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
+           rt->rt_ifp)
                rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
 
        ret = rn_addroute(v_arg, n_arg, head, treenodes);
-       if (ret == NULL && rt->rt_flags & RTF_HOST) {
+       if (ret == NULL && (rt->rt_flags & RTF_HOST)) {
                struct rtentry *rt2;
                /*
                 * We are trying to add a host route, but can't.
@@ -168,12 +178,30 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
                 */
                rt2 = rtalloc1_scoped_locked(rt_key(rt), 0,
                    RTF_CLONING | RTF_PRCLONING, sin_get_ifscope(rt_key(rt)));
-               if (rt2) {
+               if (rt2 != NULL) {
+                       char dbufc[MAX_IPv4_STR_LEN];
+
                        RT_LOCK(rt2);
+                       if (verbose)
+                               rt_str(rt2, dbufc, sizeof (dbufc), NULL, 0);
+
                        if ((rt2->rt_flags & RTF_LLINFO) &&
                            (rt2->rt_flags & RTF_HOST) &&
                            rt2->rt_gateway != NULL &&
                            rt2->rt_gateway->sa_family == AF_LINK) {
+                               if (verbose) {
+                                       log(LOG_DEBUG, "%s: unable to insert "
+                                           "route to %s;%s, flags=%b, due to "
+                                           "existing ARP route %s->%s "
+                                           "flags=%b, attempting to delete\n",
+                                           __func__, dbuf,
+                                           (rt->rt_ifp != NULL) ?
+                                           rt->rt_ifp->if_xname : "",
+                                           rt->rt_flags, RTF_BITS, dbufc,
+                                           (rt2->rt_ifp != NULL) ?
+                                           rt2->rt_ifp->if_xname : "",
+                                           rt2->rt_flags, RTF_BITS);
+                               }
                                /*
                                 * Safe to drop rt_lock and use rt_key,
                                 * rt_gateway, since holding rnh_lock here
@@ -181,18 +209,64 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
                                 * rt_setgate() on this route.
                                 */
                                RT_UNLOCK(rt2);
-                               rtrequest_locked(RTM_DELETE, rt_key(rt2),
+                               (void) rtrequest_locked(RTM_DELETE, rt_key(rt2),
                                    rt2->rt_gateway, rt_mask(rt2),
-                                   rt2->rt_flags, 0);
+                                   rt2->rt_flags, NULL);
                                ret = rn_addroute(v_arg, n_arg, head,
-                                       treenodes);
+                                   treenodes);
                        } else {
                                RT_UNLOCK(rt2);
                        }
                        rtfree_locked(rt2);
                }
        }
-       return ret;
+
+       if (!verbose)
+               goto done;
+
+       if (ret != NULL) {
+               if (flags != rt->rt_flags) {
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s inserted, "
+                           "oflags=%b, flags=%b\n", __func__,
+                           dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                           rt->rt_ifp->if_xname : "", flags, RTF_BITS,
+                           rt->rt_flags, RTF_BITS);
+               } else {
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s inserted, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
+       } else {
+               log(LOG_DEBUG, "%s: unable to insert route to %s->%s->%s, "
+                   "flags=%b, already exists\n", __func__, dbuf, gbuf,
+                   (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                   rt->rt_flags, RTF_BITS);
+       }
+done:
+       return (ret);
+}
+
+static struct radix_node *
+in_deleteroute(void *v_arg, void *netmask_arg, struct radix_node_head *head)
+{
+       struct radix_node *rn;
+
+       lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
+       rn = rn_delete(v_arg, netmask_arg, head);
+       if (rt_verbose > 1 && rn != NULL) {
+               char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+               struct rtentry *rt = (struct rtentry *)rn;
+
+               RT_LOCK(rt);
+               rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf));
+               log(LOG_DEBUG, "%s: route to %s->%s->%s deleted, "
+                   "flags=%b\n", __func__, dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                   rt->rt_ifp->if_xname : "", rt->rt_flags, RTF_BITS);
+               RT_UNLOCK(rt);
+       }
+       return (rn);
 }
 
 /*
@@ -207,15 +281,24 @@ in_validate(struct radix_node *rn)
 
        /* This is first reference? */
        if (rt->rt_refcnt == 0) {
+               if (rt_verbose > 2) {
+                       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+
+                       rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf));
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s validated, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
+
+               /*
+                * It's one of ours; unexpire it.  If the timer is already
+                * scheduled, let it run later as it won't re-arm itself
+                * if there's nothing to do.
+                */
                if (rt->rt_flags & RTPRF_OURS) {
-                       /* It's one of ours; unexpire it */
                        rt->rt_flags &= ~RTPRF_OURS;
                        rt_setexpire(rt, 0);
-               } else if ((rt->rt_flags & RTF_LLINFO) &&
-                   (rt->rt_flags & RTF_HOST) && rt->rt_gateway != NULL &&
-                   rt->rt_gateway->sa_family == AF_LINK) {
-                       /* It's ARP; let it be handled there */
-                       arp_validate(rt);
                }
        }
        return (rn);
@@ -249,53 +332,35 @@ in_matroute_args(void *v_arg, struct radix_node_head *head,
        return (rn);
 }
 
-static int rtq_reallyold = 60*60;
-       /* one hour is ``really old'' */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW | CTLFLAG_LOCKED, 
-    &rtq_reallyold , 0, 
-    "Default expiration time on dynamically learned routes");
-                                  
-static int rtq_minreallyold = 10;
-       /* never automatically crank down to less */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW | CTLFLAG_LOCKED, 
-    &rtq_minreallyold , 0, 
-    "Minimum time to attempt to hold onto dynamically learned routes");
-                                  
-static int rtq_toomany = 128;
-       /* 128 cached routes is ``too many'' */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW | CTLFLAG_LOCKED, 
-    &rtq_toomany , 0, "Upper limit on dynamically learned routes");
-
-#ifdef __APPLE__
-/* XXX LD11JUL02 Special case for AOL 5.1.2 connectivity issue to AirPort BS (Radar 2969954)
- * AOL is adding a circular route ("10.0.1.1/32 10.0.1.1") when establishing its ppp tunnel
- * to the AP BaseStation by removing the default gateway and replacing it with their tunnel entry point.
- * There is no apparent reason to add this route as there is a valid 10.0.1.1/24 route to the BS.
- * That circular route was ignored on previous version of MacOS X because of a routing bug
- * corrected with the merge to FreeBSD4.4 (a route generated from an RTF_CLONING route had the RTF_WASCLONED
- * flag set but did not have a reference to the parent route) and that entry was left in the RT. This workaround is
- * made in order to provide binary compatibility with AOL. 
- * If we catch a process adding a circular route with a /32 from the routing socket, we error it out instead of
- * confusing the routing table with a wrong route to the previous default gateway
- * If for some reason a circular route is needed, turn this sysctl (net.inet.ip.check_route_selfref) to zero.
- */
-int check_routeselfref = 1;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW | CTLFLAG_LOCKED,
-    &check_routeselfref , 0, "");
-#endif
+/* one hour is ``really old'' */
+static uint32_t rtq_reallyold = 60*60;
+SYSCTL_UINT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire,
+       CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_reallyold, 0,
+       "Default expiration time on dynamically learned routes");
 
-int use_routegenid = 1;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW | CTLFLAG_LOCKED,
-    &use_routegenid , 0, "");
+/* never automatically crank down to less */
+static uint32_t rtq_minreallyold = 10;
+SYSCTL_UINT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire,
+       CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_minreallyold, 0,
+       "Minimum time to attempt to hold onto dynamically learned routes");
+
+/* 128 cached routes is ``too many'' */
+static uint32_t rtq_toomany = 128;
+SYSCTL_UINT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache,
+       CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_toomany, 0,
+       "Upper limit on dynamically learned routes");
 
 /*
  * On last reference drop, mark the route as belong to us so that it can be
  * timed out.
  */
 static void
-in_clsroute(struct radix_node *rn, __unused struct radix_node_head *head)
+in_clsroute(struct radix_node *rn, struct radix_node_head *head)
 {
+#pragma unused(head)
+       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
        struct rtentry *rt = (struct rtentry *)rn;
+       boolean_t verbose = (rt_verbose > 1);
 
        lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
        RT_LOCK_ASSERT_HELD(rt);
@@ -306,15 +371,29 @@ in_clsroute(struct radix_node *rn, __unused struct radix_node_head *head)
        if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
                return;
 
-       if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED)
+       if (rt->rt_flags & RTPRF_OURS)
+               return;
+
+       if (!(rt->rt_flags & (RTF_WASCLONED | RTF_DYNAMIC)))
                return;
 
+       if (verbose)
+               rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf));
+
        /*
         * Delete the route immediately if RTF_DELCLONE is set or
         * if route caching is disabled (rtq_reallyold set to 0).
         * Otherwise, let it expire and be deleted by in_rtqkill().
         */
        if ((rt->rt_flags & RTF_DELCLONE) || rtq_reallyold == 0) {
+               int err;
+
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: deleting route to %s->%s->%s, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
                /*
                 * Delete the route from the radix tree but since we are
                 * called when the route's reference count is 0, don't
@@ -325,30 +404,48 @@ in_clsroute(struct radix_node *rn, __unused struct radix_node_head *head)
                 * calling rt_setgate() on this route.
                 */
                RT_UNLOCK(rt);
-               if (rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt),
-                   rt->rt_gateway, rt_mask(rt), rt->rt_flags, &rt) == 0) {
+               err = rtrequest_locked(RTM_DELETE, rt_key(rt),
+                   rt->rt_gateway, rt_mask(rt), rt->rt_flags, &rt);
+               if (err == 0) {
                        /* Now let the caller free it */
                        RT_LOCK(rt);
                        RT_REMREF_LOCKED(rt);
                } else {
                        RT_LOCK(rt);
+                       if (!verbose)
+                               rt_str(rt, dbuf, sizeof (dbuf),
+                                   gbuf, sizeof (gbuf));
+                       log(LOG_ERR, "%s: error deleting route to "
+                           "%s->%s->%s, flags=%b, err=%d\n", __func__,
+                           dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                           rt->rt_ifp->if_xname : "", rt->rt_flags,
+                           RTF_BITS, err);
                }
        } else {
                uint64_t timenow;
 
                timenow = net_uptime();
                rt->rt_flags |= RTPRF_OURS;
-               rt_setexpire(rt,
-                   rt_expiry(rt, timenow, rtq_reallyold));
+               rt_setexpire(rt, timenow + rtq_reallyold);
+
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: route to %s->%s->%s invalidated, "
+                           "flags=%b, expire=T+%u\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS, rt->rt_expire - timenow);
+               }
+
+               /* We have at least one entry; arm the timer if not already */
+               in_sched_rtqtimo(NULL);
        }
 }
 
 struct rtqk_arg {
        struct radix_node_head *rnh;
-       int draining;
-       int killed;
-       int found;
        int updating;
+       int draining;
+       uint32_t killed;
+       uint32_t found;
        uint64_t nextstop;
 };
 
@@ -362,22 +459,38 @@ in_rtqkill(struct radix_node *rn, void *rock)
 {
        struct rtqk_arg *ap = rock;
        struct rtentry *rt = (struct rtentry *)rn;
-       int err;
+       boolean_t verbose = (rt_verbose > 1);
        uint64_t timenow;
+       int err;
 
        timenow = net_uptime();
        lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
 
        RT_LOCK(rt);
        if (rt->rt_flags & RTPRF_OURS) {
-               ap->found++;
+               char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
+
+               if (verbose)
+                       rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf));
 
+               ap->found++;
                VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0);
                VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0);
                if (ap->draining || rt->rt_expire <= timenow) {
-                       if (rt->rt_refcnt > 0)
-                               panic("rtqkill route really not free");
-
+                       if (rt->rt_refcnt > 0) {
+                               panic("%s: route %p marked with RTPRF_OURS "
+                                   "with non-zero refcnt (%u)", __func__,
+                                   rt, rt->rt_refcnt);
+                               /* NOTREACHED */
+                       }
+                       if (verbose) {
+                               log(LOG_DEBUG, "%s: deleting route to "
+                                   "%s->%s->%s, flags=%b, draining=%d\n",
+                                   __func__, dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                                   rt->rt_ifp->if_xname : "", rt->rt_flags,
+                                   RTF_BITS, ap->draining);
+                       }
+                       RT_ADDREF_LOCKED(rt);   /* for us to free below */
                        /*
                         * Delete this route since we're done with it;
                         * the route may be freed afterwards, so we
@@ -389,58 +502,81 @@ in_rtqkill(struct radix_node *rn, void *rock)
                         */
                        RT_UNLOCK(rt);
                        err = rtrequest_locked(RTM_DELETE, rt_key(rt),
-                           rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
-                       if (err) {
-                               log(LOG_WARNING, "in_rtqkill: error %d\n", err);
+                           rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
+                       if (err != 0) {
+                               RT_LOCK(rt);
+                               if (!verbose)
+                                       rt_str(rt, dbuf, sizeof (dbuf),
+                                           gbuf, sizeof (gbuf));
+                               log(LOG_ERR, "%s: error deleting route to "
+                                   "%s->%s->%s, flags=%b, err=%d\n", __func__,
+                                   dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                                   rt->rt_ifp->if_xname : "", rt->rt_flags,
+                                   RTF_BITS, err);
+                               RT_UNLOCK(rt);
                        } else {
                                ap->killed++;
                        }
+                       rtfree_locked(rt);
                } else {
-                       if (ap->updating &&
-                           (rt->rt_expire - timenow) >
-                           rt_expiry(rt, 0, rtq_reallyold)) {
-                               rt_setexpire(rt, rt_expiry(rt,
-                                   timenow, rtq_reallyold));
+                       uint64_t expire = (rt->rt_expire - timenow);
+
+                       if (ap->updating && expire > rtq_reallyold) {
+                               rt_setexpire(rt, timenow + rtq_reallyold);
+                               if (verbose) {
+                                       log(LOG_DEBUG, "%s: route to "
+                                           "%s->%s->%s, flags=%b, adjusted "
+                                           "expire=T+%u (was T+%u)\n",
+                                           __func__, dbuf, gbuf,
+                                           (rt->rt_ifp != NULL) ?
+                                           rt->rt_ifp->if_xname : "",
+                                           rt->rt_flags, RTF_BITS,
+                                           (rt->rt_expire - timenow), expire);
+                               }
                        }
-                       ap->nextstop = lmin(ap->nextstop,
-                                           rt->rt_expire);
+                       ap->nextstop = lmin(ap->nextstop, rt->rt_expire);
                        RT_UNLOCK(rt);
                }
        } else {
                RT_UNLOCK(rt);
        }
 
-       return 0;
+       return (0);
 }
 
-static void
-in_rtqtimo_funnel(void *rock)
-{
-        in_rtqtimo(rock);
-
-}
-#define RTQ_TIMEOUT    60*10   /* run no less than once every ten minutes */
+#define        RTQ_TIMEOUT     60*10   /* run no less than once every ten minutes */
 static int rtq_timeout = RTQ_TIMEOUT;
 
 static void
-in_rtqtimo(void *rock)
+in_rtqtimo(void *targ)
 {
-       struct radix_node_head *rnh = rock;
+#pragma unused(targ)
+       struct radix_node_head *rnh;
        struct rtqk_arg arg;
        struct timeval atv;
        static uint64_t last_adjusted_timeout = 0;
+       boolean_t verbose = (rt_verbose > 1);
        uint64_t timenow;
+       uint32_t ours;
 
        lck_mtx_lock(rnh_lock);
-       /* Get the timestamp after we acquire the lock for better accuracy */
-        timenow = net_uptime();
+       rnh = rt_tables[AF_INET];
+       VERIFY(rnh != NULL);
 
-       arg.found = arg.killed = 0;
+       /* Get the timestamp after we acquire the lock for better accuracy */
+       timenow = net_uptime();
+       if (verbose) {
+               log(LOG_DEBUG, "%s: initial nextstop is T+%u seconds\n",
+                   __func__, rtq_timeout);
+       }
+       bzero(&arg, sizeof (arg));
        arg.rnh = rnh;
        arg.nextstop = timenow + rtq_timeout;
-       arg.draining = arg.updating = 0;
        rnh->rnh_walktree(rnh, in_rtqkill, &arg);
-
+       if (verbose) {
+               log(LOG_DEBUG, "%s: found %u, killed %u\n", __func__,
+                   arg.found, arg.killed);
+       }
        /*
         * Attempt to be somewhat dynamic about this:
         * If there are ``too many'' routes sitting around taking up space,
@@ -449,19 +585,19 @@ in_rtqtimo(void *rock)
         * than once in rtq_timeout seconds, to keep from cranking down too
         * hard.
         */
-       if((arg.found - arg.killed > rtq_toomany)
-          && ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout)
-          && rtq_reallyold > rtq_minreallyold) {
-               rtq_reallyold = 2*rtq_reallyold / 3;
-               if(rtq_reallyold < rtq_minreallyold) {
+       ours = (arg.found - arg.killed);
+       if (ours > rtq_toomany &&
+           ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout) &&
+           rtq_reallyold > rtq_minreallyold) {
+               rtq_reallyold = 2 * rtq_reallyold / 3;
+               if (rtq_reallyold < rtq_minreallyold)
                        rtq_reallyold = rtq_minreallyold;
-               }
 
                last_adjusted_timeout = timenow;
-#if DIAGNOSTIC
-               log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
-                   rtq_reallyold);
-#endif
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: adjusted rtq_reallyold to %d "
+                           "seconds\n", __func__, rtq_reallyold);
+               }
                arg.found = arg.killed = 0;
                arg.updating = 1;
                rnh->rnh_walktree(rnh, in_rtqkill, &arg);
@@ -469,21 +605,53 @@ in_rtqtimo(void *rock)
 
        atv.tv_usec = 0;
        atv.tv_sec = arg.nextstop - timenow;
+       /* re-arm the timer only if there's work to do */
+       in_rtqtimo_run = 0;
+       if (ours > 0)
+               in_sched_rtqtimo(&atv);
+       else if (verbose)
+               log(LOG_DEBUG, "%s: not rescheduling timer\n", __func__);
        lck_mtx_unlock(rnh_lock);
-       timeout(in_rtqtimo_funnel, rock, tvtohz(&atv));
+}
+
+static void
+in_sched_rtqtimo(struct timeval *atv)
+{
+       lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
+       if (!in_rtqtimo_run) {
+               struct timeval tv;
+
+               if (atv == NULL) {
+                       tv.tv_usec = 0;
+                       tv.tv_sec = MAX(rtq_timeout / 10, 1);
+                       atv = &tv;
+               }
+               if (rt_verbose > 1) {
+                       log(LOG_DEBUG, "%s: timer scheduled in "
+                           "T+%llus.%lluu\n", __func__,
+                           (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec);
+               }
+               in_rtqtimo_run = 1;
+               timeout(in_rtqtimo, NULL, tvtohz(atv));
+       }
 }
 
 void
 in_rtqdrain(void)
 {
-       struct radix_node_head *rnh = rt_tables[AF_INET];
+       struct radix_node_head *rnh;
        struct rtqk_arg arg;
-       arg.found = arg.killed = 0;
+
+       if (rt_verbose > 1)
+               log(LOG_DEBUG, "%s: draining routes\n", __func__);
+
+       lck_mtx_lock(rnh_lock);
+       rnh = rt_tables[AF_INET];
+       VERIFY(rnh != NULL);
+       bzero(&arg, sizeof (arg));
        arg.rnh = rnh;
-       arg.nextstop = 0;
        arg.draining = 1;
-       arg.updating = 0;
-       lck_mtx_lock(rnh_lock);
        rnh->rnh_walktree(rnh, in_rtqkill, &arg);
        lck_mtx_unlock(rnh_lock);
 }
@@ -496,27 +664,30 @@ in_inithead(void **head, int off)
 {
        struct radix_node_head *rnh;
 
-#ifdef __APPLE__
-       if (*head)
-               return 1;
-#endif
+       /* If called from route_init(), make sure it is exactly once */
+       VERIFY(head != (void **)&rt_tables[AF_INET] || *head == NULL);
 
-       if(!rn_inithead(head, off))
-               return 0;
+       if (!rn_inithead(head, off))
+               return (0);
 
-       if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */
-               return 1;       /* only do this for the real routing table */
+       /*
+        * We can get here from nfs_subs.c as well, in which case this
+        * won't be for the real routing table and thus we're done;
+        * this also takes care of the case when we're called more than
+        * once from anywhere but route_init().
+        */
+       if (head != (void **)&rt_tables[AF_INET])
+               return (1);     /* only do this for the real routing table */
 
        rnh = *head;
        rnh->rnh_addaddr = in_addroute;
+       rnh->rnh_deladdr = in_deleteroute;
        rnh->rnh_matchaddr = in_matroute;
        rnh->rnh_matchaddr_args = in_matroute_args;
        rnh->rnh_close = in_clsroute;
-       in_rtqtimo(rnh);        /* kick off timeout first time */
-       return 1;
+       return (1);
 }
 
-\f
 /*
  * This zaps old routes when the interface goes down or interface
  * address is deleted.  In the latter case, it deletes static routes
@@ -535,13 +706,25 @@ struct in_ifadown_arg {
 static int
 in_ifadownkill(struct radix_node *rn, void *xap)
 {
+       char dbuf[MAX_IPv4_STR_LEN], gbuf[MAX_IPv4_STR_LEN];
        struct in_ifadown_arg *ap = xap;
        struct rtentry *rt = (struct rtentry *)rn;
+       boolean_t verbose = (rt_verbose != 0);
        int err;
 
+       lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
        RT_LOCK(rt);
        if (rt->rt_ifa == ap->ifa &&
            (ap->del || !(rt->rt_flags & RTF_STATIC))) {
+               rt_str(rt, dbuf, sizeof (dbuf), gbuf, sizeof (gbuf));
+               if (verbose) {
+                       log(LOG_DEBUG, "%s: deleting route to %s->%s->%s, "
+                           "flags=%b\n", __func__, dbuf, gbuf,
+                           (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "",
+                           rt->rt_flags, RTF_BITS);
+               }
+               RT_ADDREF_LOCKED(rt);   /* for us to free below */
                /*
                 * We need to disable the automatic prune that happens
                 * in this case in rtrequest() because it will blow
@@ -556,14 +739,24 @@ in_ifadownkill(struct radix_node *rn, void *xap)
                rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING);
                RT_UNLOCK(rt);
                err = rtrequest_locked(RTM_DELETE, rt_key(rt),
-                   rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
-               if (err) {
-                       log(LOG_WARNING, "in_ifadownkill: error %d\n", err);
+                   rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
+               if (err != 0) {
+                       RT_LOCK(rt);
+                       if (!verbose)
+                               rt_str(rt, dbuf, sizeof (dbuf),
+                                   gbuf, sizeof (gbuf));
+                       log(LOG_ERR, "%s: error deleting route to "
+                           "%s->%s->%s, flags=%b, err=%d\n", __func__,
+                           dbuf, gbuf, (rt->rt_ifp != NULL) ?
+                           rt->rt_ifp->if_xname : "", rt->rt_flags,
+                           RTF_BITS, err);
+                       RT_UNLOCK(rt);
                }
+               rtfree_locked(rt);
        } else {
                RT_UNLOCK(rt);
        }
-       return 0;
+       return (0);
 }
 
 int
@@ -583,8 +776,7 @@ in_ifadown(struct ifaddr *ifa, int delete)
                return (1);
 
        /* trigger route cache reevaluation */
-       if (use_routegenid)
-               routegenid_update();
+       routegenid_inet_update();
 
        arg.rnh = rnh = rt_tables[AF_INET];
        arg.ifa = ifa;