X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/55e303ae13a4cf49d70f2294092726f2fffb9ef2..5ba3f43ea354af8ad55bea84372a2bc834d8757c:/bsd/net/rtsock.c?ds=inline diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index ed72eaf27..457eaf4eb 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -1,17 +1,20 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. - * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -19,8 +22,8 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * Copyright (c) 1988, 1991, 1993 @@ -57,9 +60,9 @@ * @(#)rtsock.c 8.5 (Berkeley) 11/2/94 */ - #include #include +#include #include #include #include @@ -70,18 +73,36 @@ #include #include #include +#include +#include +#include #include #include +#include #include #include +#include +#include +#include + +extern struct rtstat rtstat; +extern struct domain routedomain_s; +static struct domain *routedomain = NULL; MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); -static struct sockaddr route_dst = { 2, PF_ROUTE, }; -static struct sockaddr route_src = { 2, PF_ROUTE, }; -static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; -static struct sockproto route_proto = { PF_ROUTE, }; +static struct sockaddr route_dst = { 2, PF_ROUTE, { 0, } }; +static struct sockaddr route_src = { 2, PF_ROUTE, { 0, } }; +static struct sockaddr sa_zero = { sizeof (sa_zero), AF_INET, { 0, } }; + +struct route_cb { + u_int32_t ip_count; /* attached w/ AF_INET */ + u_int32_t ip6_count; /* attached w/ AF_INET6 */ + u_int32_t any_count; /* total attached */ +}; + +static struct route_cb route_cb; struct walkarg { int w_tmemsize; @@ -90,26 +111,52 @@ struct walkarg { struct sysctl_req *w_req; }; -static struct mbuf * - rt_msg1 __P((int, struct rt_addrinfo *)); -static int rt_msg2 __P((int, - struct rt_addrinfo *, caddr_t, struct walkarg *)); -static int rt_xaddrs __P((caddr_t, caddr_t, struct rt_addrinfo *)); -static int sysctl_dumpentry __P((struct radix_node *rn, void *vw)); -static int sysctl_iflist __P((int af, struct walkarg *w)); -static int route_output __P((struct mbuf *, struct socket *)); -static void rt_setmetrics __P((u_long, struct rt_metrics *, struct rt_metrics *)); -static void rt_setif __P((struct rtentry *, struct sockaddr *, struct sockaddr *, - struct sockaddr *)); - -/* Sleazy use of local variables throughout file, warning!!!! */ -#define dst info.rti_info[RTAX_DST] -#define gate info.rti_info[RTAX_GATEWAY] -#define netmask info.rti_info[RTAX_NETMASK] -#define genmask info.rti_info[RTAX_GENMASK] -#define ifpaddr info.rti_info[RTAX_IFP] -#define ifaaddr info.rti_info[RTAX_IFA] -#define brdaddr info.rti_info[RTAX_BRD] +static void route_dinit(struct domain *); +static int rts_abort(struct socket *); +static int rts_attach(struct socket *, int, struct proc *); +static int rts_bind(struct socket *, struct sockaddr *, struct proc *); +static int rts_connect(struct socket *, struct sockaddr *, struct proc *); +static int rts_detach(struct socket *); +static int rts_disconnect(struct socket *); +static int rts_peeraddr(struct socket *, struct sockaddr **); +static int rts_send(struct socket *, int, struct mbuf *, struct sockaddr *, + struct mbuf *, struct proc *); +static int rts_shutdown(struct socket *); +static int rts_sockaddr(struct socket *, struct sockaddr **); + +static int route_output(struct mbuf *, struct socket *); +static int rt_setmetrics(u_int32_t, struct rt_metrics *, struct rtentry *); +static void rt_getmetrics(struct rtentry *, struct rt_metrics *); +static void rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *, + struct sockaddr *, unsigned int); +static int rt_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *); +static struct mbuf *rt_msg1(int, struct rt_addrinfo *); +static int rt_msg2(int, struct rt_addrinfo *, caddr_t, struct walkarg *, + kauth_cred_t *); +static int sysctl_dumpentry(struct radix_node *rn, void *vw); +static int sysctl_dumpentry_ext(struct radix_node *rn, void *vw); +static int sysctl_iflist(int af, struct walkarg *w); +static int sysctl_iflist2(int af, struct walkarg *w); +static int sysctl_rtstat(struct sysctl_req *); +static int sysctl_rttrash(struct sysctl_req *); +static int sysctl_rtsock SYSCTL_HANDLER_ARGS; + +SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_LOCKED, + sysctl_rtsock, ""); + +SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "routing"); + +/* Align x to 1024 (only power of 2) assuming x is positive */ +#define ALIGN_BYTES(x) do { \ + x = P2ALIGN(x, 1024); \ +} while(0) + +#define ROUNDUP32(a) \ + ((a) > 0 ? (1 + (((a) - 1) | (sizeof (uint32_t) - 1))) : \ + sizeof (uint32_t)) + +#define ADVANCE32(x, n) \ + (x += ROUNDUP32((n)->sa_len)) /* * It really doesn't make any sense at all for this code to share much @@ -118,11 +165,7 @@ static void rt_setif __P((struct rtentry *, struct sockaddr *, struct sockaddr * static int rts_abort(struct socket *so) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_abort(so); - splx(s); - return error; + return (raw_usrreqs.pru_abort(so)); } /* pru_accept is EOPNOTSUPP */ @@ -130,73 +173,53 @@ rts_abort(struct socket *so) static int rts_attach(struct socket *so, int proto, struct proc *p) { +#pragma unused(p) struct rawcb *rp; - int s, error; + int error; - if (sotorawcb(so) != 0) - return EISCONN; /* XXX panic? */ - MALLOC(rp, struct rawcb *, sizeof *rp, M_PCB, M_WAITOK); /* XXX */ - if (rp == 0) - return ENOBUFS; - bzero(rp, sizeof *rp); + VERIFY(so->so_pcb == NULL); + + MALLOC(rp, struct rawcb *, sizeof (*rp), M_PCB, M_WAITOK | M_ZERO); + if (rp == NULL) + return (ENOBUFS); - /* - * The splnet() is necessary to block protocols from sending - * error notifications (like RTM_REDIRECT or RTM_LOSING) while - * this PCB is extant but incompletely initialized. - * Probably we should try to do more of this work beforehand and - * eliminate the spl. - */ - s = splnet(); so->so_pcb = (caddr_t)rp; - error = raw_attach(so, proto); /* don't use raw_usrreqs.pru_attach, it checks for SS_PRIV */ + /* don't use raw_usrreqs.pru_attach, it checks for SS_PRIV */ + error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { - splx(s); FREE(rp, M_PCB); - so->so_pcb = 0; - return error; + so->so_pcb = NULL; + so->so_flags |= SOF_PCBCLEARING; + return (error); } - switch(rp->rcb_proto.sp_protocol) { + + switch (rp->rcb_proto.sp_protocol) { case AF_INET: - route_cb.ip_count++; + atomic_add_32(&route_cb.ip_count, 1); break; case AF_INET6: - route_cb.ip6_count++; - break; - case AF_IPX: - route_cb.ipx_count++; - break; - case AF_NS: - route_cb.ns_count++; + atomic_add_32(&route_cb.ip6_count, 1); break; } rp->rcb_faddr = &route_src; - route_cb.any_count++; + atomic_add_32(&route_cb.any_count, 1); + /* the socket is already locked when we enter rts_attach */ soisconnected(so); so->so_options |= SO_USELOOPBACK; - splx(s); - return 0; + return (0); } static int rts_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_bind(so, nam, p); /* xxx just EINVAL */ - splx(s); - return error; + return (raw_usrreqs.pru_bind(so, nam, p)); /* xxx just EINVAL */ } static int rts_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_connect(so, nam, p); /* XXX just EINVAL */ - splx(s); - return error; + return (raw_usrreqs.pru_connect(so, nam, p)); /* XXX just EINVAL */ } /* pru_connect2 is EOPNOTSUPP */ @@ -206,39 +229,25 @@ static int rts_detach(struct socket *so) { struct rawcb *rp = sotorawcb(so); - int s, error; - s = splnet(); - if (rp != 0) { - switch(rp->rcb_proto.sp_protocol) { - case AF_INET: - route_cb.ip_count--; - break; - case AF_INET6: - route_cb.ip6_count--; - break; - case AF_IPX: - route_cb.ipx_count--; - break; - case AF_NS: - route_cb.ns_count--; - break; - } - route_cb.any_count--; + VERIFY(rp != NULL); + + switch (rp->rcb_proto.sp_protocol) { + case AF_INET: + atomic_add_32(&route_cb.ip_count, -1); + break; + case AF_INET6: + atomic_add_32(&route_cb.ip6_count, -1); + break; } - error = raw_usrreqs.pru_detach(so); - splx(s); - return error; + atomic_add_32(&route_cb.any_count, -1); + return (raw_usrreqs.pru_detach(so)); } static int rts_disconnect(struct socket *so) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_disconnect(so); - splx(s); - return error; + return (raw_usrreqs.pru_disconnect(so)); } /* pru_listen is EOPNOTSUPP */ @@ -246,11 +255,7 @@ rts_disconnect(struct socket *so) static int rts_peeraddr(struct socket *so, struct sockaddr **nam) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_peeraddr(so, nam); - splx(s); - return error; + return (raw_usrreqs.pru_peeraddr(so, nam)); } /* pru_rcvd is EOPNOTSUPP */ @@ -258,13 +263,9 @@ rts_peeraddr(struct socket *so, struct sockaddr **nam) static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, - struct mbuf *control, struct proc *p) + struct mbuf *control, struct proc *p) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_send(so, flags, m, nam, control, p); - splx(s); - return error; + return (raw_usrreqs.pru_send(so, flags, m, nam, control, p)); } /* pru_sense is null */ @@ -272,192 +273,254 @@ rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, static int rts_shutdown(struct socket *so) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_shutdown(so); - splx(s); - return error; + return (raw_usrreqs.pru_shutdown(so)); } static int rts_sockaddr(struct socket *so, struct sockaddr **nam) { - int s, error; - s = splnet(); - error = raw_usrreqs.pru_sockaddr(so, nam); - splx(s); - return error; + return (raw_usrreqs.pru_sockaddr(so, nam)); } static struct pr_usrreqs route_usrreqs = { - rts_abort, pru_accept_notsupp, rts_attach, rts_bind, rts_connect, - pru_connect2_notsupp, pru_control_notsupp, rts_detach, rts_disconnect, - pru_listen_notsupp, rts_peeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, - rts_send, pru_sense_null, rts_shutdown, rts_sockaddr, - sosend, soreceive, sopoll + .pru_abort = rts_abort, + .pru_attach = rts_attach, + .pru_bind = rts_bind, + .pru_connect = rts_connect, + .pru_detach = rts_detach, + .pru_disconnect = rts_disconnect, + .pru_peeraddr = rts_peeraddr, + .pru_send = rts_send, + .pru_shutdown = rts_shutdown, + .pru_sockaddr = rts_sockaddr, + .pru_sosend = sosend, + .pru_soreceive = soreceive, }; /*ARGSUSED*/ static int -route_output(m, so) - register struct mbuf *m; - struct socket *so; +route_output(struct mbuf *m, struct socket *so) { - register struct rt_msghdr *rtm = 0; - register struct rtentry *rt = 0; - struct rtentry *saved_nrt = 0; + struct rt_msghdr *rtm = NULL; + struct rtentry *rt = NULL; + struct rtentry *saved_nrt = NULL; struct radix_node_head *rnh; struct rt_addrinfo info; int len, error = 0; - struct ifnet *ifp = 0; - struct ifaddr *ifa = 0; - struct proc *curproc = current_proc(); + sa_family_t dst_sa_family = 0; + struct ifnet *ifp = NULL; + struct sockaddr_in dst_in, gate_in; int sendonlytoself = 0; - -#define senderr(e) { error = e; goto flush;} - if (m == 0 || ((m->m_len < sizeof(long)) && - (m = m_pullup(m, sizeof(long))) == 0)) + unsigned int ifscope = IFSCOPE_NONE; + struct rawcb *rp = NULL; + boolean_t is_router = FALSE; +#define senderr(e) { error = (e); goto flush; } + if (m == NULL || ((m->m_len < sizeof (intptr_t)) && + (m = m_pullup(m, sizeof (intptr_t))) == NULL)) return (ENOBUFS); - if ((m->m_flags & M_PKTHDR) == 0) - panic("route_output"); + VERIFY(m->m_flags & M_PKTHDR); + + /* + * Unlock the socket (but keep a reference) it won't be + * accessed until raw_input appends to it. + */ + socket_unlock(so, 0); + lck_mtx_lock(rnh_lock); + len = m->m_pkthdr.len; - if (len < sizeof(*rtm) || + if (len < sizeof (*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) { - dst = 0; + info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } R_Malloc(rtm, struct rt_msghdr *, len); - if (rtm == 0) { - dst = 0; + if (rtm == NULL) { + info.rti_info[RTAX_DST] = NULL; senderr(ENOBUFS); } m_copydata(m, 0, len, (caddr_t)rtm); if (rtm->rtm_version != RTM_VERSION) { - dst = 0; + info.rti_info[RTAX_DST] = NULL; senderr(EPROTONOSUPPORT); } - + /* * Silent version of RTM_GET for Reachabiltiy APIs. We may change * all RTM_GETs to be silent in the future, so this is private for now. */ if (rtm->rtm_type == RTM_GET_SILENT) { - if ((so->so_options & SO_USELOOPBACK) == 0) + if (!(so->so_options & SO_USELOOPBACK)) senderr(EINVAL); sendonlytoself = 1; rtm->rtm_type = RTM_GET; } - + /* * Perform permission checking, only privileged sockets * may perform operations other than RTM_GET */ - if (rtm->rtm_type != RTM_GET && (so->so_state & SS_PRIV) == 0) { - dst = 0; + if (rtm->rtm_type != RTM_GET && !(so->so_state & SS_PRIV)) { + info.rti_info[RTAX_DST] = NULL; senderr(EPERM); } - rtm->rtm_pid = curproc->p_pid; + + rtm->rtm_pid = proc_selfpid(); info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { - dst = 0; + info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } - if (dst == 0 || (dst->sa_family >= AF_MAX) - || (gate != 0 && (gate->sa_family >= AF_MAX))) + if (info.rti_info[RTAX_DST] == NULL || + info.rti_info[RTAX_DST]->sa_family >= AF_MAX || + (info.rti_info[RTAX_GATEWAY] != NULL && + info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); - if (genmask) { + + if (info.rti_info[RTAX_DST]->sa_family == AF_INET && + info.rti_info[RTAX_DST]->sa_len != sizeof (dst_in)) { + /* At minimum, we need up to sin_addr */ + if (info.rti_info[RTAX_DST]->sa_len < + offsetof(struct sockaddr_in, sin_zero)) + senderr(EINVAL); + bzero(&dst_in, sizeof (dst_in)); + dst_in.sin_len = sizeof (dst_in); + dst_in.sin_family = AF_INET; + dst_in.sin_port = SIN(info.rti_info[RTAX_DST])->sin_port; + dst_in.sin_addr = SIN(info.rti_info[RTAX_DST])->sin_addr; + info.rti_info[RTAX_DST] = (struct sockaddr *)&dst_in; + dst_sa_family = info.rti_info[RTAX_DST]->sa_family; + } + + if (info.rti_info[RTAX_GATEWAY] != NULL && + info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET && + info.rti_info[RTAX_GATEWAY]->sa_len != sizeof (gate_in)) { + /* At minimum, we need up to sin_addr */ + if (info.rti_info[RTAX_GATEWAY]->sa_len < + offsetof(struct sockaddr_in, sin_zero)) + senderr(EINVAL); + bzero(&gate_in, sizeof (gate_in)); + gate_in.sin_len = sizeof (gate_in); + gate_in.sin_family = AF_INET; + gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port; + gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in; + } + + if (info.rti_info[RTAX_GENMASK]) { struct radix_node *t; - t = rn_addmask((caddr_t)genmask, 0, 1); - if (t && Bcmp(genmask, t->rn_key, *(u_char *)genmask) == 0) - genmask = (struct sockaddr *)(t->rn_key); + t = rn_addmask((caddr_t)info.rti_info[RTAX_GENMASK], 0, 1); + if (t != NULL && Bcmp(info.rti_info[RTAX_GENMASK], + t->rn_key, *(u_char *)info.rti_info[RTAX_GENMASK]) == 0) + info.rti_info[RTAX_GENMASK] = + (struct sockaddr *)(t->rn_key); else senderr(ENOBUFS); } - switch (rtm->rtm_type) { + /* + * If RTF_IFSCOPE flag is set, then rtm_index specifies the scope. + */ + if (rtm->rtm_flags & RTF_IFSCOPE) { + if (info.rti_info[RTAX_DST]->sa_family != AF_INET && + info.rti_info[RTAX_DST]->sa_family != AF_INET6) + senderr(EINVAL); + ifscope = rtm->rtm_index; + } + /* + * Block changes on INTCOPROC interfaces. + */ + if (ifscope) { + unsigned int intcoproc_scope = 0; + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (IFNET_IS_INTCOPROC(ifp)) { + intcoproc_scope = ifp->if_index; + break; + } + } + ifnet_head_done(); + if (intcoproc_scope == ifscope && current_proc()->p_pid != 0) + senderr(EINVAL); + } + + /* + * RTF_PROXY can only be set internally from within the kernel. + */ + if (rtm->rtm_flags & RTF_PROXY) + senderr(EINVAL); + + /* + * For AF_INET, always zero out the embedded scope ID. If this is + * a scoped request, it must be done explicitly by setting RTF_IFSCOPE + * flag and the corresponding rtm_index value. This is to prevent + * false interpretation of the scope ID because it's using the sin_zero + * field, which might not be properly cleared by the requestor. + */ + if (info.rti_info[RTAX_DST]->sa_family == AF_INET) + sin_set_ifscope(info.rti_info[RTAX_DST], IFSCOPE_NONE); + if (info.rti_info[RTAX_GATEWAY] != NULL && + info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET) + sin_set_ifscope(info.rti_info[RTAX_GATEWAY], IFSCOPE_NONE); + + switch (rtm->rtm_type) { case RTM_ADD: - if (gate == 0) + if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); -#ifdef __APPLE__ -/* XXX LD11JUL02 Special case for AOL 5.1.2 connectivity issue to AirPort BS (Radar 2969954) - * AOL is adding a circular route ("10.0.1.1/32 10.0.1.1") when establishing its ppp tunnel - * to the AP BaseStation by removing the default gateway and replacing it with their tunnel entry point. - * There is no apparent reason to add this route as there is a valid 10.0.1.1/24 route to the BS. - * That circular route was ignored on previous version of MacOS X because of a routing bug - * corrected with the merge to FreeBSD4.4 (a route generated from an RTF_CLONING route had the RTF_WASCLONED - * flag set but did not have a reference to the parent route) and that entry was left in the RT. This workaround is - * made in order to provide binary compatibility with AOL. - * If we catch a process adding a circular route with a /32 from the routing socket, we error it out instead of - * confusing the routing table with a wrong route to the previous default gateway - */ -{ - extern int check_routeselfref; -#define satosinaddr(sa) (((struct sockaddr_in *)sa)->sin_addr.s_addr) - - if (check_routeselfref && (dst && dst->sa_family == AF_INET) && - (netmask && satosinaddr(netmask) == INADDR_BROADCAST) && - (gate && satosinaddr(dst) == satosinaddr(gate))) { - log(LOG_WARNING, "route_output: circular route %ld.%ld.%ld.%ld/32 ignored\n", - (ntohl(satosinaddr(gate)>>24))&0xff, - (ntohl(satosinaddr(gate)>>16))&0xff, - (ntohl(satosinaddr(gate)>>8))&0xff, - (ntohl(satosinaddr(gate)))&0xff); - - senderr(EINVAL); - } -} -#endif - error = rtrequest(RTM_ADD, dst, gate, netmask, - rtm->rtm_flags, &saved_nrt); - if (error == 0 && saved_nrt) { -#ifdef __APPLE__ - /* - * If the route request specified an interface with - * IFA and/or IFP, we set the requested interface on - * the route with rt_setif. It would be much better - * to do this inside rtrequest, but that would - * require passing the desired interface, in some - * form, to rtrequest. Since rtrequest is called in - * so many places (roughly 40 in our source), adding - * a parameter is to much for us to swallow; this is - * something for the FreeBSD developers to tackle. - * Instead, we let rtrequest compute whatever - * interface it wants, then come in behind it and - * stick in the interface that we really want. This - * works reasonably well except when rtrequest can't - * figure out what interface to use (with - * ifa_withroute) and returns ENETUNREACH. Ideally - * it shouldn't matter if rtrequest can't figure out - * the interface if we're going to explicitly set it - * ourselves anyway. But practically we can't - * recover here because rtrequest will not do any of - * the work necessary to add the route if it can't - * find an interface. As long as there is a default - * route that leads to some interface, rtrequest will - * find an interface, so this problem should be - * rarely encountered. - * dwiggins@bbn.com - */ - - rt_setif(saved_nrt, ifpaddr, ifaaddr, gate); -#endif - rt_setmetrics(rtm->rtm_inits, - &rtm->rtm_rmx, &saved_nrt->rt_rmx); + error = rtrequest_scoped_locked(RTM_ADD, + info.rti_info[RTAX_DST], info.rti_info[RTAX_GATEWAY], + info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, + ifscope); + if (error == 0 && saved_nrt != NULL) { + RT_LOCK(saved_nrt); + /* + * If the route request specified an interface with + * IFA and/or IFP, we set the requested interface on + * the route with rt_setif. It would be much better + * to do this inside rtrequest, but that would + * require passing the desired interface, in some + * form, to rtrequest. Since rtrequest is called in + * so many places (roughly 40 in our source), adding + * a parameter is to much for us to swallow; this is + * something for the FreeBSD developers to tackle. + * Instead, we let rtrequest compute whatever + * interface it wants, then come in behind it and + * stick in the interface that we really want. This + * works reasonably well except when rtrequest can't + * figure out what interface to use (with + * ifa_withroute) and returns ENETUNREACH. Ideally + * it shouldn't matter if rtrequest can't figure out + * the interface if we're going to explicitly set it + * ourselves anyway. But practically we can't + * recover here because rtrequest will not do any of + * the work necessary to add the route if it can't + * find an interface. As long as there is a default + * route that leads to some interface, rtrequest will + * find an interface, so this problem should be + * rarely encountered. + * dwiggins@bbn.com + */ + rt_setif(saved_nrt, + info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], + info.rti_info[RTAX_GATEWAY], ifscope); + (void)rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, saved_nrt); saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); saved_nrt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); - rtunref(saved_nrt); - saved_nrt->rt_genmask = genmask; + (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK]; + RT_REMREF_LOCKED(saved_nrt); + RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: - error = rtrequest(RTM_DELETE, dst, gate, netmask, - rtm->rtm_flags, &saved_nrt); + error = rtrequest_scoped_locked(RTM_DELETE, + info.rti_info[RTAX_DST], info.rti_info[RTAX_GATEWAY], + info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, + ifscope); if (error == 0) { - if ((rt = saved_nrt)) - rtref(rt); + rt = saved_nrt; + RT_LOCK(rt); goto report; } break; @@ -465,219 +528,479 @@ route_output(m, so) case RTM_GET: case RTM_CHANGE: case RTM_LOCK: - if ((rnh = rt_tables[dst->sa_family]) == 0) { + rnh = rt_tables[info.rti_info[RTAX_DST]->sa_family]; + if (rnh == NULL) senderr(EAFNOSUPPORT); - } else if ((rt = (struct rtentry *) - rnh->rnh_lookup(dst, netmask, rnh)) != NULL) - rtref(rt); - else + /* + * Lookup the best match based on the key-mask pair; + * callee adds a reference and checks for root node. + */ + rt = rt_lookup(TRUE, info.rti_info[RTAX_DST], + info.rti_info[RTAX_NETMASK], rnh, ifscope); + if (rt == NULL) senderr(ESRCH); - switch(rtm->rtm_type) { - - case RTM_GET: - report: - dst = rt_key(rt); - gate = rt->rt_gateway; - netmask = rt_mask(rt); - genmask = rt->rt_genmask; + RT_LOCK(rt); + + /* + * Holding rnh_lock here prevents the possibility of + * ifa from changing (e.g. in_ifinit), so it is safe + * to access its ifa_addr (down below) without locking. + */ + switch (rtm->rtm_type) { + case RTM_GET: { + kauth_cred_t cred; + struct ifaddr *ifa2; +report: + cred = kauth_cred_proc_ref(current_proc()); + ifa2 = NULL; + RT_LOCK_ASSERT_HELD(rt); + info.rti_info[RTAX_DST] = rt_key(rt); + dst_sa_family = info.rti_info[RTAX_DST]->sa_family; + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; - if (ifp) { - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - ifaaddr = rt->rt_ifa->ifa_addr; + if (ifp != NULL) { + ifnet_lock_shared(ifp); + ifa2 = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = + ifa2->ifa_addr; + IFA_ADDREF(ifa2); + ifnet_lock_done(ifp); + info.rti_info[RTAX_IFA] = + rt->rt_ifa->ifa_addr; rtm->rtm_index = ifp->if_index; } else { - ifpaddr = 0; - ifaaddr = 0; - } + info.rti_info[RTAX_IFP] = NULL; + info.rti_info[RTAX_IFA] = NULL; + } + } else if ((ifp = rt->rt_ifp) != NULL) { + rtm->rtm_index = ifp->if_index; } - len = rt_msg2(rtm->rtm_type, &info, (caddr_t)0, - (struct walkarg *)0); + if (ifa2 != NULL) + IFA_LOCK(ifa2); + len = rt_msg2(rtm->rtm_type, &info, NULL, NULL, &cred); + if (ifa2 != NULL) + IFA_UNLOCK(ifa2); if (len > rtm->rtm_msglen) { struct rt_msghdr *new_rtm; R_Malloc(new_rtm, struct rt_msghdr *, len); - if (new_rtm == 0) + if (new_rtm == NULL) { + RT_UNLOCK(rt); + if (ifa2 != NULL) + IFA_REMREF(ifa2); senderr(ENOBUFS); + } Bcopy(rtm, new_rtm, rtm->rtm_msglen); - Free(rtm); rtm = new_rtm; + R_Free(rtm); rtm = new_rtm; } - (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, - (struct walkarg *)0); + if (ifa2 != NULL) + IFA_LOCK(ifa2); + (void) rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, + NULL, &cred); + if (ifa2 != NULL) + IFA_UNLOCK(ifa2); rtm->rtm_flags = rt->rt_flags; - rtm->rtm_rmx = rt->rt_rmx; + rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; + if (ifa2 != NULL) + IFA_REMREF(ifa2); + + kauth_cred_unref(&cred); break; + } case RTM_CHANGE: - if (gate && (error = rt_setgate(rt, rt_key(rt), gate))) - senderr(error); - + is_router = (rt->rt_flags & RTF_ROUTER) ? TRUE : FALSE; + + if (info.rti_info[RTAX_GATEWAY] != NULL && + (error = rt_setgate(rt, rt_key(rt), + info.rti_info[RTAX_GATEWAY]))) { + int tmp = error; + RT_UNLOCK(rt); + senderr(tmp); + } /* * If they tried to change things but didn't specify * the required gateway, then just use the old one. * This can happen if the user tries to change the * flags on the default route without changing the - * default gateway. Changing flags still doesn't work. + * default gateway. Changing flags still doesn't work. */ - if ((rt->rt_flags & RTF_GATEWAY) && !gate) - gate = rt->rt_gateway; + if ((rt->rt_flags & RTF_GATEWAY) && + info.rti_info[RTAX_GATEWAY] == NULL) + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; -#ifdef __APPLE__ /* * On Darwin, we call rt_setif which contains the * equivalent to the code found at this very spot * in BSD. */ - rt_setif(rt, ifpaddr, ifaaddr, gate); -#endif - - rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, - &rt->rt_rmx); -#ifndef __APPLE__ - /* rt_setif, called above does this for us on darwin */ - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, gate); -#endif - if (genmask) - rt->rt_genmask = genmask; + rt_setif(rt, + info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], + info.rti_info[RTAX_GATEWAY], ifscope); + + if ((error = rt_setmetrics(rtm->rtm_inits, + &rtm->rtm_rmx, rt))) { + int tmp = error; + RT_UNLOCK(rt); + senderr(tmp); + } + if (info.rti_info[RTAX_GENMASK]) + rt->rt_genmask = info.rti_info[RTAX_GENMASK]; + /* - * Fall into + * Enqueue work item to invoke callback for this route entry + * This may not be needed always, but for now issue it anytime + * RTM_CHANGE gets called. */ + route_event_enqueue_nwk_wq_entry(rt, NULL, ROUTE_ENTRY_REFRESH, NULL, TRUE); + /* + * If the route is for a router, walk the tree to send refresh + * event to protocol cloned entries + */ + if (is_router) { + struct route_event rt_ev; + route_event_init(&rt_ev, rt, NULL, ROUTE_ENTRY_REFRESH); + RT_UNLOCK(rt); + (void) rnh->rnh_walktree(rnh, route_event_walktree, (void *)&rt_ev); + RT_LOCK(rt); + } + /* FALLTHRU */ case RTM_LOCK: rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); rt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); break; } + RT_UNLOCK(rt); break; default: senderr(EOPNOTSUPP); } - flush: - if (rtm) { + if (rtm != NULL) { if (error) rtm->rtm_errno = error; else rtm->rtm_flags |= RTF_DONE; } - if (rt) - rtfree(rt); - { - register struct rawcb *rp = 0; + if (rt != NULL) { + RT_LOCK_ASSERT_NOTHELD(rt); + rtfree_locked(rt); + } + lck_mtx_unlock(rnh_lock); + + /* relock the socket now */ + socket_lock(so, 0); /* * Check to see if we don't want our own messages. */ - if ((so->so_options & SO_USELOOPBACK) == 0) { + if (!(so->so_options & SO_USELOOPBACK)) { if (route_cb.any_count <= 1) { - if (rtm) - Free(rtm); + if (rtm != NULL) + R_Free(rtm); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } - if (rtm) { + if (rtm != NULL) { m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; - } else if (m->m_pkthdr.len > rtm->rtm_msglen) + } else if (m->m_pkthdr.len > rtm->rtm_msglen) { m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); - Free(rtm); + } + R_Free(rtm); } - if (sendonlytoself && m) { - if (sbappendaddr(&so->so_rcv, &route_src, m, (struct mbuf*)0) == 0) { - m_freem(m); - error = ENOBUFS; - } else { + if (sendonlytoself && m != NULL) { + error = 0; + if (sbappendaddr(&so->so_rcv, &route_src, m, + NULL, &error) != 0) { sorwakeup(so); } + if (error) + return (error); } else { - if (rp) + struct sockproto route_proto = { PF_ROUTE, 0 }; + if (rp != NULL) rp->rcb_proto.sp_family = 0; /* Avoid us */ - if (dst) - route_proto.sp_protocol = dst->sa_family; - if (m) + if (dst_sa_family != 0) + route_proto.sp_protocol = dst_sa_family; + if (m != NULL) { + socket_unlock(so, 0); raw_input(m, &route_proto, &route_src, &route_dst); - if (rp) - rp->rcb_proto.sp_family = PF_ROUTE; + socket_lock(so, 0); } + if (rp != NULL) + rp->rcb_proto.sp_family = PF_ROUTE; } return (error); } -static void -rt_setmetrics(which, in, out) - u_long which; - register struct rt_metrics *in, *out; +void +rt_setexpire(struct rtentry *rt, uint64_t expiry) +{ + /* set both rt_expire and rmx_expire */ + rt->rt_expire = expiry; + if (expiry) { + rt->rt_rmx.rmx_expire = expiry + rt->base_calendartime - + rt->base_uptime; + } else { + rt->rt_rmx.rmx_expire = 0; + } +} + +static int +rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rtentry *out) { -#define metric(f, e) if (which & (f)) out->e = in->e; - metric(RTV_RPIPE, rmx_recvpipe); - metric(RTV_SPIPE, rmx_sendpipe); - metric(RTV_SSTHRESH, rmx_ssthresh); - metric(RTV_RTT, rmx_rtt); - metric(RTV_RTTVAR, rmx_rttvar); - metric(RTV_HOPCOUNT, rmx_hopcount); - metric(RTV_MTU, rmx_mtu); - metric(RTV_EXPIRE, rmx_expire); + if (!(which & RTV_REFRESH_HOST)) { + struct timeval caltime; + getmicrotime(&caltime); +#define metric(f, e) if (which & (f)) out->rt_rmx.e = in->e; + metric(RTV_RPIPE, rmx_recvpipe); + metric(RTV_SPIPE, rmx_sendpipe); + metric(RTV_SSTHRESH, rmx_ssthresh); + metric(RTV_RTT, rmx_rtt); + metric(RTV_RTTVAR, rmx_rttvar); + metric(RTV_HOPCOUNT, rmx_hopcount); + metric(RTV_MTU, rmx_mtu); + metric(RTV_EXPIRE, rmx_expire); #undef metric + if (out->rt_rmx.rmx_expire > 0) { + /* account for system time change */ + getmicrotime(&caltime); + out->base_calendartime += + NET_CALCULATE_CLOCKSKEW(caltime, + out->base_calendartime, + net_uptime(), out->base_uptime); + rt_setexpire(out, + out->rt_rmx.rmx_expire - + out->base_calendartime + + out->base_uptime); + } else { + rt_setexpire(out, 0); + } + + VERIFY(out->rt_expire == 0 || out->rt_rmx.rmx_expire != 0); + VERIFY(out->rt_expire != 0 || out->rt_rmx.rmx_expire == 0); + } else { + /* Only RTV_REFRESH_HOST must be set */ + if ((which & ~RTV_REFRESH_HOST) || + (out->rt_flags & RTF_STATIC) || + !(out->rt_flags & RTF_LLINFO)) { + return (EINVAL); + } + + if (out->rt_llinfo_refresh == NULL) { + return (ENOTSUP); + } + + out->rt_llinfo_refresh(out); + } + return (0); +} + +static void +rt_getmetrics(struct rtentry *in, struct rt_metrics *out) +{ + struct timeval caltime; + + VERIFY(in->rt_expire == 0 || in->rt_rmx.rmx_expire != 0); + VERIFY(in->rt_expire != 0 || in->rt_rmx.rmx_expire == 0); + + *out = in->rt_rmx; + + if (in->rt_expire != 0) { + /* account for system time change */ + getmicrotime(&caltime); + + in->base_calendartime += + NET_CALCULATE_CLOCKSKEW(caltime, + in->base_calendartime, net_uptime(), in->base_uptime); + + out->rmx_expire = in->base_calendartime + + in->rt_expire - in->base_uptime; + } else { + out->rmx_expire = 0; + } } /* - * Set route's interface given ifpaddr, ifaaddr, and gateway. + * Set route's interface given info.rti_info[RTAX_IFP], + * info.rti_info[RTAX_IFA], and gateway. */ static void -rt_setif(rt, Ifpaddr, Ifaaddr, Gate) - struct rtentry *rt; - struct sockaddr *Ifpaddr, *Ifaaddr, *Gate; +rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, + struct sockaddr *Gate, unsigned int ifscope) { - struct ifaddr *ifa = 0; - struct ifnet *ifp = 0; - - /* new gateway could require new ifaddr, ifp; - flags may also be different; ifp may be specified - by ll sockaddr when protocol address is ambiguous */ - if (Ifpaddr && (ifa = ifa_ifwithnet(Ifpaddr)) && - (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) - ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate, - ifp); - else if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) { - ifa = Gate ? ifaof_ifpforaddr(Gate, ifp) : - TAILQ_FIRST(&ifp->if_addrhead); + struct ifaddr *ifa = NULL; + struct ifnet *ifp = NULL; + void (*ifa_rtrequest)(int, struct rtentry *, struct sockaddr *); + + LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED); + + RT_LOCK_ASSERT_HELD(rt); + + /* Don't update a defunct route */ + if (rt->rt_flags & RTF_CONDEMNED) + return; + + /* Add an extra ref for ourselves */ + RT_ADDREF_LOCKED(rt); + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + + /* + * New gateway could require new ifaddr, ifp; flags may also + * be different; ifp may be specified by ll sockaddr when + * protocol address is ambiguous. + */ + if (Ifpaddr && (ifa = ifa_ifwithnet_scoped(Ifpaddr, ifscope)) && + (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) { + IFA_REMREF(ifa); + ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate, ifp); + } else { + if (ifa != NULL) { + IFA_REMREF(ifa); + ifa = NULL; + } + if (Ifpaddr && (ifp = if_withname(Ifpaddr))) { + if (Gate) { + ifa = ifaof_ifpforaddr(Gate, ifp); + } else { + ifnet_lock_shared(ifp); + ifa = TAILQ_FIRST(&ifp->if_addrhead); + if (ifa != NULL) + IFA_ADDREF(ifa); + ifnet_lock_done(ifp); + } + } else if (Ifaaddr && + (ifa = ifa_ifwithaddr_scoped(Ifaaddr, ifscope))) { + ifp = ifa->ifa_ifp; + } else if (Gate != NULL) { + /* + * Safe to drop rt_lock and use rt_key, since holding + * rnh_lock here prevents another thread from calling + * rt_setgate() on this route. We cannot hold the + * lock across ifa_ifwithroute since the lookup done + * by that routine may point to the same route. + */ + RT_UNLOCK(rt); + if ((ifa = ifa_ifwithroute_scoped_locked(rt->rt_flags, + rt_key(rt), Gate, ifscope)) != NULL) + ifp = ifa->ifa_ifp; + RT_LOCK(rt); + /* Don't update a defunct route */ + if (rt->rt_flags & RTF_CONDEMNED) { + if (ifa != NULL) + IFA_REMREF(ifa); + /* Release extra ref */ + RT_REMREF_LOCKED(rt); + return; + } + } } - else if ((Ifaaddr && (ifa = ifa_ifwithaddr(Ifaaddr))) || - (Gate && (ifa = ifa_ifwithroute(rt->rt_flags, - rt_key(rt), Gate)))) - ifp = ifa->ifa_ifp; - if (ifa) { - register struct ifaddr *oifa = rt->rt_ifa; + + /* trigger route cache reevaluation */ + if (rt_key(rt)->sa_family == AF_INET) + routegenid_inet_update(); +#if INET6 + else if (rt_key(rt)->sa_family == AF_INET6) + routegenid_inet6_update(); +#endif /* INET6 */ + + if (ifa != NULL) { + struct ifaddr *oifa = rt->rt_ifa; if (oifa != ifa) { - if (oifa && oifa->ifa_rtrequest) - oifa->ifa_rtrequest(RTM_DELETE, - rt, Gate); + if (oifa != NULL) { + IFA_LOCK_SPIN(oifa); + ifa_rtrequest = oifa->ifa_rtrequest; + IFA_UNLOCK(oifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_DELETE, rt, Gate); + } rtsetifa(rt, ifa); - rt->rt_ifp = ifp; - rt->rt_rmx.rmx_mtu = ifp->if_mtu; - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); - } else - goto call_ifareq; - return; + + if (rt->rt_ifp != ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* + * Adjust route ref count for the interfaces. + */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } + } + rt->rt_ifp = ifp; + /* + * If this is the (non-scoped) default route, record + * the interface index used for the primary ifscope. + */ + if (rt_primary_default(rt, rt_key(rt))) { + set_primary_ifscope(rt_key(rt)->sa_family, + rt->rt_ifp->if_index); + } + /* + * If rmx_mtu is not locked, update it + * to the MTU used by the new interface. + */ + if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + + if (rt->rt_ifa != NULL) { + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, Gate); + } + IFA_REMREF(ifa); + /* Release extra ref */ + RT_REMREF_LOCKED(rt); + return; + } + IFA_REMREF(ifa); + ifa = NULL; } - call_ifareq: - /* XXX: to reset gateway to correct value, at RTM_CHANGE */ - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); -} + /* XXX: to reset gateway to correct value, at RTM_CHANGE */ + if (rt->rt_ifa != NULL) { + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, Gate); + } -#define ROUNDUP(a) \ - ((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long)) -#define ADVANCE(x, n) (x += ROUNDUP((n)->sa_len)) + /* + * Workaround for local address routes pointing to the loopback + * interface added by configd, until . + */ + if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) && + (rt->rt_flags & RTF_HOST) && rt->rt_ifa->ifa_ifp == rt->rt_ifp) { + ifa = ifa_ifwithaddr(rt_key(rt)); + if (ifa != NULL) { + if (ifa != rt->rt_ifa) + rtsetifa(rt, ifa); + IFA_REMREF(ifa); + } + } + /* Release extra ref */ + RT_REMREF_LOCKED(rt); +} /* * Extract the addresses of the passed sockaddrs. @@ -685,14 +1008,12 @@ rt_setif(rt, Ifpaddr, Ifaaddr, Gate) * This data is derived straight from userland. */ static int -rt_xaddrs(cp, cplim, rtinfo) - register caddr_t cp, cplim; - register struct rt_addrinfo *rtinfo; +rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { - register struct sockaddr *sa; - register int i; + struct sockaddr *sa; + int i; - bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info)); + bzero(rtinfo->rti_info, sizeof (rtinfo->rti_info)); for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; @@ -700,14 +1021,12 @@ rt_xaddrs(cp, cplim, rtinfo) /* * It won't fit. */ - if ( (cp + sa->sa_len) > cplim ) { + if ((cp + sa->sa_len) > cplim) return (EINVAL); - } - /* * there are no more.. quit now * If there are more bits, they are in error. - * I've seen this. route(1) can evidently generate these. + * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. */ @@ -715,67 +1034,87 @@ rt_xaddrs(cp, cplim, rtinfo) rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } - /* accept it */ rtinfo->rti_info[i] = sa; - ADVANCE(cp, sa); + ADVANCE32(cp, sa); } return (0); } static struct mbuf * -rt_msg1(type, rtinfo) - int type; - register struct rt_addrinfo *rtinfo; +rt_msg1(int type, struct rt_addrinfo *rtinfo) { - register struct rt_msghdr *rtm; - register struct mbuf *m; - register int i; - register struct sockaddr *sa; - int len, dlen; + struct rt_msghdr *rtm; + struct mbuf *m; + int i; + int len, dlen, off; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: - len = sizeof(struct ifa_msghdr); + len = sizeof (struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: - len = sizeof(struct ifma_msghdr); + len = sizeof (struct ifma_msghdr); break; case RTM_IFINFO: - len = sizeof(struct if_msghdr); + len = sizeof (struct if_msghdr); break; default: - len = sizeof(struct rt_msghdr); + len = sizeof (struct rt_msghdr); } - if (len > MCLBYTES) - panic("rt_msg1"); m = m_gethdr(M_DONTWAIT, MT_DATA); if (m && len > MHLEN) { MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { + if (!(m->m_flags & M_EXT)) { m_free(m); m = NULL; } } - if (m == 0) - return (m); + if (m == NULL) + return (NULL); m->m_pkthdr.len = m->m_len = len; - m->m_pkthdr.rcvif = 0; + m->m_pkthdr.rcvif = NULL; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); + off = len; for (i = 0; i < RTAX_MAX; i++) { + struct sockaddr *sa, *hint; + uint8_t ssbuf[SOCK_MAXADDRLEN + 1]; + + /* + * Make sure to accomodate the largest possible size of sa_len. + */ + _CASSERT(sizeof (ssbuf) == (SOCK_MAXADDRLEN + 1)); + if ((sa = rtinfo->rti_info[i]) == NULL) continue; + + switch (i) { + case RTAX_DST: + case RTAX_NETMASK: + if ((hint = rtinfo->rti_info[RTAX_DST]) == NULL) + hint = rtinfo->rti_info[RTAX_IFA]; + + /* Scrub away any trace of embedded interface scope */ + sa = rtm_scrub(type, i, hint, sa, &ssbuf, + sizeof (ssbuf), NULL); + break; + + default: + break; + } + rtinfo->rti_addrs |= (1 << i); - dlen = ROUNDUP(sa->sa_len); - m_copyback(m, len, dlen, (caddr_t)sa); - len += dlen; + dlen = sa->sa_len; + m_copyback(m, off, dlen, (caddr_t)sa); + len = off + dlen; + off += ROUNDUP32(dlen); } if (m->m_pkthdr.len != len) { m_freem(m); @@ -788,14 +1127,11 @@ rt_msg1(type, rtinfo) } static int -rt_msg2(type, rtinfo, cp, w) - int type; - register struct rt_addrinfo *rtinfo; - caddr_t cp; - struct walkarg *w; +rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w, + kauth_cred_t* credp) { - register int i; - int len, dlen, second_time = 0; + int i; + int len, dlen, rlen, second_time = 0; caddr_t cp0; rtinfo->rti_addrs = 0; @@ -804,45 +1140,95 @@ again: case RTM_DELADDR: case RTM_NEWADDR: - len = sizeof(struct ifa_msghdr); + len = sizeof (struct ifa_msghdr); + break; + + case RTM_DELMADDR: + case RTM_NEWMADDR: + len = sizeof (struct ifma_msghdr); break; case RTM_IFINFO: - len = sizeof(struct if_msghdr); + len = sizeof (struct if_msghdr); + break; + + case RTM_IFINFO2: + len = sizeof (struct if_msghdr2); + break; + + case RTM_NEWMADDR2: + len = sizeof (struct ifma_msghdr2); + break; + + case RTM_GET_EXT: + len = sizeof (struct rt_msghdr_ext); + break; + + case RTM_GET2: + len = sizeof (struct rt_msghdr2); break; default: - len = sizeof(struct rt_msghdr); + len = sizeof (struct rt_msghdr); } cp0 = cp; if (cp0) cp += len; for (i = 0; i < RTAX_MAX; i++) { - register struct sockaddr *sa; + struct sockaddr *sa, *hint; + uint8_t ssbuf[SOCK_MAXADDRLEN + 1]; + + /* + * Make sure to accomodate the largest possible size of sa_len. + */ + _CASSERT(sizeof (ssbuf) == (SOCK_MAXADDRLEN + 1)); - if ((sa = rtinfo->rti_info[i]) == 0) + if ((sa = rtinfo->rti_info[i]) == NULL) continue; + + switch (i) { + case RTAX_DST: + case RTAX_NETMASK: + if ((hint = rtinfo->rti_info[RTAX_DST]) == NULL) + hint = rtinfo->rti_info[RTAX_IFA]; + + /* Scrub away any trace of embedded interface scope */ + sa = rtm_scrub(type, i, hint, sa, &ssbuf, + sizeof (ssbuf), NULL); + break; + case RTAX_GATEWAY: + case RTAX_IFP: + sa = rtm_scrub(type, i, NULL, sa, &ssbuf, + sizeof (ssbuf), credp); + break; + + default: + break; + } + rtinfo->rti_addrs |= (1 << i); - dlen = ROUNDUP(sa->sa_len); + dlen = sa->sa_len; + rlen = ROUNDUP32(dlen); if (cp) { - bcopy((caddr_t)sa, cp, (unsigned)dlen); - cp += dlen; + bcopy((caddr_t)sa, cp, (size_t)dlen); + if (dlen != rlen) + bzero(cp + dlen, rlen - dlen); + cp += rlen; } - len += dlen; + len += rlen; } - if (cp == 0 && w != NULL && !second_time) { - register struct walkarg *rw = w; + if (cp == NULL && w != NULL && !second_time) { + struct walkarg *rw = w; - if (rw->w_req) { + if (rw->w_req != NULL) { if (rw->w_tmemsize < len) { - if (rw->w_tmem) + if (rw->w_tmem != NULL) FREE(rw->w_tmem, M_RTABLE); - rw->w_tmem = (caddr_t) - _MALLOC(len, M_RTABLE, M_WAITOK); /*###LD0412 was NOWAIT */ - if (rw->w_tmem) + rw->w_tmem = _MALLOC(len, M_RTABLE, M_WAITOK); + if (rw->w_tmem != NULL) rw->w_tmemsize = len; } - if (rw->w_tmem) { + if (rw->w_tmem != NULL) { cp = rw->w_tmem; second_time = 1; goto again; @@ -850,7 +1236,7 @@ again: } } if (cp) { - register struct rt_msghdr *rtm = (struct rt_msghdr *)cp0; + struct rt_msghdr *rtm = (struct rt_msghdr *)(void *)cp0; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; @@ -861,29 +1247,28 @@ again: /* * This routine is called to generate a message from the routing - * socket indicating that a redirect has occured, a routing lookup + * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void -rt_missmsg(type, rtinfo, flags, error) - int type, flags, error; - register struct rt_addrinfo *rtinfo; +rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { - register struct rt_msghdr *rtm; - register struct mbuf *m; + struct rt_msghdr *rtm; + struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; + struct sockproto route_proto = { PF_ROUTE, 0 }; if (route_cb.any_count == 0) return; m = rt_msg1(type, rtinfo); - if (m == 0) + if (m == NULL) return; rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; - route_proto.sp_protocol = sa ? sa->sa_family : 0; + route_proto.sp_family = sa ? sa->sa_family : 0; raw_input(m, &route_proto, &route_src, &route_dst); } @@ -892,25 +1277,24 @@ rt_missmsg(type, rtinfo, flags, error) * socket indicating that the status of a network interface has changed. */ void -rt_ifmsg(ifp) - register struct ifnet *ifp; +rt_ifmsg(struct ifnet *ifp) { - register struct if_msghdr *ifm; + struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; + struct sockproto route_proto = { PF_ROUTE, 0 }; if (route_cb.any_count == 0) return; - bzero((caddr_t)&info, sizeof(info)); + bzero((caddr_t)&info, sizeof (info)); m = rt_msg1(RTM_IFINFO, &info); - if (m == 0) + if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = (u_short)ifp->if_flags; - ifm->ifm_data = ifp->if_data; + if_data_internal_to_if_data(ifp, &ifp->if_data, &ifm->ifm_data); ifm->ifm_addrs = 0; - route_proto.sp_protocol = 0; raw_input(m, &route_proto, &route_src, &route_dst); } @@ -921,49 +1305,72 @@ rt_ifmsg(ifp) * socket indicate a request to configure interfaces, then it will * be unnecessary as the routing socket will automatically generate * copies of it. + * + * Since this is coming from the interface, it is expected that the + * interface will be locked. Caller must hold rnh_lock and rt_lock. */ void -rt_newaddrmsg(cmd, ifa, error, rt) - int cmd, error; - register struct ifaddr *ifa; - register struct rtentry *rt; +rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { struct rt_addrinfo info; struct sockaddr *sa = 0; int pass; struct mbuf *m = 0; struct ifnet *ifp = ifa->ifa_ifp; + struct sockproto route_proto = { PF_ROUTE, 0 }; + + LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_OWNED); + RT_LOCK_ASSERT_HELD(rt); if (route_cb.any_count == 0) return; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); for (pass = 1; pass < 3; pass++) { - bzero((caddr_t)&info, sizeof(info)); + bzero((caddr_t)&info, sizeof (info)); if ((cmd == RTM_ADD && pass == 1) || (cmd == RTM_DELETE && pass == 2)) { - register struct ifa_msghdr *ifam; + struct ifa_msghdr *ifam; int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; - ifaaddr = sa = ifa->ifa_addr; - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; - if ((m = rt_msg1(ncmd, &info)) == NULL) + /* Lock ifp for if_lladdr */ + ifnet_lock_shared(ifp); + IFA_LOCK(ifa); + info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; + /* + * Holding ifnet lock here prevents the link address + * from changing contents, so no need to hold its + * lock. The link address is always present; it's + * never freed. + */ + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + if ((m = rt_msg1(ncmd, &info)) == NULL) { + IFA_UNLOCK(ifa); + ifnet_lock_done(ifp); continue; + } + IFA_UNLOCK(ifa); + ifnet_lock_done(ifp); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; + IFA_LOCK_SPIN(ifa); ifam->ifam_metric = ifa->ifa_metric; ifam->ifam_flags = ifa->ifa_flags; + IFA_UNLOCK(ifa); ifam->ifam_addrs = info.rti_addrs; } if ((cmd == RTM_ADD && pass == 2) || (cmd == RTM_DELETE && pass == 1)) { - register struct rt_msghdr *rtm; + struct rt_msghdr *rtm; - if (rt == 0) + if (rt == NULL) continue; - netmask = rt_mask(rt); - dst = sa = rt_key(rt); - gate = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_DST] = sa = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rt_msg1(cmd, &info)) == NULL) continue; rtm = mtod(m, struct rt_msghdr *); @@ -983,140 +1390,636 @@ rt_newaddrmsg(cmd, ifa, error, rt) * there is no route state to worry about. */ void -rt_newmaddrmsg(cmd, ifma) - int cmd; - struct ifmultiaddr *ifma; +rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = 0; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; + struct sockproto route_proto = { PF_ROUTE, 0 }; if (route_cb.any_count == 0) return; - bzero((caddr_t)&info, sizeof(info)); - ifaaddr = ifma->ifma_addr; - if (ifp && ifp->if_addrhead.tqh_first) - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - else - ifpaddr = NULL; + /* Lock ifp for if_lladdr */ + ifnet_lock_shared(ifp); + bzero((caddr_t)&info, sizeof (info)); + IFMA_LOCK(ifma); + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + /* lladdr doesn't need lock */ + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; + /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ - gate = ifma->ifma_lladdr; - if ((m = rt_msg1(cmd, &info)) == NULL) + info.rti_info[RTAX_GATEWAY] = (ifma->ifma_ll != NULL) ? + ifma->ifma_ll->ifma_addr : NULL; + if ((m = rt_msg1(cmd, &info)) == NULL) { + IFMA_UNLOCK(ifma); + ifnet_lock_done(ifp); return; + } ifmam = mtod(m, struct ifma_msghdr *); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; route_proto.sp_protocol = ifma->ifma_addr->sa_family; + IFMA_UNLOCK(ifma); + ifnet_lock_done(ifp); raw_input(m, &route_proto, &route_src, &route_dst); } +const char * +rtm2str(int cmd) +{ + const char *c = "RTM_?"; + + switch (cmd) { + case RTM_ADD: + c = "RTM_ADD"; + break; + case RTM_DELETE: + c = "RTM_DELETE"; + break; + case RTM_CHANGE: + c = "RTM_CHANGE"; + break; + case RTM_GET: + c = "RTM_GET"; + break; + case RTM_LOSING: + c = "RTM_LOSING"; + break; + case RTM_REDIRECT: + c = "RTM_REDIRECT"; + break; + case RTM_MISS: + c = "RTM_MISS"; + break; + case RTM_LOCK: + c = "RTM_LOCK"; + break; + case RTM_OLDADD: + c = "RTM_OLDADD"; + break; + case RTM_OLDDEL: + c = "RTM_OLDDEL"; + break; + case RTM_RESOLVE: + c = "RTM_RESOLVE"; + break; + case RTM_NEWADDR: + c = "RTM_NEWADDR"; + break; + case RTM_DELADDR: + c = "RTM_DELADDR"; + break; + case RTM_IFINFO: + c = "RTM_IFINFO"; + break; + case RTM_NEWMADDR: + c = "RTM_NEWMADDR"; + break; + case RTM_DELMADDR: + c = "RTM_DELMADDR"; + break; + case RTM_GET_SILENT: + c = "RTM_GET_SILENT"; + break; + case RTM_IFINFO2: + c = "RTM_IFINFO2"; + break; + case RTM_NEWMADDR2: + c = "RTM_NEWMADDR2"; + break; + case RTM_GET2: + c = "RTM_GET2"; + break; + case RTM_GET_EXT: + c = "RTM_GET_EXT"; + break; + } + + return (c); +} + /* * This is used in dumping the kernel table via sysctl(). */ -int -sysctl_dumpentry(rn, vw) - struct radix_node *rn; - void *vw; +static int +sysctl_dumpentry(struct radix_node *rn, void *vw) { - register struct walkarg *w = vw; - register struct rtentry *rt = (struct rtentry *)rn; + struct walkarg *w = vw; + struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; + kauth_cred_t cred; + + cred = kauth_cred_proc_ref(current_proc()); + RT_LOCK(rt); if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) - return 0; - bzero((caddr_t)&info, sizeof(info)); - dst = rt_key(rt); - gate = rt->rt_gateway; - netmask = rt_mask(rt); - genmask = rt->rt_genmask; - size = rt_msg2(RTM_GET, &info, 0, w); - if (w->w_req && w->w_tmem) { - register struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; - - rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = rt->rt_use; - rtm->rtm_rmx = rt->rt_rmx; - rtm->rtm_index = rt->rt_ifp->if_index; - rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; - rtm->rtm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); - return (error); + goto done; + bzero((caddr_t)&info, sizeof (info)); + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; + + if (w->w_op != NET_RT_DUMP2) { + size = rt_msg2(RTM_GET, &info, NULL, w, &cred); + if (w->w_req != NULL && w->w_tmem != NULL) { + struct rt_msghdr *rtm = + (struct rt_msghdr *)(void *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_use = rt->rt_use; + rt_getmetrics(rt, &rtm->rtm_rmx); + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_pid = 0; + rtm->rtm_seq = 0; + rtm->rtm_errno = 0; + rtm->rtm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); + } + } else { + size = rt_msg2(RTM_GET2, &info, NULL, w, &cred); + if (w->w_req != NULL && w->w_tmem != NULL) { + struct rt_msghdr2 *rtm = + (struct rt_msghdr2 *)(void *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_use = rt->rt_use; + rt_getmetrics(rt, &rtm->rtm_rmx); + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_refcnt = rt->rt_refcnt; + if (rt->rt_parent) + rtm->rtm_parentflags = rt->rt_parent->rt_flags; + else + rtm->rtm_parentflags = 0; + rtm->rtm_reserved = 0; + rtm->rtm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); + } } + +done: + RT_UNLOCK(rt); + kauth_cred_unref(&cred); return (error); } -int -sysctl_iflist(af, w) - int af; - register struct walkarg *w; +/* + * This is used for dumping extended information from route entries. + */ +static int +sysctl_dumpentry_ext(struct radix_node *rn, void *vw) { - register struct ifnet *ifp; - register struct ifaddr *ifa; + struct walkarg *w = vw; + struct rtentry *rt = (struct rtentry *)rn; + int error = 0, size; + struct rt_addrinfo info; + kauth_cred_t cred; + + cred = kauth_cred_proc_ref(current_proc()); + + RT_LOCK(rt); + if (w->w_op == NET_RT_DUMPX_FLAGS && !(rt->rt_flags & w->w_arg)) + goto done; + bzero(&info, sizeof (info)); + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; + + size = rt_msg2(RTM_GET_EXT, &info, NULL, w, &cred); + if (w->w_req != NULL && w->w_tmem != NULL) { + struct rt_msghdr_ext *ertm = + (struct rt_msghdr_ext *)(void *)w->w_tmem; + + ertm->rtm_flags = rt->rt_flags; + ertm->rtm_use = rt->rt_use; + rt_getmetrics(rt, &ertm->rtm_rmx); + ertm->rtm_index = rt->rt_ifp->if_index; + ertm->rtm_pid = 0; + ertm->rtm_seq = 0; + ertm->rtm_errno = 0; + ertm->rtm_addrs = info.rti_addrs; + if (rt->rt_llinfo_get_ri == NULL) { + bzero(&ertm->rtm_ri, sizeof (ertm->rtm_ri)); + ertm->rtm_ri.ri_rssi = IFNET_RSSI_UNKNOWN; + ertm->rtm_ri.ri_lqm = IFNET_LQM_THRESH_OFF; + ertm->rtm_ri.ri_npm = IFNET_NPM_THRESH_UNKNOWN; + } else { + rt->rt_llinfo_get_ri(rt, &ertm->rtm_ri); + } + error = SYSCTL_OUT(w->w_req, (caddr_t)ertm, size); + } + +done: + RT_UNLOCK(rt); + kauth_cred_unref(&cred); + return (error); +} + +/* + * rdar://9307819 + * To avoid to call copyout() while holding locks and to cause problems + * in the paging path, sysctl_iflist() and sysctl_iflist2() contstruct + * the list in two passes. In the first pass we compute the total + * length of the data we are going to copyout, then we release + * all locks to allocate a temporary buffer that gets filled + * in the second pass. + * + * Note that we are verifying the assumption that _MALLOC returns a buffer + * that is at least 32 bits aligned and that the messages and addresses are + * 32 bits aligned. + */ +static int +sysctl_iflist(int af, struct walkarg *w) +{ + struct ifnet *ifp; + struct ifaddr *ifa; struct rt_addrinfo info; - int len, error = 0; + int len = 0, error = 0; + int pass = 0; + int total_len = 0, current_len = 0; + char *total_buffer = NULL, *cp = NULL; + kauth_cred_t cred; - bzero((caddr_t)&info, sizeof(info)); - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) { - if (w->w_arg && w->w_arg != ifp->if_index) - continue; - ifa = ifp->if_addrhead.tqh_first; - ifpaddr = ifa->ifa_addr; - len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, w); - ifpaddr = 0; - if (w->w_req && w->w_tmem) { - register struct if_msghdr *ifm; - - ifm = (struct if_msghdr *)w->w_tmem; - ifm->ifm_index = ifp->if_index; - ifm->ifm_flags = (u_short)ifp->if_flags; - ifm->ifm_data = ifp->if_data; - ifm->ifm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req,(caddr_t)ifm, len); + cred = kauth_cred_proc_ref(current_proc()); + + bzero((caddr_t)&info, sizeof (info)); + + for (pass = 0; pass < 2; pass++) { + ifnet_head_lock_shared(); + + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { if (error) - return (error); - } - while ((ifa = ifa->ifa_link.tqe_next) != 0) { - if (af && af != ifa->ifa_addr->sa_family) + break; + if (w->w_arg && w->w_arg != ifp->if_index) continue; -#ifndef __APPLE__ - if (curproc->p_prison && prison_if(curproc, ifa->ifa_addr)) + ifnet_lock_shared(ifp); + /* + * Holding ifnet lock here prevents the link address + * from changing contents, so no need to hold the ifa + * lock. The link address is always present; it's + * never freed. + */ + ifa = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, NULL, NULL, &cred); + if (pass == 0) { + total_len += len; + } else { + struct if_msghdr *ifm; + + if (current_len + len > total_len) { + ifnet_lock_done(ifp); + error = ENOBUFS; + break; + } + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, + (caddr_t)cp, NULL, &cred); + info.rti_info[RTAX_IFP] = NULL; + + ifm = (struct if_msghdr *)(void *)cp; + ifm->ifm_index = ifp->if_index; + ifm->ifm_flags = (u_short)ifp->if_flags; + if_data_internal_to_if_data(ifp, &ifp->if_data, + &ifm->ifm_data); + ifm->ifm_addrs = info.rti_addrs; + /* + * + * Round bytes only for non-platform + */ + if (!csproc_get_platform_binary(w->w_req->p)) { + ALIGN_BYTES(ifm->ifm_data.ifi_ibytes); + ALIGN_BYTES(ifm->ifm_data.ifi_obytes); + } + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t))); + current_len += len; + } + while ((ifa = ifa->ifa_link.tqe_next) != NULL) { + IFA_LOCK(ifa); + if (af && af != ifa->ifa_addr->sa_family) { + IFA_UNLOCK(ifa); + continue; + } + info.rti_info[RTAX_IFA] = ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, NULL, NULL, + &cred); + if (pass == 0) { + total_len += len; + } else { + struct ifa_msghdr *ifam; + + if (current_len + len > total_len) { + IFA_UNLOCK(ifa); + error = ENOBUFS; + break; + } + len = rt_msg2(RTM_NEWADDR, &info, + (caddr_t)cp, NULL, &cred); + + ifam = (struct ifa_msghdr *)(void *)cp; + ifam->ifam_index = + ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, + sizeof (u_int32_t))); + current_len += len; + } + IFA_UNLOCK(ifa); + } + ifnet_lock_done(ifp); + info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = + info.rti_info[RTAX_BRD] = NULL; + } + + ifnet_head_done(); + + if (error != 0) { + if (error == ENOBUFS) + printf("%s: current_len (%d) + len (%d) > " + "total_len (%d)\n", __func__, current_len, + len, total_len); + break; + } + + if (pass == 0) { + /* Better to return zero length buffer than ENOBUFS */ + if (total_len == 0) + total_len = 1; + total_len += total_len >> 3; + total_buffer = _MALLOC(total_len, M_RTABLE, + M_ZERO | M_WAITOK); + if (total_buffer == NULL) { + printf("%s: _MALLOC(%d) failed\n", __func__, + total_len); + error = ENOBUFS; + break; + } + cp = total_buffer; + VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t))); + } else { + error = SYSCTL_OUT(w->w_req, total_buffer, current_len); + if (error) + break; + } + } + + if (total_buffer != NULL) + _FREE(total_buffer, M_RTABLE); + + kauth_cred_unref(&cred); + return (error); +} + +static int +sysctl_iflist2(int af, struct walkarg *w) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct rt_addrinfo info; + int len = 0, error = 0; + int pass = 0; + int total_len = 0, current_len = 0; + char *total_buffer = NULL, *cp = NULL; + kauth_cred_t cred; + + cred = kauth_cred_proc_ref(current_proc()); + + bzero((caddr_t)&info, sizeof (info)); + + for (pass = 0; pass < 2; pass++) { + struct ifmultiaddr *ifma; + + ifnet_head_lock_shared(); + + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (error) + break; + if (w->w_arg && w->w_arg != ifp->if_index) continue; -#endif - ifaaddr = ifa->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; - len = rt_msg2(RTM_NEWADDR, &info, 0, w); - if (w->w_req && w->w_tmem) { - register struct ifa_msghdr *ifam; - - ifam = (struct ifa_msghdr *)w->w_tmem; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_metric = ifa->ifa_metric; - ifam->ifam_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) - return (error); + ifnet_lock_shared(ifp); + /* + * Holding ifnet lock here prevents the link address + * from changing contents, so no need to hold the ifa + * lock. The link address is always present; it's + * never freed. + */ + ifa = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO2, &info, NULL, NULL, &cred); + if (pass == 0) { + total_len += len; + } else { + struct if_msghdr2 *ifm; + + if (current_len + len > total_len) { + ifnet_lock_done(ifp); + error = ENOBUFS; + break; + } + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO2, &info, + (caddr_t)cp, NULL, &cred); + info.rti_info[RTAX_IFP] = NULL; + + ifm = (struct if_msghdr2 *)(void *)cp; + ifm->ifm_addrs = info.rti_addrs; + ifm->ifm_flags = (u_short)ifp->if_flags; + ifm->ifm_index = ifp->if_index; + ifm->ifm_snd_len = IFCQ_LEN(&ifp->if_snd); + ifm->ifm_snd_maxlen = IFCQ_MAXLEN(&ifp->if_snd); + ifm->ifm_snd_drops = + ifp->if_snd.ifcq_dropcnt.packets; + ifm->ifm_timer = ifp->if_timer; + if_data_internal_to_if_data64(ifp, + &ifp->if_data, &ifm->ifm_data); + /* + * + * Round bytes only for non-platform + */ + if (!csproc_get_platform_binary(w->w_req->p)) { + ALIGN_BYTES(ifm->ifm_data.ifi_ibytes); + ALIGN_BYTES(ifm->ifm_data.ifi_obytes); + } + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t))); + current_len += len; + } + while ((ifa = ifa->ifa_link.tqe_next) != NULL) { + IFA_LOCK(ifa); + if (af && af != ifa->ifa_addr->sa_family) { + IFA_UNLOCK(ifa); + continue; + } + info.rti_info[RTAX_IFA] = ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, NULL, NULL, + &cred); + if (pass == 0) { + total_len += len; + } else { + struct ifa_msghdr *ifam; + + if (current_len + len > total_len) { + IFA_UNLOCK(ifa); + error = ENOBUFS; + break; + } + len = rt_msg2(RTM_NEWADDR, &info, + (caddr_t)cp, NULL, &cred); + + ifam = (struct ifa_msghdr *)(void *)cp; + ifam->ifam_index = + ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, + sizeof (u_int32_t))); + current_len += len; + } + IFA_UNLOCK(ifa); + } + if (error) { + ifnet_lock_done(ifp); + break; + } + + for (ifma = LIST_FIRST(&ifp->if_multiaddrs); + ifma != NULL; ifma = LIST_NEXT(ifma, ifma_link)) { + struct ifaddr *ifa0; + + IFMA_LOCK(ifma); + if (af && af != ifma->ifma_addr->sa_family) { + IFMA_UNLOCK(ifma); + continue; + } + bzero((caddr_t)&info, sizeof (info)); + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + /* + * Holding ifnet lock here prevents the link + * address from changing contents, so no need + * to hold the ifa0 lock. The link address is + * always present; it's never freed. + */ + ifa0 = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa0->ifa_addr; + if (ifma->ifma_ll != NULL) + info.rti_info[RTAX_GATEWAY] = + ifma->ifma_ll->ifma_addr; + len = rt_msg2(RTM_NEWMADDR2, &info, NULL, NULL, + &cred); + if (pass == 0) { + total_len += len; + } else { + struct ifma_msghdr2 *ifmam; + + if (current_len + len > total_len) { + IFMA_UNLOCK(ifma); + error = ENOBUFS; + break; + } + len = rt_msg2(RTM_NEWMADDR2, &info, + (caddr_t)cp, NULL, &cred); + + ifmam = + (struct ifma_msghdr2 *)(void *)cp; + ifmam->ifmam_addrs = info.rti_addrs; + ifmam->ifmam_flags = 0; + ifmam->ifmam_index = + ifma->ifma_ifp->if_index; + ifmam->ifmam_refcount = + ifma->ifma_reqcnt; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, + sizeof (u_int32_t))); + current_len += len; + } + IFMA_UNLOCK(ifma); } + ifnet_lock_done(ifp); + info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = + info.rti_info[RTAX_BRD] = NULL; + } + ifnet_head_done(); + + if (error) { + if (error == ENOBUFS) + printf("%s: current_len (%d) + len (%d) > " + "total_len (%d)\n", __func__, current_len, + len, total_len); + break; + } + + if (pass == 0) { + /* Better to return zero length buffer than ENOBUFS */ + if (total_len == 0) + total_len = 1; + total_len += total_len >> 3; + total_buffer = _MALLOC(total_len, M_RTABLE, + M_ZERO | M_WAITOK); + if (total_buffer == NULL) { + printf("%s: _MALLOC(%d) failed\n", __func__, + total_len); + error = ENOBUFS; + break; + } + cp = total_buffer; + VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t))); + } else { + error = SYSCTL_OUT(w->w_req, total_buffer, current_len); + if (error) + break; } - ifaaddr = netmask = brdaddr = 0; } - return (0); + + if (total_buffer != NULL) + _FREE(total_buffer, M_RTABLE); + + kauth_cred_unref(&cred); + return (error); +} + + +static int +sysctl_rtstat(struct sysctl_req *req) +{ + return (SYSCTL_OUT(req, &rtstat, sizeof (struct rtstat))); +} + +static int +sysctl_rttrash(struct sysctl_req *req) +{ + return (SYSCTL_OUT(req, &rttrash, sizeof (rttrash))); } static int sysctl_rtsock SYSCTL_HANDLER_ARGS { +#pragma unused(oidp) int *name = (int *)arg1; u_int namelen = arg2; - register struct radix_node_head *rnh; - int i, s, error = EINVAL; + struct radix_node_head *rnh; + int i, error = EINVAL; u_char af; struct walkarg w; @@ -1127,52 +2030,88 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS if (namelen != 3) return (EINVAL); af = name[0]; - Bzero(&w, sizeof(w)); + Bzero(&w, sizeof (w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; - s = splnet(); switch (w.w_op) { case NET_RT_DUMP: + case NET_RT_DUMP2: case NET_RT_FLAGS: + lck_mtx_lock(rnh_lock); for (i = 1; i <= AF_MAX; i++) if ((rnh = rt_tables[i]) && (af == 0 || af == i) && (error = rnh->rnh_walktree(rnh, - sysctl_dumpentry, &w))) + sysctl_dumpentry, &w))) break; + lck_mtx_unlock(rnh_lock); + break; + case NET_RT_DUMPX: + case NET_RT_DUMPX_FLAGS: + lck_mtx_lock(rnh_lock); + for (i = 1; i <= AF_MAX; i++) + if ((rnh = rt_tables[i]) && (af == 0 || af == i) && + (error = rnh->rnh_walktree(rnh, + sysctl_dumpentry_ext, &w))) + break; + lck_mtx_unlock(rnh_lock); break; - case NET_RT_IFLIST: error = sysctl_iflist(af, &w); + break; + case NET_RT_IFLIST2: + error = sysctl_iflist2(af, &w); + break; + case NET_RT_STAT: + error = sysctl_rtstat(req); + break; + case NET_RT_TRASH: + error = sysctl_rttrash(req); + break; } - splx(s); - if (w.w_tmem) + if (w.w_tmem != NULL) FREE(w.w_tmem, M_RTABLE); return (error); } -SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); - /* * Definitions of protocols supported in the ROUTE domain. */ - -struct domain routedomain; /* or at least forward */ - static struct protosw routesw[] = { -{ SOCK_RAW, &routedomain, 0, PR_ATOMIC|PR_ADDR, - 0, route_output, raw_ctlinput, 0, - 0, - raw_init, 0, 0, 0, - 0, &route_usrreqs, 0, 0 +{ + .pr_type = SOCK_RAW, + .pr_protocol = 0, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_output = route_output, + .pr_ctlinput = raw_ctlinput, + .pr_init = raw_init, + .pr_usrreqs = &route_usrreqs, } }; -struct domain routedomain = - { PF_ROUTE, "route", route_init, 0, 0, - routesw}; +static int route_proto_count = (sizeof (routesw) / sizeof (struct protosw)); -DOMAIN_SET(route); +struct domain routedomain_s = { + .dom_family = PF_ROUTE, + .dom_name = "route", + .dom_init = route_dinit, +}; +static void +route_dinit(struct domain *dp) +{ + struct protosw *pr; + int i; + + VERIFY(!(dp->dom_flags & DOM_INITIALIZED)); + VERIFY(routedomain == NULL); + + routedomain = dp; + + for (i = 0, pr = &routesw[0]; i < route_proto_count; i++, pr++) + net_add_proto(pr, dp, 1); + + route_init(); +}