X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/89b3af67bb32e691275bf6fa803d1834b2284115..7ee9d059c4eecf68ae4f8b0fb99ae2471eda79af:/bsd/netinet/ip_output.c diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index e95d54b91..57f522919 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,6 +60,12 @@ * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $ */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. + */ #define _IP_VHL @@ -73,9 +79,17 @@ #include #include #include +#include + +#include +#include #include +#include +#include #include +#include +#include #include #include @@ -86,10 +100,15 @@ #include +#if CONFIG_MACF_NET +#include +#endif + #include "faith.h" #include #include +#include #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) @@ -110,11 +129,16 @@ #include #include +#include #if DUMMYNET #include #endif +#if PF +#include +#endif /* PF */ + #if IPFIREWALL_FORWARD_DEBUG #define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ @@ -122,31 +146,23 @@ (ntohl(a.s_addr))&0xFF); #endif -#if IPSEC -extern lck_mtx_t *sadb_mutex; -#endif u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); -static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *, int); -static int ip_getmoptions(struct sockopt *, struct ip_moptions *); static int ip_pcbopts(int, struct mbuf 
**, struct mbuf *); -static int ip_setmoptions(struct sockopt *, struct ip_moptions **); +static void imo_trace(struct ip_moptions *, int); + +static void ip_out_cksum_stats(int, u_int32_t); +static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); -int ip_createmoptions(struct ip_moptions **imop); -int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); int ip_optcopy(struct ip *, struct ip *); -extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); -#ifdef __APPLE__ -extern struct mbuf* m_dup(register struct mbuf *m, int how); -#endif +void in_delayed_cksum_offset(struct mbuf *, int ); +void in_cksum_offset(struct mbuf* , size_t ); -extern int apple_hwcksum_tx; -extern u_long route_generation; +extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); extern struct protosw inetsw[]; @@ -159,8 +175,50 @@ extern int ipsec_bypass; #endif static int ip_maxchainsent = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, "use dlil_output_list"); +#if DEBUG +static int forge_ce = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED, + &forge_ce, 0, "Forge ECN CE"); +#endif /* DEBUG */ + +static int ip_select_srcif_debug = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_select_srcif_debug, 0, "log source interface selection debug info"); + +#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE; + +struct ip_moptions_dbg { + struct ip_moptions imo; /* ip_moptions */ + u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */ + u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */ + /* + * Alloc and free callers. 
+ */ + ctrace_t imo_alloc; + ctrace_t imo_free; + /* + * Circular lists of IMO_ADDREF and IMO_REMREF callers. + */ + ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE]; + ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE]; +}; + +#if DEBUG +static unsigned int imo_debug = 1; /* debugging (enabled) */ +#else +static unsigned int imo_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int imo_size; /* size of zone element */ +static struct zone *imo_zone; /* zone for ip_moptions */ + +#define IMO_ZONE_MAX 64 /* maximum elements in zone */ +#define IMO_ZONE_NAME "ip_moptions" /* zone name */ + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). @@ -173,13 +231,36 @@ ip_output( struct mbuf *opt, struct route *ro, int flags, - struct ip_moptions *imo) + struct ip_moptions *imo, + struct ip_out_args *ipoa) { int error; - error = ip_output_list(m0, 0, opt, ro, flags, imo); + error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa); return error; } +/* + * Returns: 0 Success + * ENOMEM + * EADDRNOTAVAIL + * ENETUNREACH + * EHOSTUNREACH + * EACCES + * EMSGSIZE + * ENOBUFS + * ipsec4_getpolicybyaddr:??? [IPSEC 4th argument, contents modified] + * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified] + * key_spdacquire:??? [IPSEC] + * ipsec4_output:??? [IPSEC] + * :??? [firewall] + * ip_dn_io_ptr:??? [dummynet] + * dlil_output:??? [DLIL] + * dlil_output_list:??? [DLIL] + * + * Notes: The ipsec4_getpolicyby{addr|sock} function error returns are + * only used as the error return from this function where one of + * these functions fails to return a policy. 
+ */ int ip_output_list( struct mbuf *m0, @@ -187,17 +268,20 @@ ip_output_list( struct mbuf *opt, struct route *ro, int flags, - struct ip_moptions *imo) + struct ip_moptions *imo, + struct ip_out_args *ipoa + ) { - struct ip *ip, *mhip; + struct ip *ip; struct ifnet *ifp = NULL; - struct mbuf *m = m0; + struct mbuf *m = m0, **mppn = NULL; int hlen = sizeof (struct ip); - int len, off, error = 0; + int len = 0, error = 0; struct sockaddr_in *dst = NULL; - struct in_ifaddr *ia = NULL; + struct in_ifaddr *ia = NULL, *src_ia = NULL; int isbroadcast, sw_csum; struct in_addr pkt_dst; + struct ipf_pktopts *ippo = NULL, ipf_pktopts; #if IPSEC struct route iproute; struct socket *so = NULL; @@ -206,63 +290,88 @@ ip_output_list( #if IPFIREWALL_FORWARD int fwd_rewrite_src = 0; #endif +#if IPFIREWALL + int off; struct ip_fw_args args; + struct m_tag *tag; + struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; +#endif int didfilter = 0; ipfilter_t inject_filter_ref = 0; - struct m_tag *tag; - struct route dn_route; +#if DUMMYNET + struct route saved_route; + struct ip_out_args saved_ipoa; + struct sockaddr_in dst_buf; +#endif /* DUMMYNET */ struct mbuf * packetlist; - int pktcnt = 0; - - lck_mtx_lock(ip_mutex); - + int pktcnt = 0, tso = 0; + u_int32_t bytecnt = 0; + unsigned int ifscope; + unsigned int nocell; + boolean_t select_srcif; KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); packetlist = m0; +#if IPFIREWALL + args.next_hop = NULL; args.eh = NULL; args.rule = NULL; - args.next_hop = NULL; args.divert_rule = 0; /* divert cookie */ - + args.ipoa = NULL; + + if (SLIST_EMPTY(&m0->m_pkthdr.tags)) + goto ipfw_tags_done; + /* Grab info from mtags prepended to the chain */ #if DUMMYNET - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { struct dn_pkt_tag *dn_tag; - + dn_tag = (struct dn_pkt_tag *)(tag+1); args.rule = 
dn_tag->rule; opt = NULL; - dn_route = dn_tag->ro; - ro = &dn_route; - + saved_route = dn_tag->ro; + ro = &saved_route; + imo = NULL; - dst = dn_tag->dn_dst; + bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf)); + dst = &dst_buf; ifp = dn_tag->ifp; flags = dn_tag->flags; - + saved_ipoa = dn_tag->ipoa; + ipoa = &saved_ipoa; + m_tag_delete(m0, tag); } #endif /* DUMMYNET */ - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { +#if IPDIVERT + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { struct divert_tag *div_tag; - + div_tag = (struct divert_tag *)(tag+1); args.divert_rule = div_tag->cookie; m_tag_delete(m0, tag); } - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { +#endif /* IPDIVERT */ + + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; - + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); - args.next_hop = ipfwd_tag->next_hop; + next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; m_tag_delete(m0, tag); } +ipfw_tags_done: +#endif /* IPFIREWALL */ m = m0; - + #if DIAGNOSTIC if ( !m || (m->m_flags & M_PKTHDR) != 0) panic("ip_output no HDR"); @@ -271,21 +380,56 @@ ip_output_list( mtod(m, struct ip *)->ip_p); #endif + bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); + ippo = &ipf_pktopts; + + /* + * At present the IP_OUTARGS flag implies a request for IP to + * perform source interface selection. In the forwarding case, + * only the ifscope value is used, as source interface selection + * doesn't take place. 
+ */ + if (ip_doscopedroute && (flags & IP_OUTARGS)) { + select_srcif = !(flags & IP_FORWARDING); + ifscope = ipoa->ipoa_boundif; + ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; + ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); + } else { + select_srcif = FALSE; + ifscope = IFSCOPE_NONE; + } + + if (flags & IP_OUTARGS) { + nocell = ipoa->ipoa_nocell; + if (nocell) + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } else { + nocell = 0; + } + +#if IPFIREWALL if (args.rule != NULL) { /* dummynet already saw us */ - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; - if (ro->ro_rt != NULL) - ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; - if (ia) - ifaref(&ia->ia_ifa); -#if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; + if (ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); } + RT_UNLOCK(ro->ro_rt); + } +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + so = ipsec_getsocket(m); + (void)ipsec_setsocket(m, NULL); + } #endif - goto sendit; + goto sendit; } +#endif /* IPFIREWALL */ #if IPSEC if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { @@ -298,14 +442,39 @@ loopit: * No need to proccess packet twice if we've * already seen it */ - inject_filter_ref = ipf_get_inject_filter(m); + if (!SLIST_EMPTY(&m->m_pkthdr.tags)) + inject_filter_ref = ipf_get_inject_filter(m); + else + inject_filter_ref = 0; if (opt) { m = ip_insertoptions(m, opt, &len); hlen = len; } ip = mtod(m, struct ip *); +#if IPFIREWALL + /* + * rdar://8542331 + * + * When dealing with a packet chain, we need to reset "next_hop" because + * "dst" may have been changed to the gateway address below for the previous + * packet of the chain. 
This could cause the route to be inadvertently changed + * to the route to the gateway address (instead of the route to the destination). + */ + args.next_hop = next_hop_from_ipfwd_tag; pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; +#else + pkt_dst = ip->ip_dst; +#endif + + /* + * We must not send if the packet is destined to network zero. + * RFC1122 3.2.1.3 (a) and (b). + */ + if (IN_ZERONET(ntohl(pkt_dst.s_addr))) { + error = EHOSTUNREACH; + goto bad; + } /* * Fill in IP header. @@ -318,14 +487,24 @@ loopit: #else ip->ip_id = htons(ip_id++); #endif - ipstat.ips_localout++; + OSAddAtomic(1, &ipstat.ips_localout); } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } + +#if DEBUG + /* For debugging, we let the stack forge congestion */ + if (forge_ce != 0 && + ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 || + (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) { + ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE; + forge_ce--; + } +#endif /* DEBUG */ KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - + dst = (struct sockaddr_in *)&ro->ro_dst; /* @@ -336,21 +515,39 @@ loopit: * cache with IPv6. */ - { - if (ro->ro_rt && (ro->ro_rt->generation_id != route_generation) && - ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && (ip->ip_src.s_addr != INADDR_ANY) && - (ifa_foraddr(ip->ip_src.s_addr) == 0)) { - error = EADDRNOTAVAIL; - goto bad; + if (ro->ro_rt != NULL) { + if (ro->ro_rt->generation_id != route_generation && + ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && + (ip->ip_src.s_addr != INADDR_ANY)) { + src_ia = ifa_foraddr(ip->ip_src.s_addr); + if (src_ia == NULL) { + error = EADDRNOTAVAIL; + goto bad; + } + IFA_REMREF(&src_ia->ia_ifa); } + /* + * Test rt_flags without holding rt_lock for performance + * reasons; if the route is down it will hopefully be + * caught by the layer below (since it uses this route + * as a hint) or during the next transmit. 
+ */ + if ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != pkt_dst.s_addr) { + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + /* + * If we're doing source interface selection, we may not + * want to use this route; only synch up the generation + * count otherwise. + */ + if (!select_srcif && ro->ro_rt != NULL && + ro->ro_rt->generation_id != route_generation) + ro->ro_rt->generation_id = route_generation; } - if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - dst->sin_family != AF_INET || - dst->sin_addr.s_addr != pkt_dst.s_addr)) { - rtfree(ro->ro_rt); - ro->ro_rt = (struct rtentry *)0; - } - if (ro->ro_rt == 0) { + if (ro->ro_rt == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); @@ -364,10 +561,10 @@ loopit: #define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) { if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { - ipstat.ips_noroute++; + OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; } @@ -375,7 +572,85 @@ loopit: ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && + imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. + */ + isbroadcast = 0; + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + + /* Macro takes reference on ia */ + IFP_TO_IA(ifp, ia); } else { + boolean_t cloneok = FALSE; + /* + * Perform source interface selection; the source IP address + * must belong to one of the addresses of the interface used + * by the route. 
For performance reasons, do this only if + * there is no route, or if the routing table has changed, + * or if we haven't done source interface selection on this + * route (for this PCB instance) before. + */ + if (select_srcif && ip->ip_src.s_addr != INADDR_ANY && + (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) || + ro->ro_rt->generation_id != route_generation || + !(ro->ro_flags & ROF_SRCIF_SELECTED))) { + struct ifaddr *ifa; + + /* Find the source interface */ + ifa = in_selectsrcif(ip, ro, ifscope); + + /* + * If the source address belongs to a cellular interface + * and the caller forbids our using interfaces of such + * type, pretend that there is no source address. + */ + if (nocell && ifa != NULL && + ifa->ifa_ifp->if_type == IFT_CELLULAR) { + IFA_REMREF(ifa); + error = EADDRNOTAVAIL; + goto bad; + } + + /* + * If the source address is spoofed (in the case + * of IP_RAWOUTPUT), or if this is destined for + * local/loopback, just let it go out using the + * interface of the route. Otherwise, there's no + * interface having such an address, so bail out. + */ + if (ifa == NULL && !(flags & IP_RAWOUTPUT) && + ifscope != lo_ifp->if_index) { + error = EADDRNOTAVAIL; + goto bad; + } + + /* + * If the caller didn't explicitly specify the scope, + * pick it up from the source interface. If the cached + * route was wrong and was blown away as part of source + * interface selection, don't mask out RTF_PRCLONING + * since that route may have been allocated by the ULP, + * unless the IP header was created by the caller or + * the destination is IPv4 LLA. The check for the + * latter is needed because IPv4 LLAs are never scoped + * in the current implementation, and we don't want to + * replace the resolved IPv4 LLA route with one whose + * gateway points to that of the default gateway on + * the primary interface of the system. 
+ */ + if (ifa != NULL) { + if (ifscope == IFSCOPE_NONE) + ifscope = ifa->ifa_ifp->if_index; + IFA_REMREF(ifa); + cloneok = (!(flags & IP_RAWOUTPUT) && + !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); + } + } + /* * If this is the case, we probably don't want to allocate * a protocol-cloned route since we didn't get one from the @@ -385,29 +660,95 @@ loopit: * the link layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ - if (ro->ro_rt == 0) - rtalloc_ign(ro, RTF_PRCLONING); - if (ro->ro_rt == 0) { - ipstat.ips_noroute++; + if (ro->ro_rt == NULL) { + unsigned long ign = RTF_PRCLONING; + /* + * We make an exception here: if the destination + * address is INADDR_BROADCAST, allocate a protocol- + * cloned host route so that we end up with a route + * marked with the RTF_BROADCAST flag. Otherwise, + * we would end up referring to the default route, + * instead of creating a cloned host route entry. + * That would introduce inconsistencies between ULPs + * that allocate a route and those that don't. The + * RTF_BROADCAST route is important since we'd want + * to send out undirected IP broadcast packets using + * link-level broadcast address. Another exception + * is for ULP-created routes that got blown away by + * source interface selection (see above). + * + * These exceptions will no longer be necessary when + * the RTF_PRCLONING scheme is no longer present. + */ + if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) + ign &= ~RTF_PRCLONING; + + /* + * Loosen the route lookup criteria if the ifscope + * corresponds to the loopback interface; this is + * needed to support Application Layer Gateways + * listening on loopback, in conjunction with packet + * filter redirection rules. The final source IP + * address will be rewritten by the packet filter + * prior to the RFC1122 loopback check below. 
+ */ + if (ifscope == lo_ifp->if_index) + rtalloc_ign(ro, ign); + else + rtalloc_scoped_ign(ro, ign, ifscope); + + /* + * If the route points to a cellular interface and the + * caller forbids our using interfaces of such type, + * pretend that there is no route. + */ + if (nocell && ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + if (ro->ro_rt->rt_ifp->if_type == + IFT_CELLULAR) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } else { + RT_UNLOCK(ro->ro_rt); + } + } + } + + if (ro->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; goto bad; } + if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; - if (ro->ro_rt->rt_flags & RTF_HOST) + if (ro->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); - else + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + RT_UNLOCK(ro->ro_rt); } + if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; + u_int32_t vif; + u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; + u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; m->m_flags |= M_MCAST; /* @@ -420,22 +761,30 @@ loopit: * See if the caller provided any multicast options */ if (imo != NULL) { - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl; - if (imo->imo_multicast_ifp != NULL) { + IMO_LOCK(imo); + vif = imo->imo_multicast_vif; + ttl = imo->imo_multicast_ttl; + loop = imo->imo_multicast_loop; + if ((flags & IP_RAWOUTPUT) == 0) + ip->ip_ttl = ttl; + if (imo->imo_multicast_ifp != NULL) ifp = imo->imo_multicast_ifp; - } - if (imo->imo_multicast_vif != -1 && - ((flags & 
IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY)) - ip->ip_src.s_addr = - ip_mcast_src(imo->imo_multicast_vif); - } else - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + IMO_UNLOCK(imo); +#if MROUTING + if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 || + ip->ip_src.s_addr == INADDR_ANY)) + ip->ip_src.s_addr = ip_mcast_src(vif); +#endif /* MROUTING */ + } else if ((flags & IP_RAWOUTPUT) == 0) { + vif = -1; + ip->ip_ttl = ttl; + } /* * Confirm that the outgoing interface supports multicast. */ - if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if (imo == NULL || vif == -1) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { - ipstat.ips_noroute++; + OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; } @@ -445,25 +794,28 @@ loopit: * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { - register struct in_ifaddr *ia1; - - TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) + struct in_ifaddr *ia1; + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia1->ia_ifa); if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; - + IFA_UNLOCK(&ia1->ia_ifa); break; } + IFA_UNLOCK(&ia1->ia_ifa); + } + lck_rw_done(in_ifaddr_rwlock); if (ip->ip_src.s_addr == INADDR_ANY) { error = ENETUNREACH; goto bad; } } - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(pkt_dst, ifp, inm); - ifnet_lock_done(ifp); - if (inm != NULL && - (imo == NULL || imo->imo_multicast_loop)) { + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&pkt_dst, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL && (imo == NULL || loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not @@ -472,22 +824,23 @@ loopit: if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - struct ipf_pktopts *ippo = 0, ipf_pktopts; - if (imo) { - ippo = &ipf_pktopts; - ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp; - 
ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl; - ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop; + if (imo != NULL) { + ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS; + ipf_pktopts.ippo_mcast_ifnet = ifp; + ipf_pktopts.ippo_mcast_ttl = ttl; + ipf_pktopts.ippo_mcast_loop = loop; } - - lck_mtx_unlock(ip_mutex); + ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { if ((struct ipfilter *)inject_filter_ref == filter) @@ -497,26 +850,31 @@ loopit: result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); + INM_REMREF(inm); goto done; } if (result != 0) { ipf_unref(); - lck_mtx_lock(ip_mutex); + INM_REMREF(inm); goto bad; } } } - + /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); - - lck_mtx_lock(ip_mutex); +#endif + ipf_unref(); didfilter = 1; } ip_mloopback(ifp, m, dst, hlen); } +#if MROUTING else { /* * If we are acting as a multicast router, perform @@ -538,15 +896,18 @@ loopit: * as prescribed by rsvpd. */ if (!rsvp_on) - imo = NULL; + imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); - lck_mtx_unlock(ip_mutex); + if (inm != NULL) + INM_REMREF(inm); goto done; } } } - +#endif /* MROUTING */ + if (inm != NULL) + INM_REMREF(inm); /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. @@ -557,7 +918,6 @@ loopit: */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); - lck_mtx_unlock(ip_mutex); goto done; } @@ -569,7 +929,9 @@ loopit: * of outgoing interface. 
*/ if (ip->ip_src.s_addr == INADDR_ANY) { + IFA_LOCK_SPIN(&ia->ia_ifa); ip->ip_src = IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); #if IPFIREWALL_FORWARD /* Keep note that we did this - if the firewall changes * the next-hop, our interface may change, changing the @@ -606,6 +968,33 @@ loopit: } sendit: +#if PF + /* Invoke outbound packet filter */ + if ( PF_IS_ENABLED) { + int rc; + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE); + if (rc != 0) { + if (packetlist == m0) { + packetlist = m; + mppn = NULL; + } + if (m != NULL) { + m0 = m; + /* Next packet in the chain */ + goto loopit; + } else if (packetlist != NULL) { + /* No more packet; send down the chain */ + goto sendchain; + } + /* Nothing left; we're done */ + goto done; + } + m0 = m; + ip = mtod(m, struct ip *); + pkt_dst = ip->ip_dst; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + } +#endif /* PF */ /* * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt */ @@ -617,43 +1006,56 @@ sendit: } } -injectit: if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - - lck_mtx_unlock(ip_mutex); + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + + /* Check that a TSO frame isn't passed to a filter. + * This could happen if a filter is inserted while + * TCP is sending the TSO packet. 
+ */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; + } + ipf_ref(); /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { if ((struct ipfilter *)inject_filter_ref == filter) seen = 1; } else if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; } if (result != 0) { ipf_unref(); - lck_mtx_lock(ip_mutex); goto bad; } } } /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); - +#endif + ipf_unref(); - lck_mtx_lock(ip_mutex); } #if IPSEC @@ -664,7 +1066,6 @@ injectit: KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); - lck_mtx_lock(sadb_mutex); /* get SP for this packet */ if (so == NULL) @@ -673,9 +1074,8 @@ injectit: sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { - ipsecstat.out_inval++; + IPSEC_STAT_INCREMENT(ipsecstat.out_inval); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto bad; } @@ -684,19 +1084,18 @@ injectit: /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: + case IPSEC_POLICY_GENERATE: /* * This packet is just discarded. */ - ipsecstat.out_polvio++; + IPSEC_STAT_INCREMENT(ipsecstat.out_polvio); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. 
*/ KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto skip_ipsec; case IPSEC_POLICY_IPSEC: @@ -704,7 +1103,6 @@ injectit: /* acquire a policy */ error = key_spdacquire(sp); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto bad; } break; @@ -735,13 +1133,17 @@ injectit: m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); - lck_mtx_unlock(ip_mutex); error = ipsec4_output(&state, sp, flags); - lck_mtx_unlock(sadb_mutex); - lck_mtx_lock(ip_mutex); m0 = m = state.m; @@ -790,76 +1192,105 @@ injectit: hlen = ip->ip_hl << 2; #endif /* Check that there wasn't a route change and src is still valid */ - - if (ro->ro_rt && ro->ro_rt->generation_id != route_generation) { - if (ifa_foraddr(ip->ip_src.s_addr) == 0 && ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { - error = EADDRNOTAVAIL; - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 5,0,0,0,0); + if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) { + if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL && + ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { + error = EADDRNOTAVAIL; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 5,0,0,0,0); goto bad; } rtfree(ro->ro_rt); ro->ro_rt = NULL; + if (src_ia != NULL) + IFA_REMREF(&src_ia->ia_ifa); } if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { - printf("ip_output: " - "can't update route after IPsec processing\n"); - error = EHOSTUNREACH; /*XXX*/ - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 6,0,0,0,0); + printf("ip_output: can't update route after " + "IPsec processing\n"); + error = EHOSTUNREACH; /*XXX*/ + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 6,0,0,0,0); goto bad; } } else { if (ia) - 
ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; + RT_UNLOCK(ro->ro_rt); } /* make it flipped, again. */ + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); +#endif + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); /* Pass to filters again */ if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; - lck_mtx_unlock(ip_mutex); + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + + /* Check that a TSO frame isn't passed to a filter. + * This could happen if a filter is inserted while + * TCP is sending the TSO packet. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; + } + ipf_ref(); /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; } if (result != 0) { ipf_unref(); - lck_mtx_lock(ip_mutex); goto bad; } } } /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); - +#endif + ipf_unref(); - lck_mtx_lock(ip_mutex); } skip_ipsec: #endif /*IPSEC*/ +#if IPFIREWALL /* * IpHack's section. * - Xlate: translate packet's addr/port (NAT). 
@@ -871,7 +1302,6 @@ skip_ipsec: struct mbuf *m1 = m; if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) { - lck_mtx_unlock(ip_mutex); goto done; } ip = mtod(m0 = m = m1, struct ip *); @@ -887,7 +1317,6 @@ skip_ipsec: args.m = m; args.next_hop = dst; args.oif = ifp; - lck_mtx_unlock(ip_mutex); off = ip_fw_chk_ptr(&args); m = args.m; dst = args.next_hop; @@ -914,31 +1343,32 @@ skip_ipsec: goto done ; } ip = mtod(m, struct ip *); + if (off == 0 && dst == old) {/* common case */ - lck_mtx_lock(ip_mutex); goto pass ; } #if DUMMYNET if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { - /* - * pass the pkt to dummynet. Need to include - * pipe number, m, ifp, ro, dst because these are - * not recomputed in the next pass. - * All other parameters have been already used and - * so they are not needed anymore. - * XXX note: if the ifp or ro entry are deleted - * while a pkt is in dummynet, we are in trouble! - */ - args.ro = ro; - args.dst = dst; - args.flags = flags; - - error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, - &args); - goto done; + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! 
+ */ + args.ro = ro; + args.dst = dst; + args.flags = flags; + if (flags & IP_OUTARGS) + args.ipoa = ipoa; + + error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, + &args); + goto done; } #endif /* DUMMYNET */ - lck_mtx_lock(ip_mutex); #if IPDIVERT if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { struct mbuf *clone = NULL; @@ -957,8 +1387,11 @@ skip_ipsec: } /* Restore packet header fields to original values */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif /* Deliver packet to divert input routine */ divert_packet(m, 0, off & 0xffff, args.divert_rule); @@ -969,7 +1402,6 @@ skip_ipsec: ip = mtod(m, struct ip *); goto pass; } - lck_mtx_unlock(ip_mutex); goto done; } #endif @@ -1008,35 +1440,42 @@ skip_ipsec: * as the packet runs through ip_input() as * it is done through a ISR. */ + lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) { /* * If the addr to forward to is one * of ours, we pretend to * be the destination for this packet. 
*/ + IFA_LOCK_SPIN(&ia_fw->ia_ifa); if (IA_SIN(ia_fw)->sin_addr.s_addr == - dst->sin_addr.s_addr) + dst->sin_addr.s_addr) { + IFA_UNLOCK(&ia_fw->ia_ifa); break; + } + IFA_UNLOCK(&ia_fw->ia_ifa); } - if (ia) { + lck_rw_done(in_ifaddr_rwlock); + if (ia_fw) { /* tell ip_input "dont filter" */ struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof(struct sockaddr_in), M_NOWAIT); + + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, + sizeof (*ipfwd_tag), M_NOWAIT, m); if (fwd_tag == NULL) { error = ENOBUFS; goto bad; } - + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); ipfwd_tag->next_hop = args.next_hop; m_tag_prepend(m, fwd_tag); if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = ifunit("lo0"); + m->m_pkthdr.rcvif = lo_ifp; if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & m->m_pkthdr.csum_flags) == 0) { if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { @@ -1053,10 +1492,11 @@ skip_ipsec: m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; ip->ip_sum = in_cksum(m, hlen); } + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - - lck_mtx_unlock(ip_mutex); +#endif /* we need to call dlil_output to run filters * and resync to avoid recursion loops. 
@@ -1077,25 +1517,35 @@ skip_ipsec: */ bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); - ro_fwd->ro_rt = 0; + ro_fwd->ro_rt = NULL; rtalloc_ign(ro_fwd, RTF_PRCLONING); - if (ro_fwd->ro_rt == 0) { - ipstat.ips_noroute++; + if (ro_fwd->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; goto bad; } + RT_LOCK_SPIN(ro_fwd->ro_rt); ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); + if (ia_fw != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); + IFA_ADDREF(&ia_fw->ia_ifa); + } ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; - if (ro_fwd->ro_rt->rt_flags & RTF_HOST) + if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); - else + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + RT_UNLOCK(ro_fwd->ro_rt); rtfree(ro->ro_rt); ro->ro_rt = ro_fwd->ro_rt; dst = (struct sockaddr_in *)&ro_fwd->ro_dst; @@ -1105,8 +1555,14 @@ skip_ipsec: * which would have been gotten from the-then * interface, do it again, from the new one. 
*/ - if (fwd_rewrite_src) - ip->ip_src = IA_SIN(ia_fw)->sin_addr; + if (ia_fw != NULL) { + if (fwd_rewrite_src) { + IFA_LOCK_SPIN(&ia_fw->ia_ifa); + ip->ip_src = IA_SIN(ia_fw)->sin_addr; + IFA_UNLOCK(&ia_fw->ia_ifa); + } + IFA_REMREF(&ia_fw->ia_ifa); + } goto pass ; } #endif /* IPFIREWALL_FORWARD */ @@ -1116,17 +1572,17 @@ skip_ipsec: */ m_freem(m); error = EACCES; /* not sure this is the right error msg */ - lck_mtx_unlock(ip_mutex); goto done; } pass: +#endif /* IPFIREWALL */ #if __APPLE__ /* Do not allow loopback address to wind up on a wire */ if ((ifp->if_flags & IFF_LOOPBACK) == 0 && ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { - ipstat.ips_badaddr++; + OSAddAtomic(1, &ipstat.ips_badaddr); m_freem(m); /* * Do not simply drop the packet just like a firewall -- we want the @@ -1136,11 +1592,12 @@ pass: * loopback as the source address. */ error = ENETUNREACH; - lck_mtx_unlock(ip_mutex); goto done; } #endif m->m_pkthdr.csum_flags |= CSUM_IP; + tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4); + sw_csum = m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); @@ -1164,6 +1621,9 @@ pass: /* let the software handle any UDP or TCP checksums */ sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); } + } else if (apple_hwcksum_tx == 0) { + sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) & + m->m_pkthdr.csum_flags; } if (sw_csum & CSUM_DELAY_DATA) { @@ -1171,17 +1631,29 @@ pass: sw_csum &= ~CSUM_DELAY_DATA; m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } - - m->m_pkthdr.csum_flags &= IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + + if (apple_hwcksum_tx != 0) { + m->m_pkthdr.csum_flags &= + IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + } else { + m->m_pkthdr.csum_flags = 0; + } /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. 
*/ - if ((u_short)ip->ip_len <= ifp->if_mtu || + if ((u_short)ip->ip_len <= ifp->if_mtu || tso || ifp->if_hwassist & CSUM_FRAGMENT) { + if (tso) + m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; + + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); @@ -1201,21 +1673,29 @@ pass: ipsec_delaux(m); #endif if (packetchain == 0) { - lck_mtx_unlock(ip_mutex); - error = dlil_output(ifp, PF_INET, m, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); - goto done; + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); + error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst); + goto done; } else { /* packet chaining allows us to reuse the route for all packets */ + bytecnt += m->m_pkthdr.len; + mppn = &m->m_nextpkt; m = m->m_nextpkt; if (m == NULL) { +#if PF +sendchain: +#endif /* PF */ if (pktcnt > ip_maxchainsent) ip_maxchainsent = pktcnt; + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0); //send - lck_mtx_unlock(ip_mutex); - error = dlil_output_list(ifp, PF_INET, packetlist, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); + error = ifnet_output(ifp, PF_INET, packetlist, + ro->ro_rt, (struct sockaddr *)dst); pktcnt = 0; + bytecnt = 0; goto done; } @@ -1228,7 +1708,9 @@ pass: * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ - if (ip->ip_off & IP_DF) { + + if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) || + pktcnt > 0) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU @@ -1237,40 +1719,117 @@ pass: * them, there is no way for one to update all its * routes when the MTU is changed. 
*/ - if (ro->ro_rt && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) - && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) - && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { - ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + if (ro->ro_rt) { + RT_LOCK_SPIN(ro->ro_rt); + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + RT_UNLOCK(ro->ro_rt); } - ipstat.ips_cantfrag++; + if (pktcnt > 0) { + m0 = packetlist; + } + OSAddAtomic(1, &ipstat.ips_cantfrag); goto bad; } - len = (ifp->if_mtu - hlen) &~ 7; - if (len < 8) { - error = EMSGSIZE; + + error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum); + if (error != 0) { + m0 = m = NULL; goto bad; } - /* - * if the interface will not calculate checksums on - * fragmented packets, then do it here. - */ - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && - (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { - in_delayed_cksum(m); - if (m == NULL) { - lck_mtx_unlock(ip_mutex); - return(ENOMEM); - } - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; +#if IPSEC + /* clean ipsec history once it goes out of the node */ + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) + ipsec_delaux(m); +#endif + if (error == 0) { +#ifndef __APPLE__ + /* Record statistics for this interface address. 
*/ + if (ia != NULL) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } +#endif + if ((packetchain != 0) && (pktcnt > 0)) + panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); + error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst); + } else + m_freem(m); } + if (error == 0) + OSAddAtomic(1, &ipstat.ips_fragmented); - { - int mhlen, firstlen = len; - struct mbuf **mnext = &m->m_nextpkt; +done: + if (ia) { + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + if (ro == &iproute && ro->ro_rt) { + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + if (sp != NULL) { + KEYDEBUG(KEYDEBUG_IPSEC_STAMP, + printf("DP ip_output call free SP:%x\n", sp)); + key_freesp(sp, KEY_SADB_UNLOCKED); + } + } +#endif /* IPSEC */ + + KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); + return (error); +bad: + m_freem(m0); + goto done; +} + +int +ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) +{ + struct ip *ip, *mhip; + int len, hlen, mhlen, firstlen, off, error = 0; + struct mbuf **mnext = &m->m_nextpkt, *m0; int nfrags = 1; + ip = mtod(m, struct ip *); +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + + firstlen = len = (mtu - hlen) &~ 7; + if (len < 8) { + m_freem(m); + return (EMSGSIZE); + } + + /* + * if the interface will not calculate checksums on + * fragmented packets, then do it here. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && + (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. 
@@ -1278,10 +1837,10 @@ pass: m0 = m; mhlen = sizeof (struct ip); for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { - MGETHDR(m, M_DONTWAIT, MT_HEADER); + MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (m == 0) { error = ENOBUFS; - ipstat.ips_odropped++; + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; @@ -1305,14 +1864,21 @@ pass: if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; /* ??? */ - ipstat.ips_odropped++; + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = 0; m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; +#if CONFIG_MACF_NET + mac_netinet_fragment(m0, m); +#endif + +#if BYTE_ORDER != BIG_ENDIAN HTONS(mhip->ip_off); +#endif + mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { mhip->ip_sum = in_cksum(m, mhlen); @@ -1321,7 +1887,7 @@ pass: mnext = &m->m_nextpkt; nfrags++; } - ipstat.ips_ofragments += nfrags; + OSAddAtomic(nfrags, &ipstat.ips_ofragments); /* set first/last markers for fragment chain */ m->m_flags |= M_LASTFRAG; @@ -1337,71 +1903,36 @@ pass: m->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m->m_pkthdr.len); ip->ip_off |= IP_MF; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); } sendorfree: + if (error) + m_freem_list(m0); - KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, - ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - - lck_mtx_unlock(ip_mutex); - for (m = m0; m; m = m0) { - m0 = m->m_nextpkt; - m->m_nextpkt = 0; -#if IPSEC - /* clean ipsec history once it goes out of the node */ - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) - ipsec_delaux(m); -#endif - if (error == 0) { -#ifndef __APPLE__ - /* Record statistics for this interface address. 
*/ - if (ia != NULL) { - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; - } -#endif - if ((packetchain != 0) && (pktcnt > 0)) - panic("ip_output: mix of packet in packetlist is wrong=%x", packetlist); - error = dlil_output(ifp, PF_INET, m, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); - } else - m_freem(m); - } + return (error); +} - if (error == 0) - ipstat.ips_fragmented++; - } -done: - if (ia) { - ifafree(&ia->ia_ifa); - ia = NULL; - } -#if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - if (ro == &iproute && ro->ro_rt) { - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - } - if (sp != NULL) { - KEYDEBUG(KEYDEBUG_IPSEC_STAMP, - printf("DP ip_output call free SP:%x\n", sp)); - lck_mtx_lock(sadb_mutex); - key_freesp(sp); - lck_mtx_unlock(sadb_mutex); - } +static void +ip_out_cksum_stats(int proto, u_int32_t len) +{ + switch (proto) { + case IPPROTO_TCP: + tcp_out_cksum_stats(len); + break; + case IPPROTO_UDP: + udp_out_cksum_stats(len); + break; + default: + /* keep only TCP or UDP stats for now */ + break; } -#endif /* IPSEC */ - - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); - return (error); -bad: - m_freem(m0); - lck_mtx_unlock(ip_mutex); - goto done; } void @@ -1410,13 +1941,16 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) struct ip *ip; unsigned char buf[sizeof(struct ip)]; u_short csum, offset, ip_len; - struct mbuf *m = m0; - + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf *m = m0; + int ip_offset_copy = ip_offset; + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; if (m == NULL) { - printf("in_delayed_cksum_offset failed - ip_offset wasn't in the packet\n"); + printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n"); return; } } @@ -1427,7 +1961,7 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) printf("delayed m_pullup, m->len: %d off: %d\n", m->m_len, ip_offset); #endif - m_copydata(m, 
ip_offset, sizeof(struct ip), buf); + m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); ip = (struct ip *)buf; } else { @@ -1452,18 +1986,21 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) * is bogus and we give up. */ ip_len = ip->ip_len; - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { ip_len = SWAP16(ip_len); - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { printf("in_delayed_cksum_offset: ip_len %d (%d) " "doesn't match actual length %d\n", ip->ip_len, - ip_len, (m0->m_pkthdr.len - ip_offset)); + ip_len, (m0->m_pkthdr.len - ip_offset_copy)); return; } } csum = in_cksum_skip(m, ip_len, offset); + /* Update stats */ + ip_out_cksum_stats(ip->ip_p, ip_len - offset); + if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m0->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */ @@ -1506,6 +2043,10 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) int hlen = 0; unsigned char buf[sizeof(struct ip)]; int swapped = 0; + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf* m0 = m; + size_t ip_offset_copy = ip_offset; while (ip_offset >= m->m_len) { ip_offset -= m->m_len; @@ -1520,10 +2061,10 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) if (ip_offset + sizeof(struct ip) > m->m_len) { #if DEBUG - printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %d\n", + printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %lu\n", m->m_len, ip_offset); #endif - m_copydata(m, ip_offset, sizeof(struct ip), buf); + m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); ip = (struct ip *)buf; ip->ip_sum = 0; @@ -1553,15 +2094,15 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) * the length and check again. If it still fails, then the packet * is bogus and we give up. 
*/ - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); swapped = 1; - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); printf("in_cksum_offset: ip_len %d (%d) " - "doesn't match actual length %d\n", + "doesn't match actual length %lu\n", ip->ip_len, SWAP16(ip->ip_len), - (m->m_pkthdr.len - ip_offset)); + (m0->m_pkthdr.len - ip_offset_copy)); return; } } @@ -1584,7 +2125,7 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) char tmp[2]; #if DEBUG - printf("in_cksum_offset m_copyback, m->len: %d off: %d p: %d\n", + printf("in_cksum_offset m_copyback, m->len: %u off: %lu p: %d\n", m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); #endif *(u_short *)tmp = ip->ip_sum; @@ -1616,10 +2157,13 @@ ip_insertoptions(m, opt, phlen) if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { - MGETHDR(n, M_DONTWAIT, MT_HEADER); + MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (n == 0) return (m); n->m_pkthdr.rcvif = 0; +#if CONFIG_MACF_NET + mac_mbuf_label_copy(m, n); +#endif n->m_pkthdr.len = m->m_pkthdr.len + optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); @@ -1717,7 +2261,8 @@ ip_ctloutput(so, sopt) error = EMSGSIZE; break; } - MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_HEADER); + MGET(m, sopt->sopt_p != kernproc ? 
M_WAIT : M_DONTWAIT, + MT_HEADER); if (m == 0) { error = ENOBUFS; break; @@ -1742,6 +2287,7 @@ ip_ctloutput(so, sopt) #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: #endif + case IP_RECVPKTINFO: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1786,17 +2332,99 @@ ip_ctloutput(so, sopt) OPTSET(INP_FAITH); break; #endif + case IP_RECVPKTINFO: + OPTSET(INP_PKTINFO); + break; } break; #undef OPTSET +#if CONFIG_FORCE_OUT_IFP + /* + * Apple private interface, similar to IP_BOUND_IF, except + * that the parameter is a NULL-terminated string containing + * the name of the network interface; an emptry string means + * unbind. Applications are encouraged to use IP_BOUND_IF + * instead, as that is the current "official" API. + */ + case IP_FORCE_OUT_IFP: { + char ifname[IFNAMSIZ]; + unsigned int ifscope; + + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + /* Verify interface name parameter is sane */ + if (sopt->sopt_valsize > sizeof(ifname)) { + error = EINVAL; + break; + } + + /* Copy the interface name */ + if (sopt->sopt_valsize != 0) { + error = sooptcopyin(sopt, ifname, + sizeof (ifname), sopt->sopt_valsize); + if (error) + break; + } + + if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { + /* Unbind this socket from any interface */ + ifscope = IFSCOPE_NONE; + } else { + ifnet_t ifp; + + /* Verify name is NULL terminated */ + if (ifname[sopt->sopt_valsize - 1] != '\0') { + error = EINVAL; + break; + } + + /* Bail out if given bogus interface name */ + if (ifnet_find_by_name(ifname, &ifp) != 0) { + error = ENXIO; + break; + } + + /* Bind this socket to this interface */ + ifscope = ifp->if_index; + + /* + * Won't actually free; since we don't release + * this later, we should do it now. + */ + ifnet_release(ifp); + } + inp_bindif(inp, ifscope); + } + break; +#endif + /* + * Multicast socket options are processed by the in_mcast + * module. 
+ */ case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: - error = ip_setmoptions(sopt, &inp->inp_moptions); + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: @@ -1836,29 +2464,97 @@ ip_ctloutput(so, sopt) struct mbuf *m; int optname; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; - priv = (sopt->sopt_p != NULL && - proc_suser(sopt->sopt_p) != 0) ? 0 : 1; + priv = (proc_suser(sopt->sopt_p) == 0); if (m) { req = mtod(m, caddr_t); len = m->m_len; } optname = sopt->sopt_name; - lck_mtx_lock(sadb_mutex); error = ipsec4_set_policy(inp, optname, req, len, priv); - lck_mtx_unlock(sadb_mutex); m_freem(m); break; } #endif /*IPSEC*/ +#if TRAFFIC_MGT + case IP_TRAFFIC_MGT_BACKGROUND: + { + unsigned background = 0; + error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background)); + if (error) + break; + + if (background) { + socket_set_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); + } else { + socket_clear_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); + } + + break; + } +#endif /* TRAFFIC_MGT */ + + /* + * On a multihomed system, scoped routing can be used to + * restrict the source interface used for sending packets. + * The socket option IP_BOUND_IF binds a particular AF_INET + * socket to an interface such that data sent on the socket + * is restricted to that interface. 
This is unlike the + * SO_DONTROUTE option where the routing table is bypassed; + * therefore it allows for a greater flexibility and control + * over the system behavior, and does not place any restriction + * on the destination address type (e.g. unicast, multicast, + * or broadcast if applicable) or whether or not the host is + * directly reachable. Note that in the multicast transmit + * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over + * IP_BOUND_IF, since the former practically bypasses the + * routing table; in this case, IP_BOUND_IF sets the default + * interface used for sending multicast packets in the absence + * of an explicit multicast transmit interface. + */ + case IP_BOUND_IF: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + inp_bindif(inp, optval); + break; + + case IP_NO_IFT_CELLULAR: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + error = inp_nocellular(inp, optval); + break; + + case IP_OUT_IF: + /* This option is not settable */ + error = EINVAL; + break; + default: error = ENOPROTOOPT; break; @@ -1889,6 +2585,7 @@ ip_ctloutput(so, sopt) #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: #endif + case IP_RECVPKTINFO: switch (sopt->sopt_name) { case IP_TOS: @@ -1935,17 +2632,20 @@ ip_ctloutput(so, sopt) optval = OPTBIT(INP_FAITH); break; #endif + case IP_RECVPKTINFO: + optval = OPTBIT(INP_PKTINFO); + break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - error = ip_getmoptions(sopt, inp->inp_moptions); + case IP_MSFILTER: 
+ error = inp_getmoptions(inp, sopt); break; #if IPSEC @@ -1959,9 +2659,7 @@ ip_ctloutput(so, sopt) req = mtod(m, caddr_t); len = m->m_len; } - lck_mtx_lock(sadb_mutex); error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); - lck_mtx_unlock(sadb_mutex); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) @@ -1970,6 +2668,31 @@ ip_ctloutput(so, sopt) } #endif /*IPSEC*/ +#if TRAFFIC_MGT + case IP_TRAFFIC_MGT_BACKGROUND: + { + unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); + return (sooptcopyout(sopt, &background, sizeof(background))); + break; + } +#endif /* TRAFFIC_MGT */ + + case IP_BOUND_IF: + if (inp->inp_flags & INP_BOUND_IF) + optval = inp->inp_boundif; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_NO_IFT_CELLULAR: + optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_OUT_IF: + optval = inp->inp_last_outif; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -1985,10 +2708,10 @@ ip_ctloutput(so, sopt) * with destination address if source routed. */ static int -ip_pcbopts(optname, pcbopt, m) - int optname; - struct mbuf **pcbopt; - register struct mbuf *m; +ip_pcbopts( + __unused int optname, + struct mbuf **pcbopt, + register struct mbuf *m) { register int cnt, optlen; register u_char *cp; @@ -2084,460 +2807,138 @@ bad: return (EINVAL); } -/* - * XXX - * The whole multicast option thing needs to be re-thought. - * Several of these options are equally applicable to non-multicast - * transmission, and one (IP_MULTICAST_TTL) totally duplicates a - * standard option (IP_TTL). - */ - -/* - * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
- */ -static struct ifnet * -ip_multicast_if(a, ifindexp) - struct in_addr *a; - int *ifindexp; +void +ip_moptions_init(void) { - int ifindex; - struct ifnet *ifp; + PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug)); - if (ifindexp) - *ifindexp = 0; - if (ntohl(a->s_addr) >> 24 == 0) { - ifindex = ntohl(a->s_addr) & 0xffffff; - ifnet_head_lock_shared(); - if (ifindex < 0 || if_index < ifindex) { - ifnet_head_done(); - return NULL; - } - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - if (ifindexp) - *ifindexp = ifindex; - } else { - INADDR_TO_IFP(*a, ifp); + imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) : + sizeof (struct ip_moptions_dbg); + + imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0, + IMO_ZONE_NAME); + if (imo_zone == NULL) { + panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME); + /* NOTREACHED */ } - return ifp; + zone_change(imo_zone, Z_EXPAND, TRUE); } -/* - * Set the IP multicast options in response to user setsockopt(). - */ -static int -ip_setmoptions(sopt, imop) - struct sockopt *sopt; - struct ip_moptions **imop; +void +imo_addref(struct ip_moptions *imo, int locked) { - int error = 0; - int i; - struct in_addr addr; - struct ip_mreq mreq; - struct ifnet *ifp = NULL; - struct ip_moptions *imo = *imop; - int ifindex; + if (!locked) + IMO_LOCK(imo); + else + IMO_LOCK_ASSERT_HELD(imo); - if (imo == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. 
- */ - error = ip_createmoptions(imop); - if (error != 0) - return error; - imo = *imop; + if (++imo->imo_refcnt == 0) { + panic("%s: imo %p wraparound refcnt\n", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, TRUE); } - switch (sopt->sopt_name) { - /* store an index number for the vif you wanna use in the send */ - case IP_MULTICAST_VIF: - if (legal_vif_num == 0) { - error = EOPNOTSUPP; - break; - } - error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - if (error) - break; - if (!legal_vif_num(i) && (i != -1)) { - error = EINVAL; - break; - } - imo->imo_multicast_vif = i; - break; - - case IP_MULTICAST_IF: - /* - * Select the interface for outgoing multicast packets. - */ - error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); - if (error) - break; - /* - * INADDR_ANY is used to remove a previous selection. - * When no interface is selected, a default one is - * chosen every time a multicast packet is sent. - */ - if (addr.s_addr == INADDR_ANY) { - imo->imo_multicast_ifp = NULL; - break; - } - /* - * The selected interface is identified by its local - * IP address. Find the interface and confirm that - * it supports multicasting. - */ - ifp = ip_multicast_if(&addr, &ifindex); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - imo->imo_multicast_ifp = ifp; - if (ifindex) - imo->imo_multicast_addr = addr; - else - imo->imo_multicast_addr.s_addr = INADDR_ANY; - break; - - case IP_MULTICAST_TTL: - /* - * Set the IP time-to-live for outgoing multicast packets. - * The original multicast API required a char argument, - * which is inconsistent with the rest of the socket API. - * We allow either a char or an int. 
- */ - if (sopt->sopt_valsize == 1) { - u_char ttl; - error = sooptcopyin(sopt, &ttl, 1, 1); - if (error) - break; - imo->imo_multicast_ttl = ttl; - } else { - u_int ttl; - error = sooptcopyin(sopt, &ttl, sizeof ttl, - sizeof ttl); - if (error) - break; - if (ttl > 255) - error = EINVAL; - else - imo->imo_multicast_ttl = ttl; - } - break; - - case IP_MULTICAST_LOOP: - /* - * Set the loopback flag for outgoing multicast packets. - * Must be zero or one. The original multicast API required a - * char argument, which is inconsistent with the rest - * of the socket API. We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char loop; - error = sooptcopyin(sopt, &loop, 1, 1); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } else { - u_int loop; - error = sooptcopyin(sopt, &loop, sizeof loop, - sizeof loop); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } - break; - - case IP_ADD_MEMBERSHIP: - /* - * Add a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_addmembership(imo, &mreq); - break; + if (!locked) + IMO_UNLOCK(imo); +} - case IP_DROP_MEMBERSHIP: - /* - * Drop a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_dropmembership(imo, &mreq); - break; /* + * imo_remref: drop one reference on an ip_moptions structure. + * + * Takes IMO_LOCK, panics if the reference count is already zero, and + * invokes the imo_trace callback (with FALSE) when one is installed. + * If references remain after the decrement, the lock is released and + * the function returns. Otherwise, for every membership slot the + * source filter (if any) is left and purged, the group is left via + * in_leavegroup, and the inm reference is released; the imo_mfilters + * and imo_membership arrays are then freed, the lock is dropped and + * destroyed, and the structure is returned to imo_zone. Panics if + * the structure was not marked IFD_ALLOC. + */ +void +imo_remref(struct ip_moptions *imo) +{ + int i; - default: - error = EOPNOTSUPP; - break; + IMO_LOCK(imo); + if (imo->imo_refcnt == 0) { + panic("%s: imo %p negative refcnt", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, FALSE); } - /* - * If all options have default values, no need to keep the mbuf.
- */ - if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == -1 && - imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && - imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && - imo->imo_num_memberships == 0) { - FREE(*imop, M_IPMOPTS); - *imop = NULL; + --imo->imo_refcnt; + if (imo->imo_refcnt > 0) { + IMO_UNLOCK(imo); + return; } - return (error); -} + for (i = 0; i < imo->imo_num_memberships; ++i) { + struct in_mfilter *imf; -/* - * Set the IP multicast options in response to user setsockopt(). - */ -__private_extern__ int -ip_createmoptions( - struct ip_moptions **imop) -{ - struct ip_moptions *imo; - imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - - if (imo == NULL) - return (ENOBUFS); - *imop = imo; - imo->imo_multicast_ifp = NULL; - imo->imo_multicast_addr.s_addr = INADDR_ANY; - imo->imo_multicast_vif = -1; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - imo->imo_num_memberships = 0; - - return 0; -} + imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL; + if (imf != NULL) + imf_leave(imf); -/* - * Add membership to an IPv4 multicast. - */ -__private_extern__ int -ip_addmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - struct route ro; - struct sockaddr_in *dst; - struct ifnet *ifp = NULL; - int error = 0; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } - /* - * If no interface address was provided, use the interface of - * the route to the given multicast address.
- */ - if (mreq->imr_interface.s_addr == INADDR_ANY) { - bzero((caddr_t)&ro, sizeof(ro)); - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_len = sizeof(*dst); - dst->sin_family = AF_INET; - dst->sin_addr = mreq->imr_multiaddr; - rtalloc(&ro); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - rtfree(ro.ro_rt); - } - else { - /* If there's no default route, try using loopback */ - mreq->imr_interface.s_addr = INADDR_LOOPBACK; - } - } - - if (ifp == NULL) { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - } + (void) in_leavegroup(imo->imo_membership[i], imf); - /* - * See if we found an interface, and confirm that it - * supports multicast. - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - return error; - } - /* - * See if the membership already exists or if all the - * membership slots are full. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if (imo->imo_membership[i]->inm_ifp == ifp && - imo->imo_membership[i]->inm_addr.s_addr - == mreq->imr_multiaddr.s_addr) - break; - } - if (i < imo->imo_num_memberships) { - error = EADDRINUSE; - return error; + if (imf != NULL) + imf_purge(imf); + + INM_REMREF(imo->imo_membership[i]); + imo->imo_membership[i] = NULL; } - if (i == IP_MAX_MEMBERSHIPS) { - error = ETOOMANYREFS; - return error; + imo->imo_num_memberships = 0; + if (imo->imo_mfilters != NULL) { + FREE(imo->imo_mfilters, M_INMFILTER); + imo->imo_mfilters = NULL; } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - if ((imo->imo_membership[i] = - in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { - error = ENOBUFS; - return error; + if (imo->imo_membership != NULL) { + FREE(imo->imo_membership, M_IPMOPTS); + imo->imo_membership = NULL; } - ++imo->imo_num_memberships; - - return error; -} + IMO_UNLOCK(imo); -/* - * Drop membership of an IPv4 multicast.
- */ -__private_extern__ int -ip_dropmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - int error = 0; - struct ifnet* ifp = NULL; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } /* + * Reached only when the count dropped to zero above; the mutex has + * already been unlocked, so the lock itself is torn down before the + * structure is released. + */ + lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - */ - if (mreq->imr_interface.s_addr == INADDR_ANY) - ifp = NULL; - else { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - if (ifp == NULL) { - error = EADDRNOTAVAIL; - return error; - } - } - /* - * Find the membership in the membership array. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if ((ifp == NULL || - imo->imo_membership[i]->inm_ifp == ifp) && - imo->imo_membership[i]->inm_addr.s_addr == - mreq->imr_multiaddr.s_addr) - break; - } - if (i == imo->imo_num_memberships) { - error = EADDRNOTAVAIL; - return error; + if (!(imo->imo_debug & IFD_ALLOC)) { + panic("%s: imo %p cannot be freed", __func__, imo); + /* NOTREACHED */ } - /* - * Give up the multicast address record to which the - * membership points. - */ - in_delmulti(&imo->imo_membership[i]); - /* - * Remove the gap in the membership array. - */ - for (++i; i < imo->imo_num_memberships; ++i) - imo->imo_membership[i-1] = imo->imo_membership[i]; - --imo->imo_num_memberships; - - return error; + zfree(imo_zone, imo); } -/* - * Return the IP multicast options in response to user getsockopt().
- */ -static int -ip_getmoptions(sopt, imo) - struct sockopt *sopt; - register struct ip_moptions *imo; /* + * imo_trace: record a reference hold or release in the debug history. + * + * imo is reinterpreted as the enclosing struct ip_moptions_dbg (which + * embeds the ip_moptions as its first member); the function panics if + * IFD_DEBUG is not set on it. A non-zero refhold selects the hold + * counter and history array, zero selects the release ones. The + * counter is bumped with atomic_add_16_ov and its result, modulo + * IMO_TRACE_HIST_SIZE, indexes the slot passed to ctrace_record. + * NOTE(review): presumably atomic_add_16_ov returns the value before + * the increment -- confirm against its definition. + */ +static void +imo_trace(struct ip_moptions *imo, int refhold) { - struct in_addr addr; - struct in_ifaddr *ia; - int error, optval; - u_char coptval; - - error = 0; - switch (sopt->sopt_name) { - case IP_MULTICAST_VIF: - if (imo != NULL) - optval = imo->imo_multicast_vif; - else - optval = -1; - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - case IP_MULTICAST_IF: - if (imo == NULL || imo->imo_multicast_ifp == NULL) - addr.s_addr = INADDR_ANY; - else if (imo->imo_multicast_addr.s_addr) { - /* return the value user has set */ - addr = imo->imo_multicast_addr; - } else { - IFP_TO_IA(imo->imo_multicast_ifp, ia); - addr.s_addr = (ia == NULL) ? INADDR_ANY - : IA_SIN(ia)->sin_addr.s_addr; - } - error = sooptcopyout(sopt, &addr, sizeof addr); - break; - - case IP_MULTICAST_TTL: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_TTL; - else - optval = coptval = imo->imo_multicast_ttl; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - case IP_MULTICAST_LOOP: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_LOOP; - else - optval = coptval = imo->imo_multicast_loop; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - default: - error = ENOPROTOOPT; - break; + struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(imo->imo_debug & IFD_DEBUG)) { + panic("%s: imo %p has no debug structure", __func__, imo); + /* NOTREACHED */ + } + if (refhold) { + cnt = &imo_dbg->imo_refhold_cnt; + tr = imo_dbg->imo_refhold; + } else { + cnt = &imo_dbg->imo_refrele_cnt; + tr = imo_dbg->imo_refrele; } - return (error); + + idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]);
} -/* - * Discard the IP multicast options. - */ -void -ip_freemoptions(imo) - register struct ip_moptions *imo; /* + * ip_allocmoptions: allocate and initialize an ip_moptions structure. + * + * how selects the allocator: M_WAITOK uses zalloc, anything else uses + * zalloc_noblock, both on imo_zone. On success the object is zeroed + * for imo_size bytes, its mutex is initialized, IFD_ALLOC is set, and, + * when the global imo_debug is non-zero, IFD_DEBUG is set and imo_trace + * is installed as the trace callback. One reference is taken with + * IMO_ADDREF before returning. Returns NULL if the allocation failed. + */ +struct ip_moptions * +ip_allocmoptions(int how) { - register int i; + struct ip_moptions *imo; + imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone); if (imo != NULL) { - for (i = 0; i < imo->imo_num_memberships; ++i) - in_delmulti(&imo->imo_membership[i]); - FREE(imo, M_IPMOPTS); + bzero(imo, imo_size); + lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr); + imo->imo_debug |= IFD_ALLOC; + if (imo_debug != 0) { + imo->imo_debug |= IFD_DEBUG; + imo->imo_trace = imo_trace; + } + IMO_ADDREF(imo); } + + return (imo); } /* @@ -2556,83 +2957,370 @@ ip_mloopback(ifp, m, dst, hlen) { register struct ip *ip; struct mbuf *copym; + int sw_csum = (apple_hwcksum_tx == 0); copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); - if (copym != NULL) { - /* - * We don't bother to fragment if the IP length is greater - * than the interface's MTU. Can this possibly matter? - */ - ip = mtod(copym, struct ip *); - HTONS(ip->ip_len); - HTONS(ip->ip_off); - ip->ip_sum = 0; - ip->ip_sum = in_cksum(copym, hlen); - /* - * NB: - * It's not clear whether there are any lingering - * reentrancy problems in other areas which might - * be exposed by using ip_input directly (in - * particular, everything which modifies the packet - * in-place). Yet another option is using the - * protosw directly to deliver the looped back - * packet. For the moment, we'll err on the side - * of safety by using if_simloop(). - */ + + if (copym == NULL) + return; + + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter?
+ */ + ip = mtod(copym, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + ip->ip_sum = 0; + ip->ip_sum = in_cksum(copym, hlen); + /* + * NB: + * It's not clear whether there are any lingering + * reentrancy problems in other areas which might + * be exposed by using ip_input directly (in + * particular, everything which modifies the packet + * in-place). Yet another option is using the + * protosw directly to deliver the looped back + * packet. For the moment, we'll err on the side + * of safety by using if_simloop(). + */ #if 1 /* XXX */ - if (dst->sin_family != AF_INET) { - printf("ip_mloopback: bad address family %d\n", - dst->sin_family); - dst->sin_family = AF_INET; - } + if (dst->sin_family != AF_INET) { + printf("ip_mloopback: bad address family %d\n", + dst->sin_family); + dst->sin_family = AF_INET; + } #endif - /* - * Mark checksum as valid or calculate checksum for loopback. - * - * This is done this way because we have to embed the ifp of - * the interface we will send the original copy of the packet - * out on in the mbuf. ip_input will check if_hwassist of the - * embedded ifp and ignore all csum_flags if if_hwassist is 0. - * The UDP checksum has not been calculated yet. - */ - if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - if (IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) { - copym->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR | - CSUM_IP_CHECKED | CSUM_IP_VALID; - copym->m_pkthdr.csum_data = 0xffff; - } else { - NTOHS(ip->ip_len); - in_delayed_cksum(copym); - HTONS(ip->ip_len); - } + * Mark checksum as valid or calculate checksum for loopback. + * + * This is done this way because we have to embed the ifp of + * the interface we will send the original copy of the packet + * out on in the mbuf. ip_input will check if_hwassist of the + * embedded ifp and ignore all csum_flags if if_hwassist is 0. + * The UDP checksum has not been calculated yet. 
+ */ + if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) { + if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) { + copym->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + copym->m_pkthdr.csum_data = 0xffff; + } else { + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); +#endif + + in_delayed_cksum(copym); + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); +#endif + + } } + /* + * TedW: + * We need to send all loopback traffic down to dlil in case + * a filter has tapped-in. + */ + + /* + * Stuff the 'real' ifp into the pkthdr, to be used in matching + * in ip_input(); we need the loopback ifp/dl_tag passed as args + * to make the loopback driver compliant with the data link + * requirements. + */ + if (lo_ifp) { + copym->m_pkthdr.rcvif = ifp; + dlil_output(lo_ifp, PF_INET, copym, 0, + (struct sockaddr *) dst, 0); + } else { + printf("Warning: ip_output call to dlil_find_dltag failed!\n"); + m_freem(copym); + } +} + +/* + * Given a source IP address (and route, if available), determine the best + * interface to send the packet from. Checking for (and updating) the + * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done + * without any locks based on the assumption that ip_output() is single- + * threaded per-pcb, i.e. for any given pcb there can only be one thread + * performing output at the IP layer. + * + * This routine is analogous to in6_selectroute() for IPv6. 
+ */

/*
 * in_selectsrcif:
 *
 *	ip	- outgoing IP header; ip_src, ip_dst and ip_p are read.
 *	ro	- cached route placeholder.  ro_rt is locked for the duration
 *		  of the call when present; it may be released and cleared,
 *		  and ROF_SRCIF_SELECTED in ro_flags is set or cleared here.
 *	ifscope	- interface scope requested by the caller, or IFSCOPE_NONE.
 *
 * Returns the selected interface address, or NULL when none was found.
 * Every path that keeps an ifaddr either obtained it from ifa_foraddr*()
 * or took IFA_ADDREF on it, and discarded candidates are dropped with
 * IFA_REMREF, so the caller is expected to IFA_REMREF a non-NULL result.
 */
static struct ifaddr *
in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
{
	struct ifaddr *ifa = NULL;
	struct in_addr src = ip->ip_src;
	struct in_addr dst = ip->ip_dst;
	struct ifnet *rt_ifp;
	char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];

	if (ip_select_srcif_debug) {
		(void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src));
		(void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst));
	}

	if (ro->ro_rt != NULL)
		RT_LOCK(ro->ro_rt);

	rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;

	/*
	 * Given the source IP address, find a suitable source interface
	 * to use for transmission; if the caller has specified a scope,
	 * optimize the search by looking at the addresses only for that
	 * interface.  This is still suboptimal, however, as we need to
	 * traverse the per-interface list.
	 */
	if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
		unsigned int scope = ifscope;

		/*
		 * If no scope is specified and the route is stale (pointing
		 * to a defunct interface) use the current primary interface;
		 * this happens when switching between interfaces configured
		 * with the same IP address.  Otherwise pick up the scope
		 * information from the route; the ULP may have looked up a
		 * correct route and we just need to verify it here and mark
		 * it with the ROF_SRCIF_SELECTED flag below.
		 */
		if (scope == IFSCOPE_NONE) {
			/* ifscope is NONE here, so ro_rt (and rt_ifp) is set */
			scope = rt_ifp->if_index;
			if (scope != get_primary_ifscope(AF_INET) &&
			    ro->ro_rt->generation_id != route_generation)
				scope = get_primary_ifscope(AF_INET);
		}

		ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);

		if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
		    ip->ip_p != IPPROTO_TCP && ipforwarding) {
			/*
			 * If forwarding is enabled, and if the packet isn't
			 * TCP or UDP, check if the source address belongs
			 * to one of our own interfaces; if so, demote the
			 * interface scope and do a route lookup right below.
			 */
			ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
			if (ifa != NULL) {
				IFA_REMREF(ifa);
				ifa = NULL;
				ifscope = IFSCOPE_NONE;
			}
		}

		if (ip_select_srcif_debug && ifa != NULL) {
			if (ro->ro_rt != NULL) {
				printf("%s->%s ifscope %d->%d ifa_if %s "
				    "ro_if %s\n", s_src, s_dst, ifscope,
				    scope, if_name(ifa->ifa_ifp),
				    if_name(rt_ifp));
			} else {
				printf("%s->%s ifscope %d->%d ifa_if %s\n",
				    s_src, s_dst, ifscope, scope,
				    if_name(ifa->ifa_ifp));
			}
		}
	}

	/*
	 * Slow path; search for an interface having the corresponding source
	 * IP address if the scope was not specified by the caller, and:
	 *
	 *   1) There currently isn't any route, or,
	 *   2) The interface used by the route does not own that source
	 *	IP address; in this case, the route will get blown away
	 *	and we'll do a more specific scoped search using the newly
	 *	found interface.
	 */
	if (ifa == NULL && ifscope == IFSCOPE_NONE) {
		ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);

		/*
		 * If we have the IP address, but not the route, we don't
		 * really know whether or not it belongs to the correct
		 * interface (it could be shared across multiple interfaces.)
		 * The only way to find out is to do a route lookup.
		 */
		if (ifa != NULL && ro->ro_rt == NULL) {
			struct rtentry *rt;
			struct sockaddr_in sin;
			struct ifaddr *oifa = NULL;
			/*
			 * Snapshot of RTF_GATEWAY taken while the looked-up
			 * route is still locked and referenced.  The route
			 * must not be dereferenced after rtfree_locked()
			 * below: our reference is gone and rnh_lock is
			 * released, so the entry may already be freed.
			 */
			int rt_is_gateway = 0;

			bzero(&sin, sizeof (sin));
			sin.sin_family = AF_INET;
			sin.sin_len = sizeof (sin);
			sin.sin_addr = dst;

			lck_mtx_lock(rnh_lock);
			if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL,
			    rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
				RT_LOCK(rt);
				/*
				 * If the route uses a different interface,
				 * use that one instead.  The IP address of
				 * the ifaddr that we pick up here is not
				 * relevant.
				 */
				if (ifa->ifa_ifp != rt->rt_ifp) {
					oifa = ifa;
					ifa = rt->rt_ifa;
					IFA_ADDREF(ifa);
					rt_is_gateway =
					    (rt->rt_flags & RTF_GATEWAY) != 0;
				}
				RT_UNLOCK(rt);
				rtfree_locked(rt);
			}
			lck_mtx_unlock(rnh_lock);

			/* oifa is only set when a route was found above */
			if (oifa != NULL) {
				struct ifaddr *iifa;

				/*
				 * See if the interface pointed to by the
				 * route is configured with the source IP
				 * address of the packet.
				 */
				iifa = (struct ifaddr *)ifa_foraddr_scoped(
				    src.s_addr, ifa->ifa_ifp->if_index);

				if (iifa != NULL) {
					/*
					 * Found it; drop the original one
					 * as well as the route interface
					 * address, and use this instead.
					 */
					IFA_REMREF(oifa);
					IFA_REMREF(ifa);
					ifa = iifa;
				} else if (!ipforwarding || rt_is_gateway) {
					/*
					 * This interface doesn't have that
					 * source IP address; drop the route
					 * interface address and just use the
					 * original one, and let the caller
					 * do a scoped route lookup.
					 */
					IFA_REMREF(ifa);
					ifa = oifa;
				} else {
					/*
					 * Forwarding is enabled and the source
					 * address belongs to one of our own
					 * interfaces which isn't the outgoing
					 * interface, and we have a route, and
					 * the destination is on a network that
					 * is directly attached (onlink); drop
					 * the original one and use the route
					 * interface address instead.
					 */
					IFA_REMREF(oifa);
				}
			}
		} else if (ifa != NULL && ro->ro_rt != NULL &&
		    !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
		    ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
			/*
			 * Forwarding is enabled and the source address belongs
			 * to one of our own interfaces which isn't the same
			 * as the interface used by the known route; drop the
			 * original one and use the route interface address.
			 */
			IFA_REMREF(ifa);
			ifa = ro->ro_rt->rt_ifa;
			IFA_ADDREF(ifa);
		}

		if (ip_select_srcif_debug && ifa != NULL) {
			printf("%s->%s ifscope %d ifa_if %s\n",
			    s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
		}
	}

	if (ro->ro_rt != NULL)
		RT_LOCK_ASSERT_HELD(ro->ro_rt);
	/*
	 * If there is a non-loopback route with the wrong interface, or if
	 * there is no interface configured with such an address, blow it
	 * away.  Except for local/loopback, we look for one with a matching
	 * interface scope/index.
	 */
	if (ro->ro_rt != NULL &&
	    (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
	    !(ro->ro_rt->rt_flags & RTF_UP))) {
		if (ip_select_srcif_debug) {
			if (ifa != NULL) {
				printf("%s->%s ifscope %d ro_if %s != "
				    "ifa_if %s (cached route cleared)\n",
				    s_src, s_dst, ifscope, if_name(rt_ifp),
				    if_name(ifa->ifa_ifp));
			} else {
				printf("%s->%s ifscope %d ro_if %s "
				    "(no ifa_if found)\n",
				    s_src, s_dst, ifscope, if_name(rt_ifp));
			}
		}

		RT_UNLOCK(ro->ro_rt);
		rtfree(ro->ro_rt);
		ro->ro_rt = NULL;
		ro->ro_flags &= ~ROF_SRCIF_SELECTED;

		/*
		 * If the destination is IPv4 LLA and the route's interface
		 * doesn't match the source interface, then the source IP
		 * address is wrong; it most likely belongs to the primary
		 * interface associated with the IPv4 LL subnet.  Drop the
		 * packet rather than letting it go out and return an error
		 * to the ULP.  This actually applies not only to IPv4 LL
		 * but other shared subnets; for now we explicitly test only
		 * for the former case and save the latter for future.
		 */
		if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
		    !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
			IFA_REMREF(ifa);
			ifa = NULL;
		}
	}

	if (ip_select_srcif_debug && ifa == NULL) {
		printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
		    s_src, s_dst, ifscope);
	}

	/*
	 * If there is a route, mark it accordingly.  If there isn't one,
	 * we'll get here again during the next transmit (possibly with a
	 * route) and the flag will get set at that point.  For IPv4 LLA
	 * destination, mark it only if the route has been fully resolved;
	 * otherwise we want to come back here again when the route points
	 * to the interface over which the ARP reply arrives on.
	 */
	if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
	    (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
	    SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
		ro->ro_flags |= ROF_SRCIF_SELECTED;
		ro->ro_rt->generation_id = route_generation;
	}

	if (ro->ro_rt != NULL)
		RT_UNLOCK(ro->ro_rt);

	return (ifa);
}