X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/91447636331957f3d9b5ca5b508f07c526b0074d..316670eb35587141e969394ae8537d66b9211e80:/bsd/netinet/ip_output.c diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index 9fd7a09a1..aece80368 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 @@ -54,6 +60,12 @@ * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $ */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. 
+ */ #define _IP_VHL @@ -67,9 +79,17 @@ #include #include #include +#include + +#include +#include #include +#include +#include #include +#include +#include #include #include @@ -80,16 +100,20 @@ #include -#include "faith.h" +#if CONFIG_MACF_NET +#include +#endif #include #include +#include #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1) #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1) +#define SWAP16(v) ((((v) & 0xff) << 8) | ((v) >> 8)) #if IPSEC #include @@ -103,11 +127,16 @@ #include #include +#include #if DUMMYNET #include #endif +#if PF +#include +#endif /* PF */ + #if IPFIREWALL_FORWARD_DEBUG #define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ @@ -115,31 +144,20 @@ (ntohl(a.s_addr))&0xFF); #endif -#if IPSEC -extern lck_mtx_t *sadb_mutex; -#endif - u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); -static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *, int); -static int ip_getmoptions(struct sockopt *, struct ip_moptions *); static int ip_pcbopts(int, struct mbuf **, struct mbuf *); -static int ip_setmoptions(struct sockopt *, struct ip_moptions **); +static void imo_trace(struct ip_moptions *, int); -int ip_createmoptions(struct ip_moptions **imop); -int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -int ip_optcopy(struct ip *, struct ip *); -extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); -#ifdef __APPLE__ -extern struct mbuf* m_dup(register struct mbuf *m, int how); -#endif +static void ip_out_cksum_stats(int, u_int32_t); +static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); -extern int apple_hwcksum_tx; 
-extern u_long route_generation; +int ip_optcopy(struct ip *, struct ip *); +void in_delayed_cksum_offset(struct mbuf *, int ); +void in_cksum_offset(struct mbuf* , size_t ); extern struct protosw inetsw[]; @@ -152,8 +170,50 @@ extern int ipsec_bypass; #endif static int ip_maxchainsent = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, "use dlil_output_list"); +#if DEBUG +static int forge_ce = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED, + &forge_ce, 0, "Forge ECN CE"); +#endif /* DEBUG */ + +static int ip_select_srcif_debug = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_select_srcif_debug, 0, "log source interface selection debug info"); + +#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE; + +struct ip_moptions_dbg { + struct ip_moptions imo; /* ip_moptions */ + u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */ + u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */ + /* + * Alloc and free callers. + */ + ctrace_t imo_alloc; + ctrace_t imo_free; + /* + * Circular lists of IMO_ADDREF and IMO_REMREF callers. + */ + ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE]; + ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE]; +}; + +#if DEBUG +static unsigned int imo_debug = 1; /* debugging (enabled) */ +#else +static unsigned int imo_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int imo_size; /* size of zone element */ +static struct zone *imo_zone; /* zone for ip_moptions */ + +#define IMO_ZONE_MAX 64 /* maximum elements in zone */ +#define IMO_ZONE_NAME "ip_moptions" /* zone name */ + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). 
@@ -166,13 +226,35 @@ ip_output( struct mbuf *opt, struct route *ro, int flags, - struct ip_moptions *imo) + struct ip_moptions *imo, + struct ip_out_args *ipoa) { int error; - error = ip_output_list(m0, 0, opt, ro, flags, imo); + error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa); return error; } +/* + * Returns: 0 Success + * ENOMEM + * EADDRNOTAVAIL + * ENETUNREACH + * EHOSTUNREACH + * EACCES + * EMSGSIZE + * ENOBUFS + * ipsec4_getpolicybyaddr:??? [IPSEC 4th argument, contents modified] + * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified] + * key_spdacquire:??? [IPSEC] + * ipsec4_output:??? [IPSEC] + * ip_dn_io_ptr:??? [dummynet] + * dlil_output:??? [DLIL] + * dlil_output_list:??? [DLIL] + * + * Notes: The ipsec4_getpolicyby{addr|sock} function error returns are + * only used as the error return from this function where one of + * these functions fails to return a policy. + */ int ip_output_list( struct mbuf *m0, @@ -180,82 +262,120 @@ ip_output_list( struct mbuf *opt, struct route *ro, int flags, - struct ip_moptions *imo) + struct ip_moptions *imo, + struct ip_out_args *ipoa) { - struct ip *ip, *mhip; + struct ip *ip; struct ifnet *ifp = NULL; - struct mbuf *m = m0; + struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt; int hlen = sizeof (struct ip); - int len, off, error = 0; + int len = 0, error = 0; struct sockaddr_in *dst = NULL; - struct in_ifaddr *ia = NULL; + struct in_ifaddr *ia = NULL, *src_ia = NULL; int isbroadcast, sw_csum; struct in_addr pkt_dst; + struct ipf_pktopts *ippo = NULL, ipf_pktopts; #if IPSEC - struct route iproute; + struct ipsec_output_state ipsec_state; + struct route *ipsec_saved_route = NULL; struct socket *so = NULL; struct secpolicy *sp = NULL; #endif #if IPFIREWALL_FORWARD int fwd_rewrite_src = 0; #endif +#if IPFIREWALL + int off; + struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; +#endif +#if IPFIREWALL || DUMMYNET struct ip_fw_args args; + struct m_tag *tag; +#endif int didfilter = 0; ipfilter_t 
inject_filter_ref = 0; - struct m_tag *tag; - struct route dn_route; +#if DUMMYNET + struct route saved_route; + struct ip_out_args saved_ipoa; + struct sockaddr_in dst_buf; +#endif /* DUMMYNET */ struct mbuf * packetlist; - int pktcnt = 0; - - lck_mtx_lock(ip_mutex); + int pktcnt = 0, tso = 0; + u_int32_t bytecnt = 0; + unsigned int ifscope = IFSCOPE_NONE; + unsigned int nocell = 0; + boolean_t select_srcif, srcbound; + struct flowadv *adv = NULL; KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); +#if IPSEC + bzero(&ipsec_state, sizeof(ipsec_state)); +#endif /* IPSEC */ + packetlist = m0; - args.eh = NULL; - args.rule = NULL; - args.next_hop = NULL; - args.divert_rule = 0; /* divert cookie */ - +#if IPFIREWALL || DUMMYNET + bzero(&args, sizeof(struct ip_fw_args)); + + if (SLIST_EMPTY(&m0->m_pkthdr.tags)) + goto ipfw_tags_done; + /* Grab info from mtags prepended to the chain */ #if DUMMYNET - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { struct dn_pkt_tag *dn_tag; - + dn_tag = (struct dn_pkt_tag *)(tag+1); - args.rule = dn_tag->rule; + args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; + args.fwa_pf_rule = dn_tag->dn_pf_rule; opt = NULL; - dn_route = dn_tag->ro; - ro = &dn_route; - + saved_route = dn_tag->dn_ro; + ro = &saved_route; + imo = NULL; - dst = dn_tag->dn_dst; - ifp = dn_tag->ifp; - flags = dn_tag->flags; - + bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf)); + dst = &dst_buf; + ifp = dn_tag->dn_ifp; + flags = dn_tag->dn_flags; + if ((dn_tag->dn_flags & IP_OUTARGS)) { + saved_ipoa = dn_tag->dn_ipoa; + ipoa = &saved_ipoa; + } + m_tag_delete(m0, tag); } #endif /* DUMMYNET */ - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { +#if IPDIVERT + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { struct divert_tag *div_tag; - + 
div_tag = (struct divert_tag *)(tag+1); - args.divert_rule = div_tag->cookie; + args.fwa_divert_rule = div_tag->cookie; m_tag_delete(m0, tag); } - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { +#endif /* IPDIVERT */ + +#if IPFIREWALL + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; - + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); - args.next_hop = ipfwd_tag->next_hop; - + next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; + m_tag_delete(m0, tag); } +#endif /* IPFIREWALL */ + +ipfw_tags_done: +#endif /* IPFIREWALL || DUMMYNET */ m = m0; - + #if DIAGNOSTIC if ( !m || (m->m_flags & M_PKTHDR) != 0) panic("ip_output no HDR"); @@ -264,21 +384,74 @@ ip_output_list( mtod(m, struct ip *)->ip_p); #endif - if (args.rule != NULL) { /* dummynet already saw us */ - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; - if (ro->ro_rt != NULL) - ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; - if (ia) - ifaref(&ia->ia_ifa); -#if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); + bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); + ippo = &ipf_pktopts; + + if (ip_doscopedroute && (flags & IP_OUTARGS)) { + /* + * In the forwarding case, only the ifscope value is used, + * as source interface selection doesn't take place. 
+ */ + if ((select_srcif = (!(flags & IP_FORWARDING) && + (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) { + ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF; + } + + if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) && + ipoa->ipoa_boundif != IFSCOPE_NONE) { + ifscope = ipoa->ipoa_boundif; + ipf_pktopts.ippo_flags |= + (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE)); + } + + if ((srcbound = (ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR))) + ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR; + } else { + select_srcif = FALSE; + srcbound = FALSE; + ifscope = IFSCOPE_NONE; + } + + if ((flags & IP_OUTARGS) && (ipoa->ipoa_flags & IPOAF_NO_CELLULAR)) { + nocell = 1; + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } + + if (flags & IP_OUTARGS) { + adv = &ipoa->ipoa_flowadv; + adv->code = FADV_SUCCESS; + } + +#if DUMMYNET + if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) { + /* dummynet already saw us */ + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + pkt_dst = ip->ip_dst; + if (ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); } -#endif - goto sendit; + RT_UNLOCK(ro->ro_rt); + } +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + so = ipsec_getsocket(m); + (void)ipsec_setsocket(m, NULL); + } +#endif /* IPSEC */ +#if IPFIREWALL + if (args.fwa_ipfw_rule != NULL) + goto skip_ipsec; +#endif /* #if IPFIREWALL */ + if (args.fwa_pf_rule != NULL) + goto sendit; } +#endif /* DUMMYNET */ #if IPSEC if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { @@ -291,14 +464,45 @@ loopit: * No need to proccess packet twice if we've * already seen it */ - inject_filter_ref = ipf_get_inject_filter(m); + if (!SLIST_EMPTY(&m->m_pkthdr.tags)) + inject_filter_ref = ipf_get_inject_filter(m); + else + inject_filter_ref = 0; if (opt) { m = ip_insertoptions(m, opt, &len); hlen = len; + /* Update the chain */ + if (m != m0) { + 
if (m0 == packetlist) + packetlist = m; + m0 = m; + } } ip = mtod(m, struct ip *); - pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; +#if IPFIREWALL + /* + * rdar://8542331 + * + * When dealing with a packet chain, we need to reset "next_hop" because + * "dst" may have been changed to the gateway address below for the previous + * packet of the chain. This could cause the route to be inavertandly changed + * to the route to the gateway address (instead of the route to the destination). + */ + args.fwa_next_hop = next_hop_from_ipfwd_tag; + pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst; +#else + pkt_dst = ip->ip_dst; +#endif + + /* + * We must not send if the packet is destined to network zero. + * RFC1122 3.2.1.3 (a) and (b). + */ + if (IN_ZERONET(ntohl(pkt_dst.s_addr))) { + error = EHOSTUNREACH; + goto bad; + } /* * Fill in IP header. @@ -311,15 +515,25 @@ loopit: #else ip->ip_id = htons(ip_id++); #endif - ipstat.ips_localout++; + OSAddAtomic(1, &ipstat.ips_localout); } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } +#if DEBUG + /* For debugging, we let the stack forge congestion */ + if (forge_ce != 0 && + ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 || + (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) { + ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE; + forge_ce--; + } +#endif /* DEBUG */ + KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - dst = (struct sockaddr_in *)&ro->ro_dst; + dst = (struct sockaddr_in *)(void *)&ro->ro_dst; /* * If there is a cached route, @@ -329,21 +543,39 @@ loopit: * cache with IPv6. 
*/ - { - if (ro->ro_rt && (ro->ro_rt->generation_id != route_generation) && - ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && (ip->ip_src.s_addr != INADDR_ANY) && - (ifa_foraddr(ip->ip_src.s_addr) == 0)) { - error = EADDRNOTAVAIL; - goto bad; + if (ro->ro_rt != NULL) { + if (ro->ro_rt->generation_id != route_generation && + ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && + (ip->ip_src.s_addr != INADDR_ANY)) { + src_ia = ifa_foraddr(ip->ip_src.s_addr); + if (src_ia == NULL) { + error = EADDRNOTAVAIL; + goto bad; + } + IFA_REMREF(&src_ia->ia_ifa); } + /* + * Test rt_flags without holding rt_lock for performance + * reasons; if the route is down it will hopefully be + * caught by the layer below (since it uses this route + * as a hint) or during the next transmit. + */ + if ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != pkt_dst.s_addr) { + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + /* + * If we're doing source interface selection, we may not + * want to use this route; only synch up the generation + * count otherwise. + */ + if (!select_srcif && ro->ro_rt != NULL && + ro->ro_rt->generation_id != route_generation) + ro->ro_rt->generation_id = route_generation; } - if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - dst->sin_family != AF_INET || - dst->sin_addr.s_addr != pkt_dst.s_addr)) { - rtfree(ro->ro_rt); - ro->ro_rt = (struct rtentry *)0; - } - if (ro->ro_rt == 0) { + if (ro->ro_rt == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); @@ -353,14 +585,12 @@ loopit: * If routing to interface only, * short circuit routing lookup. 
*/ -#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) -#define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) { if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { - ipstat.ips_noroute++; + OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; } @@ -368,7 +598,86 @@ loopit: ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && + imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. + */ + isbroadcast = 0; + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + + /* Macro takes reference on ia */ + IFP_TO_IA(ifp, ia); } else { + boolean_t cloneok = FALSE; + /* + * Perform source interface selection; the source IP address + * must belong to one of the addresses of the interface used + * by the route. For performance reasons, do this only if + * there is no route, or if the routing table has changed, + * or if we haven't done source interface selection on this + * route (for this PCB instance) before. + */ + if (select_srcif && ip->ip_src.s_addr != INADDR_ANY && + (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) || + ro->ro_rt->generation_id != route_generation || + !(ro->ro_flags & ROF_SRCIF_SELECTED))) { + struct ifaddr *ifa; + + /* Find the source interface */ + ifa = in_selectsrcif(ip, ro, ifscope); + + /* + * If the source address belongs to a cellular interface + * and the caller forbids our using interfaces of such + * type, pretend that there is no source address. 
+ */ + if (nocell && ifa != NULL && + ifa->ifa_ifp->if_type == IFT_CELLULAR) { + IFA_REMREF(ifa); + error = EADDRNOTAVAIL; + goto bad; + } + + /* + * If the source address is spoofed (in the case of + * IP_RAWOUTPUT on an unbounded socket), or if this + * is destined for local/loopback, just let it go out + * using the interface of the route. Otherwise, + * there's no interface having such an address, + * so bail out. + */ + if (ifa == NULL && (!(flags & IP_RAWOUTPUT) || + srcbound) && ifscope != lo_ifp->if_index) { + error = EADDRNOTAVAIL; + goto bad; + } + + /* + * If the caller didn't explicitly specify the scope, + * pick it up from the source interface. If the cached + * route was wrong and was blown away as part of source + * interface selection, don't mask out RTF_PRCLONING + * since that route may have been allocated by the ULP, + * unless the IP header was created by the caller or + * the destination is IPv4 LLA. The check for the + * latter is needed because IPv4 LLAs are never scoped + * in the current implementation, and we don't want to + * replace the resolved IPv4 LLA route with one whose + * gateway points to that of the default gateway on + * the primary interface of the system. + */ + if (ifa != NULL) { + if (ifscope == IFSCOPE_NONE) + ifscope = ifa->ifa_ifp->if_index; + IFA_REMREF(ifa); + cloneok = (!(flags & IP_RAWOUTPUT) && + !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); + } + } + /* * If this is the case, we probably don't want to allocate * a protocol-cloned route since we didn't get one from the @@ -378,29 +687,97 @@ loopit: * the link layer, as this is probably required in all cases * for correct operation (as it is for ARP). 
*/ - if (ro->ro_rt == 0) - rtalloc_ign(ro, RTF_PRCLONING); - if (ro->ro_rt == 0) { - ipstat.ips_noroute++; + if (ro->ro_rt == NULL) { + unsigned long ign = RTF_PRCLONING; + /* + * We make an exception here: if the destination + * address is INADDR_BROADCAST, allocate a protocol- + * cloned host route so that we end up with a route + * marked with the RTF_BROADCAST flag. Otherwise, + * we would end up referring to the default route, + * instead of creating a cloned host route entry. + * That would introduce inconsistencies between ULPs + * that allocate a route and those that don't. The + * RTF_BROADCAST route is important since we'd want + * to send out undirected IP broadcast packets using + * link-level broadcast address. Another exception + * is for ULP-created routes that got blown away by + * source interface selection (see above). + * + * These exceptions will no longer be necessary when + * the RTF_PRCLONING scheme is no longer present. + */ + if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) + ign &= ~RTF_PRCLONING; + + /* + * Loosen the route lookup criteria if the ifscope + * corresponds to the loopback interface; this is + * needed to support Application Layer Gateways + * listening on loopback, in conjunction with packet + * filter redirection rules. The final source IP + * address will be rewritten by the packet filter + * prior to the RFC1122 loopback check below. + */ + if (ifscope == lo_ifp->if_index) + rtalloc_ign(ro, ign); + else + rtalloc_scoped_ign(ro, ign, ifscope); + + /* + * If the route points to a cellular interface and the + * caller forbids our using interfaces of such type, + * pretend that there is no route. 
+ */ + if (nocell && ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + if (ro->ro_rt->rt_ifp->if_type == + IFT_CELLULAR) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } else { + RT_UNLOCK(ro->ro_rt); + } + } + } + + if (ro->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; goto bad; } + if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; - if (ro->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; - if (ro->ro_rt->rt_flags & RTF_HOST) + if (ro->ro_rt->rt_flags & RTF_GATEWAY) { + dst = (struct sockaddr_in *)(void *) + ro->ro_rt->rt_gateway; + } + if (ro->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); - else + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + RT_UNLOCK(ro->ro_rt); } + if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; + u_int32_t vif; + u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; + u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; m->m_flags |= M_MCAST; /* @@ -408,27 +785,35 @@ loopit: * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) 
*/ - dst = (struct sockaddr_in *)&ro->ro_dst; + dst = (struct sockaddr_in *)(void *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl; - if (imo->imo_multicast_ifp != NULL) { + IMO_LOCK(imo); + vif = imo->imo_multicast_vif; + ttl = imo->imo_multicast_ttl; + loop = imo->imo_multicast_loop; + if ((flags & IP_RAWOUTPUT) == 0) + ip->ip_ttl = ttl; + if (imo->imo_multicast_ifp != NULL) ifp = imo->imo_multicast_ifp; - } - if (imo->imo_multicast_vif != -1 && - ((flags & IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY)) - ip->ip_src.s_addr = - ip_mcast_src(imo->imo_multicast_vif); - } else - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + IMO_UNLOCK(imo); +#if MROUTING + if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 || + ip->ip_src.s_addr == INADDR_ANY)) + ip->ip_src.s_addr = ip_mcast_src(vif); +#endif /* MROUTING */ + } else if ((flags & IP_RAWOUTPUT) == 0) { + vif = -1; + ip->ip_ttl = ttl; + } /* * Confirm that the outgoing interface supports multicast. */ - if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if (imo == NULL || vif == -1) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { - ipstat.ips_noroute++; + OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; } @@ -438,25 +823,28 @@ loopit: * of outgoing interface. 
*/ if (ip->ip_src.s_addr == INADDR_ANY) { - register struct in_ifaddr *ia1; - - TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) + struct in_ifaddr *ia1; + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia1->ia_ifa); if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; - + IFA_UNLOCK(&ia1->ia_ifa); break; } + IFA_UNLOCK(&ia1->ia_ifa); + } + lck_rw_done(in_ifaddr_rwlock); if (ip->ip_src.s_addr == INADDR_ANY) { error = ENETUNREACH; goto bad; } } - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(pkt_dst, ifp, inm); - ifnet_lock_done(ifp); - if (inm != NULL && - (imo == NULL || imo->imo_multicast_loop)) { + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&pkt_dst, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL && (imo == NULL || loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not @@ -465,17 +853,23 @@ loopit: if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - struct ipf_pktopts *ippo = 0, ipf_pktopts; - if (imo) { - ippo = &ipf_pktopts; - ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp; - ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl; - ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop; + if (imo != NULL) { + ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS; + ipf_pktopts.ippo_mcast_ifnet = ifp; + ipf_pktopts.ippo_mcast_ttl = ttl; + ipf_pktopts.ippo_mcast_loop = loop; } - - lck_mtx_unlock(ip_mutex); + ipf_ref(); + + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { if ((struct ipfilter *)inject_filter_ref == filter) @@ -485,21 +879,31 @@ loopit: result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); + INM_REMREF(inm); goto done; } if (result != 0) { ipf_unref(); - 
lck_mtx_lock(ip_mutex); + INM_REMREF(inm); goto bad; } } } - lck_mtx_lock(ip_mutex); + + /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); +#endif + ipf_unref(); didfilter = 1; } ip_mloopback(ifp, m, dst, hlen); } +#if MROUTING else { /* * If we are acting as a multicast router, perform @@ -521,15 +925,19 @@ loopit: * as prescribed by rsvpd. */ if (!rsvp_on) - imo = NULL; + imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); - lck_mtx_unlock(ip_mutex); + if (inm != NULL) + INM_REMREF(inm); + OSAddAtomic(1, &ipstat.ips_cantforward); goto done; } } } - +#endif /* MROUTING */ + if (inm != NULL) + INM_REMREF(inm); /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. @@ -540,19 +948,19 @@ loopit: */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); - lck_mtx_unlock(ip_mutex); goto done; } goto sendit; } -#ifndef notdef /* * If source address not specified yet, use address * of outgoing interface. 
*/ if (ip->ip_src.s_addr == INADDR_ANY) { + IFA_LOCK_SPIN(&ia->ia_ifa); ip->ip_src = IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); #if IPFIREWALL_FORWARD /* Keep note that we did this - if the firewall changes * the next-hop, our interface may change, changing the @@ -562,7 +970,6 @@ loopit: fwd_rewrite_src++; #endif /* IPFIREWALL_FORWARD */ } -#endif /* notdef */ /* * Look for broadcast address and @@ -589,6 +996,50 @@ loopit: } sendit: +#if PF + /* Invoke outbound packet filter */ + if (PF_IS_ENABLED) { + int rc; + + m0 = m; /* Save for later */ +#if DUMMYNET + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; + if (flags & IP_OUTARGS) + args.fwa_ipoa = ipoa; + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args); +#else /* DUMMYNET */ + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL); +#endif /* DUMMYNET */ + if (rc != 0 || m == NULL) { + /* Move to the next packet */ + m = *mppn; + + /* Skip ahead if first packet in list got dropped */ + if (packetlist == m0) + packetlist = m; + + if (m != NULL) { + m0 = m; + /* Next packet in the chain */ + goto loopit; + } else if (packetlist != NULL) { + /* No more packet; send down the chain */ + goto sendchain; + } + /* Nothing left; we're done */ + goto done; + } + m0 = m; + ip = mtod(m, struct ip *); + pkt_dst = ip->ip_dst; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + } +#endif /* PF */ /* * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt */ @@ -596,37 +1047,60 @@ sendit: ip_linklocal_stat.iplls_out_total++; if (ip->ip_ttl != MAXTTL) { ip_linklocal_stat.iplls_out_badttl++; - ip->ip_ttl = MAXTTL; + ip->ip_ttl = MAXTTL; } } -injectit: if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - - lck_mtx_unlock(ip_mutex); + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + + /* Check that a TSO frame isn't passed to a filter. 
+ * This could happen if a filter is inserted while + * TCP is sending the TSO packet. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; + } + ipf_ref(); + + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { if ((struct ipfilter *)inject_filter_ref == filter) seen = 1; } else if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; } if (result != 0) { ipf_unref(); - lck_mtx_lock(ip_mutex); goto bad; } } } + + /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); +#endif + ipf_unref(); - lck_mtx_lock(ip_mutex); } #if IPSEC @@ -637,7 +1111,6 @@ injectit: KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); - lck_mtx_lock(sadb_mutex); /* get SP for this packet */ if (so == NULL) @@ -646,9 +1119,8 @@ injectit: sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { - ipsecstat.out_inval++; + IPSEC_STAT_INCREMENT(ipsecstat.out_inval); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto bad; } @@ -657,27 +1129,25 @@ injectit: /* check policy */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: + case IPSEC_POLICY_GENERATE: /* * This packet is just discarded. */ - ipsecstat.out_polvio++; + IPSEC_STAT_INCREMENT(ipsecstat.out_polvio); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. 
*/ KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto skip_ipsec; - + case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0); - lck_mtx_unlock(sadb_mutex); goto bad; } break; @@ -687,15 +1157,12 @@ injectit: printf("ip_output: Invalid policy found. %d\n", sp->policy); } { - struct ipsec_output_state state; - bzero(&state, sizeof(state)); - state.m = m; + ipsec_state.m = m; if (flags & IP_ROUTETOIF) { - state.ro = &iproute; - bzero(&iproute, sizeof(iproute)); + bzero(&ipsec_state.ro, sizeof(ipsec_state.ro)); } else - state.ro = ro; - state.dst = (struct sockaddr *)dst; + route_copyout(&ipsec_state.ro, ro, sizeof(ipsec_state.ro)); + ipsec_state.dst = (struct sockaddr *)dst; ip->ip_sum = 0; @@ -708,29 +1175,35 @@ injectit: m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); + + error = ipsec4_output(&ipsec_state, sp, flags); + + m0 = m = ipsec_state.m; - lck_mtx_unlock(ip_mutex); - error = ipsec4_output(&state, sp, flags); - lck_mtx_unlock(sadb_mutex); - lck_mtx_lock(ip_mutex); - - m0 = m = state.m; - if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore * IP_ROUTETOIF. */ - if (state.ro != &iproute || state.ro->ro_rt != NULL) { + if (ipsec_state.tunneled) { flags &= ~IP_ROUTETOIF; - ro = state.ro; + ipsec_saved_route = ro; + ro = &ipsec_state.ro; } - } else - ro = state.ro; - - dst = (struct sockaddr_in *)state.dst; + } else { + ipsec_saved_route = ro; + ro = &ipsec_state.ro; + } + dst = (struct sockaddr_in *)(void *)ipsec_state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output. 
*/ m0 = NULL; @@ -756,103 +1229,125 @@ injectit: /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); - + #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif /* Check that there wasn't a route change and src is still valid */ - - if (ro->ro_rt->generation_id != route_generation) { - if (ifa_foraddr(ip->ip_src.s_addr) == 0 && ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { - error = EADDRNOTAVAIL; - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 5,0,0,0,0); + if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) { + if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL && + ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { + error = EADDRNOTAVAIL; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 5,0,0,0,0); goto bad; } rtfree(ro->ro_rt); ro->ro_rt = NULL; + if (src_ia != NULL) + IFA_REMREF(&src_ia->ia_ifa); } if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { - printf("ip_output: " - "can't update route after IPsec processing\n"); - error = EHOSTUNREACH; /*XXX*/ - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 6,0,0,0,0); + printf("ip_output: can't update route after " + "IPsec processing\n"); + error = EHOSTUNREACH; /*XXX*/ + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 6,0,0,0,0); goto bad; } } else { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; + RT_UNLOCK(ro->ro_rt); } /* make it flipped, again. 
*/ + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); +#endif + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); - + /* Pass to filters again */ if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; - - lck_mtx_unlock(ip_mutex); + + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + + /* Check that a TSO frame isn't passed to a filter. + * This could happen if a filter is inserted while + * TCP is sending the TSO packet. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; + } + ipf_ref(); + + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; } if (result != 0) { ipf_unref(); - lck_mtx_lock(ip_mutex); goto bad; } } } + + /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); +#endif + ipf_unref(); - lck_mtx_lock(ip_mutex); } skip_ipsec: #endif /*IPSEC*/ - /* - * IpHack's section. - * - Xlate: translate packet's addr/port (NAT). - * - Firewall: deny/allow/etc. - * - Wrap: fake packet's addr/port - * - Encapsulate: put it in another IP and send out. - */ - if (fr_checkp) { - struct mbuf *m1 = m; - - if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) { - lck_mtx_unlock(ip_mutex); - goto done; - } - ip = mtod(m0 = m = m1, struct ip *); - } - +#if IPFIREWALL /* * Check with the firewall... * but not if we are already being fwd'd from a firewall. 
*/ - if (fw_enable && IPFW_LOADED && !args.next_hop) { + if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) { struct sockaddr_in *old = dst; - args.m = m; - args.next_hop = dst; - args.oif = ifp; + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; off = ip_fw_chk_ptr(&args); - m = args.m; - dst = args.next_hop; + m = args.fwa_m; + dst = args.fwa_next_hop; /* * On return we must do the following: @@ -873,33 +1368,35 @@ skip_ipsec: if (m) m_freem(m); error = EACCES ; - lck_mtx_unlock(ip_mutex); goto done ; } ip = mtod(m, struct ip *); - if (off == 0 && dst == old) /* common case */ + + if (off == 0 && dst == old) {/* common case */ goto pass ; -#if DUMMYNET - if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { - /* - * pass the pkt to dummynet. Need to include - * pipe number, m, ifp, ro, dst because these are - * not recomputed in the next pass. - * All other parameters have been already used and - * so they are not needed anymore. - * XXX note: if the ifp or ro entry are deleted - * while a pkt is in dummynet, we are in trouble! - */ - args.ro = ro; - args.dst = dst; - args.flags = flags; - - lck_mtx_unlock(ip_mutex); - error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, - &args); - goto done; } -#endif /* DUMMYNET */ +#if DUMMYNET + if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! 
+ */ + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; + if (flags & IP_OUTARGS) + args.fwa_ipoa = ipoa; + + error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, + &args, DN_CLIENT_IPFW); + goto done; + } +#endif /* DUMMYNET */ #if IPDIVERT if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { struct mbuf *clone = NULL; @@ -918,11 +1415,14 @@ skip_ipsec: } /* Restore packet header fields to original values */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif /* Deliver packet to divert input routine */ - divert_packet(m, 0, off & 0xffff, args.divert_rule); + divert_packet(m, 0, off & 0xffff, args.fwa_divert_rule); /* If 'tee', continue with original packet */ if (clone != NULL) { @@ -930,7 +1430,6 @@ skip_ipsec: ip = mtod(m, struct ip *); goto pass; } - lck_mtx_unlock(ip_mutex); goto done; } #endif @@ -969,35 +1468,42 @@ skip_ipsec: * as the packet runs through ip_input() as * it is done through a ISR. */ + lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) { /* * If the addr to forward to is one * of ours, we pretend to * be the destination for this packet. 
*/ + IFA_LOCK_SPIN(&ia_fw->ia_ifa); if (IA_SIN(ia_fw)->sin_addr.s_addr == - dst->sin_addr.s_addr) + dst->sin_addr.s_addr) { + IFA_UNLOCK(&ia_fw->ia_ifa); break; + } + IFA_UNLOCK(&ia_fw->ia_ifa); } - if (ia) { + lck_rw_done(in_ifaddr_rwlock); + if (ia_fw) { /* tell ip_input "dont filter" */ struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof(struct sockaddr_in), M_NOWAIT); + + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, + sizeof (*ipfwd_tag), M_NOWAIT, m); if (fwd_tag == NULL) { error = ENOBUFS; goto bad; } - + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); - ipfwd_tag->next_hop = args.next_hop; + ipfwd_tag->next_hop = args.fwa_next_hop; m_tag_prepend(m, fwd_tag); if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = ifunit("lo0"); + m->m_pkthdr.rcvif = lo_ifp; if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & m->m_pkthdr.csum_flags) == 0) { if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { @@ -1014,16 +1520,18 @@ skip_ipsec: m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; ip->ip_sum = in_cksum(m, hlen); } + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - - lck_mtx_unlock(ip_mutex); - +#endif + /* we need to call dlil_output to run filters * and resync to avoid recursion loops. 
*/ if (lo_ifp) { - dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0); + dlil_output(lo_ifp, PF_INET, m, 0, + (struct sockaddr *)dst, 0, adv); } else { printf("ip_output: no loopback ifp for forwarding!!!\n"); @@ -1038,36 +1546,52 @@ skip_ipsec: */ bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); - ro_fwd->ro_rt = 0; + ro_fwd->ro_rt = NULL; rtalloc_ign(ro_fwd, RTF_PRCLONING); - if (ro_fwd->ro_rt == 0) { - ipstat.ips_noroute++; + if (ro_fwd->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; goto bad; } + RT_LOCK_SPIN(ro_fwd->ro_rt); ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); + if (ia_fw != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); + IFA_ADDREF(&ia_fw->ia_ifa); + } ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; - if (ro_fwd->ro_rt->rt_flags & RTF_HOST) + dst = (struct sockaddr_in *)(void *)ro_fwd->ro_rt->rt_gateway; + if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); - else + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + RT_UNLOCK(ro_fwd->ro_rt); rtfree(ro->ro_rt); ro->ro_rt = ro_fwd->ro_rt; - dst = (struct sockaddr_in *)&ro_fwd->ro_dst; + dst = (struct sockaddr_in *)(void *)&ro_fwd->ro_dst; /* * If we added a default src ip earlier, * which would have been gotten from the-then * interface, do it again, from the new one. 
*/ - if (fwd_rewrite_src) - ip->ip_src = IA_SIN(ia_fw)->sin_addr; + if (ia_fw != NULL) { + if (fwd_rewrite_src) { + IFA_LOCK_SPIN(&ia_fw->ia_ifa); + ip->ip_src = IA_SIN(ia_fw)->sin_addr; + IFA_UNLOCK(&ia_fw->ia_ifa); + } + IFA_REMREF(&ia_fw->ia_ifa); + } goto pass ; } #endif /* IPFIREWALL_FORWARD */ @@ -1077,17 +1601,17 @@ skip_ipsec: */ m_freem(m); error = EACCES; /* not sure this is the right error msg */ - lck_mtx_unlock(ip_mutex); goto done; } pass: +#endif /* IPFIREWALL */ #if __APPLE__ /* Do not allow loopback address to wind up on a wire */ if ((ifp->if_flags & IFF_LOOPBACK) == 0 && ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { - ipstat.ips_badaddr++; + OSAddAtomic(1, &ipstat.ips_badaddr); m_freem(m); /* * Do not simply drop the packet just like a firewall -- we want the @@ -1097,11 +1621,12 @@ pass: * loopback as the source address. */ error = ENETUNREACH; - lck_mtx_unlock(ip_mutex); goto done; } #endif m->m_pkthdr.csum_flags |= CSUM_IP; + tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4); + sw_csum = m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); @@ -1116,38 +1641,52 @@ pass: /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */ u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */ u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF; - m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ + m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ m->m_pkthdr.csum_data = (csumprev + offset) << 16 ; m->m_pkthdr.csum_data += offset; - sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ - } - else { + sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ + } else { /* let the software handle any UDP or TCP checksums */ sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); } + } else if (apple_hwcksum_tx == 0) { + sw_csum |= 
(CSUM_DELAY_DATA | CSUM_DELAY_IP) & + m->m_pkthdr.csum_flags; } - + if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } - - m->m_pkthdr.csum_flags &= IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + + if (apple_hwcksum_tx != 0) { + m->m_pkthdr.csum_flags &= + IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + } else { + m->m_pkthdr.csum_flags = 0; + } /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ - if ((u_short)ip->ip_len <= ifp->if_mtu || + if ((u_short)ip->ip_len <= ifp->if_mtu || tso || ifp->if_hwassist & CSUM_FRAGMENT) { + if (tso) + m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; + + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); } - + #ifndef __APPLE__ /* Record statistics for this interface address. */ if (!(flags & IP_FORWARDING) && ia != NULL) { @@ -1162,23 +1701,31 @@ pass: ipsec_delaux(m); #endif if (packetchain == 0) { - lck_mtx_unlock(ip_mutex); - error = dlil_output(ifp, PF_INET, m, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); - goto done; + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst, 0, adv); + goto done; } else { /* packet chaining allows us to reuse the route for all packets */ + bytecnt += m->m_pkthdr.len; + mppn = &m->m_nextpkt; m = m->m_nextpkt; if (m == NULL) { +#if PF +sendchain: +#endif /* PF */ if (pktcnt > ip_maxchainsent) ip_maxchainsent = pktcnt; + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0); //send - lck_mtx_unlock(ip_mutex); - error = dlil_output_list(ifp, PF_INET, packetlist, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); + error = dlil_output(ifp, PF_INET, packetlist, + ro->ro_rt, (struct sockaddr *)dst, 0, adv); pktcnt = 0; + bytecnt 
= 0; goto done; - + } m0 = m; pktcnt++; @@ -1189,7 +1736,9 @@ pass: * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ - if (ip->ip_off & IP_DF) { + + if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) || + pktcnt > 0) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU @@ -1198,20 +1747,105 @@ pass: * them, there is no way for one to update all its * routes when the MTU is changed. */ - if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) - && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) - && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { - ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + if (ro->ro_rt) { + RT_LOCK_SPIN(ro->ro_rt); + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + RT_UNLOCK(ro->ro_rt); + } + if (pktcnt > 0) { + m0 = packetlist; } - ipstat.ips_cantfrag++; + OSAddAtomic(1, &ipstat.ips_cantfrag); goto bad; } - len = (ifp->if_mtu - hlen) &~ 7; - if (len < 8) { - error = EMSGSIZE; + + error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum); + if (error != 0) { + m0 = m = NULL; goto bad; } + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; +#if IPSEC + /* clean ipsec history once it goes out of the node */ + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) + ipsec_delaux(m); +#endif + if (error == 0) { +#ifndef __APPLE__ + /* Record statistics for this interface address. 
*/ + if (ia != NULL) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } +#endif + if ((packetchain != 0) && (pktcnt > 0)) + panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst, 0, adv); + } else + m_freem(m); + } + + if (error == 0) + OSAddAtomic(1, &ipstat.ips_fragmented); + +done: + if (ia) { + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + if (ipsec_state.ro.ro_rt) + rtfree(ipsec_state.ro.ro_rt); + if (sp != NULL) { + KEYDEBUG(KEYDEBUG_IPSEC_STAMP, + printf("DP ip_output call free SP:%x\n", sp)); + key_freesp(sp, KEY_SADB_UNLOCKED); + } + } +#endif /* IPSEC */ + + KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); + return (error); +bad: + m_freem(m0); + goto done; +} + +int +ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) +{ + struct ip *ip, *mhip; + int len, hlen, mhlen, firstlen, off, error = 0; + struct mbuf **mnext = &m->m_nextpkt, *m0; + int nfrags = 1; + + ip = mtod(m, struct ip *); +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + + firstlen = len = (mtu - hlen) &~ 7; + if (len < 8) { + m_freem(m); + return (EMSGSIZE); + } + /* * if the interface will not calculate checksums on * fragmented packets, then do it here. @@ -1219,19 +1853,9 @@ pass: if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { in_delayed_cksum(m); - if (m == NULL) { - lck_mtx_unlock(ip_mutex); - return(ENOMEM); - } m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } - - { - int mhlen, firstlen = len; - struct mbuf **mnext = &m->m_nextpkt; - int nfrags = 1; - /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. 
@@ -1239,10 +1863,10 @@ pass: m0 = m; mhlen = sizeof (struct ip); for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { - MGETHDR(m, M_DONTWAIT, MT_HEADER); + MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (m == 0) { error = ENOBUFS; - ipstat.ips_odropped++; + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; @@ -1266,14 +1890,25 @@ pass: if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; /* ??? */ - ipstat.ips_odropped++; + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = 0; m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; + + M_COPY_PFTAG(m, m0); + m_set_service_class(m, m0->m_pkthdr.svc); + +#if CONFIG_MACF_NET + mac_netinet_fragment(m0, m); +#endif + +#if BYTE_ORDER != BIG_ENDIAN HTONS(mhip->ip_off); +#endif + mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { mhip->ip_sum = in_cksum(m, mhlen); @@ -1282,7 +1917,7 @@ pass: mnext = &m->m_nextpkt; nfrags++; } - ipstat.ips_ofragments += nfrags; + OSAddAtomic(nfrags, &ipstat.ips_ofragments); /* set first/last markers for fragment chain */ m->m_flags |= M_LASTFRAG; @@ -1298,134 +1933,139 @@ pass: m->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m->m_pkthdr.len); ip->ip_off |= IP_MF; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); } sendorfree: + if (error) + m_freem_list(m0); - KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, - ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - - lck_mtx_unlock(ip_mutex); - for (m = m0; m; m = m0) { - m0 = m->m_nextpkt; - m->m_nextpkt = 0; -#if IPSEC - /* clean ipsec history once it goes out of the node */ - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) - ipsec_delaux(m); -#endif - if (error == 0) { -#ifndef __APPLE__ - /* Record statistics for this interface address. 
*/ - if (ia != NULL) { - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; - } -#endif - if ((packetchain != 0) && (pktcnt > 0)) - panic("ip_output: mix of packet in packetlist is wrong=%x", packetlist); - error = dlil_output(ifp, PF_INET, m, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); - } else - m_freem(m); - } + return (error); +} - if (error == 0) - ipstat.ips_fragmented++; - } -done: - if (ia) { - ifafree(&ia->ia_ifa); - ia = NULL; - } -#if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - if (ro == &iproute && ro->ro_rt) { - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - } - if (sp != NULL) { - KEYDEBUG(KEYDEBUG_IPSEC_STAMP, - printf("DP ip_output call free SP:%x\n", sp)); - lck_mtx_lock(sadb_mutex); - key_freesp(sp); - lck_mtx_unlock(sadb_mutex); - } +static void +ip_out_cksum_stats(int proto, u_int32_t len) +{ + switch (proto) { + case IPPROTO_TCP: + tcp_out_cksum_stats(len); + break; + case IPPROTO_UDP: + udp_out_cksum_stats(len); + break; + default: + /* keep only TCP or UDP stats for now */ + break; } -#endif /* IPSEC */ - - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); - return (error); -bad: - m_freem(m0); - lck_mtx_unlock(ip_mutex); - goto done; } void -in_delayed_cksum_offset(struct mbuf *m, int ip_offset) +in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) { struct ip *ip; - u_short csum, offset; - - while (ip_offset > m->m_len) { + unsigned char buf[sizeof(struct ip)]; + u_short csum, offset, ip_len; + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf *m = m0; + int ip_offset_copy = ip_offset; + + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; - if (m) { - printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n"); + if (m == NULL) { + printf("in_delayed_cksum_withoffset failed - " + "ip_offset wasn't in the packet\n"); return; } } - - if (ip_offset + sizeof(struct ip) > m->m_len) { - printf("delayed m_pullup, 
m->len: %d off: %d p: %d\n", - m->m_len, ip_offset, ip->ip_p); - /* - * XXX - * this shouldn't happen - */ - m = m_pullup(m, ip_offset + sizeof(struct ip)); + + /* + * In case the IP header is not contiguous, or not 32-bit + * aligned, copy it to a local buffer. + */ + if ((ip_offset + sizeof(struct ip) > m->m_len) || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { +#if DEBUG + printf("delayed m_pullup, m->len: %d off: %d\n", + m->m_len, ip_offset); +#endif + m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); + + ip = (struct ip *)(void *)buf; + } else { + ip = (struct ip*)(void *)(m->m_data + ip_offset); } - + /* Gross */ if (ip_offset) { m->m_len -= ip_offset; m->m_data += ip_offset; } - - ip = mtod(m, struct ip*); + offset = IP_VHL_HL(ip->ip_vhl) << 2 ; - csum = in_cksum_skip(m, ip->ip_len, offset); - if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) + + /* + * We could be in the context of an IP or interface filter; in the + * former case, ip_len would be in host (correct) order while for + * the latter it would be in network order. Because of this, we + * attempt to interpret the length field by comparing it against + * the actual packet length. If the comparison fails, byte swap + * the length and check again. If it still fails, then the packet + * is bogus and we give up. 
+ */ + ip_len = ip->ip_len; + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { + ip_len = SWAP16(ip_len); + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { + printf("in_delayed_cksum_offset: ip_len %d (%d) " + "doesn't match actual length %d\n", ip->ip_len, + ip_len, (m0->m_pkthdr.len - ip_offset_copy)); + return; + } + } + + csum = in_cksum_skip(m, ip_len, offset); + + /* Update stats */ + ip_out_cksum_stats(ip->ip_p, ip_len - offset); + + if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; - offset += m->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */ - + offset += m0->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */ + /* Gross */ if (ip_offset) { if (M_LEADINGSPACE(m) < ip_offset) - panic("in_delayed_cksum_withoffset - chain modified!\n"); + panic("in_delayed_cksum_offset - chain modified!\n"); m->m_len += ip_offset; m->m_data -= ip_offset; } - if (offset > ip->ip_len) /* bogus offset */ + if (offset > ip_len) /* bogus offset */ return; + /* Insert the checksum in the existing chain */ if (offset + ip_offset + sizeof(u_short) > m->m_len) { - printf("delayed m_pullup, m->len: %d off: %d p: %d\n", + char tmp[2]; + +#if DEBUG + printf("delayed m_copyback, m->len: %d off: %d p: %d\n", m->m_len, offset + ip_offset, ip->ip_p); - /* - * XXX - * this shouldn't happen, but if it does, the - * correct behavior may be to insert the checksum - * in the existing chain instead of rearranging it. 
- */ - m = m_pullup(m, offset + ip_offset + sizeof(u_short)); +#endif + *(u_short *)(void *)tmp = csum; + m_copyback(m, offset + ip_offset, 2, tmp); + } else if (IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { + *(u_short *)(void *)(m->m_data + offset + ip_offset) = csum; + } else { + bcopy(&csum, (m->m_data + offset + ip_offset), sizeof (csum)); } - *(u_short *)(m->m_data + offset + ip_offset) = csum; } void @@ -1439,43 +2079,82 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) { struct ip* ip = NULL; int hlen = 0; - - while (ip_offset > m->m_len) { + unsigned char buf[sizeof(struct ip)]; + int swapped = 0; + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf* m0 = m; + size_t ip_offset_copy = ip_offset; + + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; - if (m) { - printf("in_cksum_offset failed - ip_offset wasn't in the packet\n"); + if (m == NULL) { + printf("in_cksum_offset failed - ip_offset wasn't " + "in the packet\n"); return; } } - - if (ip_offset + sizeof(struct ip) > m->m_len) { - printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %d\n", - m->m_len, ip_offset); - /* - * XXX - * this shouldn't happen - */ - m = m_pullup(m, ip_offset + sizeof(struct ip)); + + /* + * In case the IP header is not contiguous, or not 32-bit + * aligned, copy it to a local buffer. 
+ */ + if ((ip_offset + sizeof(struct ip) > m->m_len) || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { +#if DEBUG + printf("in_cksum_offset - delayed m_pullup, m->len: %d " + "off: %lu\n", m->m_len, ip_offset); +#endif + m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); + + ip = (struct ip *)(void *)buf; + ip->ip_sum = 0; + m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, + (caddr_t)&ip->ip_sum); + } else { + ip = (struct ip*)(void *)(m->m_data + ip_offset); + ip->ip_sum = 0; } - + /* Gross */ if (ip_offset) { m->m_len -= ip_offset; m->m_data += ip_offset; } - ip = mtod(m, struct ip*); - #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif - + /* + * We could be in the context of an IP or interface filter; in the + * former case, ip_len would be in host order while for the latter + * it would be in network (correct) order. Because of this, we + * attempt to interpret the length field by comparing it against + * the actual packet length. If the comparison fails, byte swap + * the length and check again. If it still fails, then the packet + * is bogus and we give up. + */ + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { + ip->ip_len = SWAP16(ip->ip_len); + swapped = 1; + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { + ip->ip_len = SWAP16(ip->ip_len); + printf("in_cksum_offset: ip_len %d (%d) " + "doesn't match actual length %lu\n", + ip->ip_len, SWAP16(ip->ip_len), + (m0->m_pkthdr.len - ip_offset_copy)); + return; + } + } + ip->ip_sum = 0; ip->ip_sum = in_cksum(m, hlen); - + if (swapped) + ip->ip_len = SWAP16(ip->ip_len); + /* Gross */ if (ip_offset) { if (M_LEADINGSPACE(m) < ip_offset) @@ -1483,6 +2162,27 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) m->m_len += ip_offset; m->m_data -= ip_offset; } + + /* + * Insert the checksum in the existing chain if IP header not + * contiguous, or if it's not 32-bit aligned, i.e. 
all the cases + * where it was copied to a local buffer. + */ + if (ip_offset + sizeof(struct ip) > m->m_len) { + char tmp[2]; + +#if DEBUG + printf("in_cksum_offset m_copyback, m->len: %u off: %lu " + "p: %d\n", m->m_len, + ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); +#endif + *(u_short *)(void *)tmp = ip->ip_sum; + m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp); + } else if (!IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { + bcopy(&ip->ip_sum, + (m->m_data + ip_offset + offsetof(struct ip, ip_sum)), + sizeof (u_short)); + } } /* @@ -1509,10 +2209,13 @@ ip_insertoptions(m, opt, phlen) if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { - MGETHDR(n, M_DONTWAIT, MT_HEADER); + MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (n == 0) return (m); n->m_pkthdr.rcvif = 0; +#if CONFIG_MACF_NET + mac_mbuf_label_copy(m, n); +#endif n->m_pkthdr.len = m->m_pkthdr.len + optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); @@ -1610,7 +2313,8 @@ ip_ctloutput(so, sopt) error = EMSGSIZE; break; } - MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_HEADER); + MGET(m, sopt->sopt_p != kernproc ? 
M_WAIT : M_DONTWAIT, + MT_HEADER); if (m == 0) { error = ENOBUFS; break; @@ -1632,9 +2336,7 @@ ip_ctloutput(so, sopt) case IP_RECVDSTADDR: case IP_RECVIF: case IP_RECVTTL: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif + case IP_RECVPKTINFO: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1674,23 +2376,100 @@ ip_ctloutput(so, sopt) OPTSET(INP_RECVTTL); break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - OPTSET(INP_FAITH); + case IP_RECVPKTINFO: + OPTSET(INP_PKTINFO); break; -#endif } break; #undef OPTSET - case IP_MULTICAST_IF: - case IP_MULTICAST_VIF: - case IP_MULTICAST_TTL: - case IP_MULTICAST_LOOP: - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - error = ip_setmoptions(sopt, &inp->inp_moptions); - break; +#if CONFIG_FORCE_OUT_IFP + /* + * Apple private interface, similar to IP_BOUND_IF, except + * that the parameter is a NULL-terminated string containing + * the name of the network interface; an emptry string means + * unbind. Applications are encouraged to use IP_BOUND_IF + * instead, as that is the current "official" API. 
+ */ + case IP_FORCE_OUT_IFP: { + char ifname[IFNAMSIZ]; + unsigned int ifscope; + + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + /* Verify interface name parameter is sane */ + if (sopt->sopt_valsize > sizeof(ifname)) { + error = EINVAL; + break; + } + + /* Copy the interface name */ + if (sopt->sopt_valsize != 0) { + error = sooptcopyin(sopt, ifname, + sizeof (ifname), sopt->sopt_valsize); + if (error) + break; + } + + if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { + /* Unbind this socket from any interface */ + ifscope = IFSCOPE_NONE; + } else { + ifnet_t ifp; + + /* Verify name is NULL terminated */ + if (ifname[sopt->sopt_valsize - 1] != '\0') { + error = EINVAL; + break; + } + + /* Bail out if given bogus interface name */ + if (ifnet_find_by_name(ifname, &ifp) != 0) { + error = ENXIO; + break; + } + + /* Bind this socket to this interface */ + ifscope = ifp->if_index; + + /* + * Won't actually free; since we don't release + * this later, we should do it now. + */ + ifnet_release(ifp); + } + error = inp_bindif(inp, ifscope); + } + break; +#endif + /* + * Multicast socket options are processed by the in_mcast + * module. 
+ */ + case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_setmoptions(inp, sopt); + break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, @@ -1729,29 +2508,97 @@ ip_ctloutput(so, sopt) struct mbuf *m; int optname; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; - priv = (sopt->sopt_p != NULL && - proc_suser(sopt->sopt_p) != 0) ? 0 : 1; + priv = (proc_suser(sopt->sopt_p) == 0); if (m) { req = mtod(m, caddr_t); len = m->m_len; } optname = sopt->sopt_name; - lck_mtx_lock(sadb_mutex); error = ipsec4_set_policy(inp, optname, req, len, priv); - lck_mtx_unlock(sadb_mutex); m_freem(m); break; } #endif /*IPSEC*/ +#if TRAFFIC_MGT + case IP_TRAFFIC_MGT_BACKGROUND: + { + unsigned background = 0; + error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background)); + if (error) + break; + + if (background) { + socket_set_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); + } else { + socket_clear_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); + } + + break; + } +#endif /* TRAFFIC_MGT */ + + /* + * On a multihomed system, scoped routing can be used to + * restrict the source interface used for sending packets. + * The socket option IP_BOUND_IF binds a particular AF_INET + * socket to an interface such that data sent on the socket + * is restricted to that interface. 
This is unlike the + * SO_DONTROUTE option where the routing table is bypassed; + * therefore it allows for a greater flexibility and control + * over the system behavior, and does not place any restriction + * on the destination address type (e.g. unicast, multicast, + * or broadcast if applicable) or whether or not the host is + * directly reachable. Note that in the multicast transmit + * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over + * IP_BOUND_IF, since the former practically bypasses the + * routing table; in this case, IP_BOUND_IF sets the default + * interface used for sending multicast packets in the absence + * of an explicit multicast transmit interface. + */ + case IP_BOUND_IF: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + error = inp_bindif(inp, optval); + break; + + case IP_NO_IFT_CELLULAR: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + error = inp_nocellular(inp, optval); + break; + + case IP_OUT_IF: + /* This option is not settable */ + error = EINVAL; + break; + default: error = ENOPROTOOPT; break; @@ -1779,9 +2626,7 @@ ip_ctloutput(so, sopt) case IP_RECVIF: case IP_RECVTTL: case IP_PORTRANGE: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif + case IP_RECVPKTINFO: switch (sopt->sopt_name) { case IP_TOS: @@ -1823,22 +2668,20 @@ ip_ctloutput(so, sopt) optval = 0; break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - optval = OPTBIT(INP_FAITH); + case IP_RECVPKTINFO: + optval = OPTBIT(INP_PKTINFO); break; -#endif } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case 
IP_MULTICAST_LOOP: - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - error = ip_getmoptions(sopt, inp->inp_moptions); + case IP_MSFILTER: + error = inp_getmoptions(inp, sopt); break; #if IPSEC @@ -1852,9 +2695,7 @@ ip_ctloutput(so, sopt) req = mtod(m, caddr_t); len = m->m_len; } - lck_mtx_lock(sadb_mutex); error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); - lck_mtx_unlock(sadb_mutex); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) @@ -1863,6 +2704,32 @@ ip_ctloutput(so, sopt) } #endif /*IPSEC*/ +#if TRAFFIC_MGT + case IP_TRAFFIC_MGT_BACKGROUND: + { + unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); + return (sooptcopyout(sopt, &background, sizeof(background))); + break; + } +#endif /* TRAFFIC_MGT */ + + case IP_BOUND_IF: + if (inp->inp_flags & INP_BOUND_IF) + optval = inp->inp_boundifp->if_index; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_NO_IFT_CELLULAR: + optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_OUT_IF: + optval = (inp->inp_last_outifp != NULL) ? + inp->inp_last_outifp->if_index : 0; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -1878,10 +2745,10 @@ ip_ctloutput(so, sopt) * with destination address if source routed. */ static int -ip_pcbopts(optname, pcbopt, m) - int optname; - struct mbuf **pcbopt; - register struct mbuf *m; +ip_pcbopts( + __unused int optname, + struct mbuf **pcbopt, + register struct mbuf *m) { register int cnt, optlen; register u_char *cp; @@ -1977,460 +2844,138 @@ bad: return (EINVAL); } -/* - * XXX - * The whole multicast option thing needs to be re-thought. - * Several of these options are equally applicable to non-multicast - * transmission, and one (IP_MULTICAST_TTL) totally duplicates a - * standard option (IP_TTL). 
- */ - -/* - * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. - */ -static struct ifnet * -ip_multicast_if(a, ifindexp) - struct in_addr *a; - int *ifindexp; +void +ip_moptions_init(void) { - int ifindex; - struct ifnet *ifp; + PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug)); - if (ifindexp) - *ifindexp = 0; - if (ntohl(a->s_addr) >> 24 == 0) { - ifindex = ntohl(a->s_addr) & 0xffffff; - ifnet_head_lock_shared(); - if (ifindex < 0 || if_index < ifindex) { - ifnet_head_done(); - return NULL; - } - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - if (ifindexp) - *ifindexp = ifindex; - } else { - INADDR_TO_IFP(*a, ifp); + imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) : + sizeof (struct ip_moptions_dbg); + + imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0, + IMO_ZONE_NAME); + if (imo_zone == NULL) { + panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME); + /* NOTREACHED */ } - return ifp; + zone_change(imo_zone, Z_EXPAND, TRUE); } -/* - * Set the IP multicast options in response to user setsockopt(). - */ -static int -ip_setmoptions(sopt, imop) - struct sockopt *sopt; - struct ip_moptions **imop; +void +imo_addref(struct ip_moptions *imo, int locked) { - int error = 0; - int i; - struct in_addr addr; - struct ip_mreq mreq; - struct ifnet *ifp = NULL; - struct ip_moptions *imo = *imop; - int ifindex; + if (!locked) + IMO_LOCK(imo); + else + IMO_LOCK_ASSERT_HELD(imo); - if (imo == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. 
- */ - error = ip_createmoptions(imop); - if (error != 0) - return error; - imo = *imop; + if (++imo->imo_refcnt == 0) { + panic("%s: imo %p wraparound refcnt\n", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, TRUE); } - switch (sopt->sopt_name) { - /* store an index number for the vif you wanna use in the send */ - case IP_MULTICAST_VIF: - if (legal_vif_num == 0) { - error = EOPNOTSUPP; - break; - } - error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - if (error) - break; - if (!legal_vif_num(i) && (i != -1)) { - error = EINVAL; - break; - } - imo->imo_multicast_vif = i; - break; - - case IP_MULTICAST_IF: - /* - * Select the interface for outgoing multicast packets. - */ - error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); - if (error) - break; - /* - * INADDR_ANY is used to remove a previous selection. - * When no interface is selected, a default one is - * chosen every time a multicast packet is sent. - */ - if (addr.s_addr == INADDR_ANY) { - imo->imo_multicast_ifp = NULL; - break; - } - /* - * The selected interface is identified by its local - * IP address. Find the interface and confirm that - * it supports multicasting. - */ - ifp = ip_multicast_if(&addr, &ifindex); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - imo->imo_multicast_ifp = ifp; - if (ifindex) - imo->imo_multicast_addr = addr; - else - imo->imo_multicast_addr.s_addr = INADDR_ANY; - break; - - case IP_MULTICAST_TTL: - /* - * Set the IP time-to-live for outgoing multicast packets. - * The original multicast API required a char argument, - * which is inconsistent with the rest of the socket API. - * We allow either a char or an int. 
- */ - if (sopt->sopt_valsize == 1) { - u_char ttl; - error = sooptcopyin(sopt, &ttl, 1, 1); - if (error) - break; - imo->imo_multicast_ttl = ttl; - } else { - u_int ttl; - error = sooptcopyin(sopt, &ttl, sizeof ttl, - sizeof ttl); - if (error) - break; - if (ttl > 255) - error = EINVAL; - else - imo->imo_multicast_ttl = ttl; - } - break; - - case IP_MULTICAST_LOOP: - /* - * Set the loopback flag for outgoing multicast packets. - * Must be zero or one. The original multicast API required a - * char argument, which is inconsistent with the rest - * of the socket API. We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char loop; - error = sooptcopyin(sopt, &loop, 1, 1); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } else { - u_int loop; - error = sooptcopyin(sopt, &loop, sizeof loop, - sizeof loop); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } - break; - - case IP_ADD_MEMBERSHIP: - /* - * Add a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_addmembership(imo, &mreq); - break; + if (!locked) + IMO_UNLOCK(imo); +} - case IP_DROP_MEMBERSHIP: - /* - * Drop a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_dropmembership(imo, &mreq); - break; +void +imo_remref(struct ip_moptions *imo) +{ + int i; - default: - error = EOPNOTSUPP; - break; + IMO_LOCK(imo); + if (imo->imo_refcnt == 0) { + panic("%s: imo %p negative refcnt", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, FALSE); } - /* - * If all options have default values, no need to keep the mbuf. 
- */ - if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == -1 && - imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && - imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && - imo->imo_num_memberships == 0) { - FREE(*imop, M_IPMOPTS); - *imop = NULL; + --imo->imo_refcnt; + if (imo->imo_refcnt > 0) { + IMO_UNLOCK(imo); + return; } - return (error); -} + for (i = 0; i < imo->imo_num_memberships; ++i) { + struct in_mfilter *imf; -/* - * Set the IP multicast options in response to user setsockopt(). - */ -__private_extern__ int -ip_createmoptions( - struct ip_moptions **imop) -{ - struct ip_moptions *imo; - imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - - if (imo == NULL) - return (ENOBUFS); - *imop = imo; - imo->imo_multicast_ifp = NULL; - imo->imo_multicast_addr.s_addr = INADDR_ANY; - imo->imo_multicast_vif = -1; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - imo->imo_num_memberships = 0; - - return 0; -} + imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL; + if (imf != NULL) + imf_leave(imf); -/* - * Add membership to an IPv4 multicast. - */ -__private_extern__ int -ip_addmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - struct route ro; - struct sockaddr_in *dst; - struct ifnet *ifp = NULL; - int error = 0; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } - /* - * If no interface address was provided, use the interface of - * the route to the given multicast address. 
- */ - if (mreq->imr_interface.s_addr == INADDR_ANY) { - bzero((caddr_t)&ro, sizeof(ro)); - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_len = sizeof(*dst); - dst->sin_family = AF_INET; - dst->sin_addr = mreq->imr_multiaddr; - rtalloc(&ro); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - rtfree(ro.ro_rt); - } - else { - /* If there's no default route, try using loopback */ - mreq->imr_interface.s_addr = INADDR_LOOPBACK; - } - } - - if (ifp == NULL) { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - } + (void) in_leavegroup(imo->imo_membership[i], imf); - /* - * See if we found an interface, and confirm that it - * supports multicast. - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - return error; - } - /* - * See if the membership already exists or if all the - * membership slots are full. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if (imo->imo_membership[i]->inm_ifp == ifp && - imo->imo_membership[i]->inm_addr.s_addr - == mreq->imr_multiaddr.s_addr) - break; - } - if (i < imo->imo_num_memberships) { - error = EADDRINUSE; - return error; + if (imf != NULL) + imf_purge(imf); + + INM_REMREF(imo->imo_membership[i]); + imo->imo_membership[i] = NULL; } - if (i == IP_MAX_MEMBERSHIPS) { - error = ETOOMANYREFS; - return error; + imo->imo_num_memberships = 0; + if (imo->imo_mfilters != NULL) { + FREE(imo->imo_mfilters, M_INMFILTER); + imo->imo_mfilters = NULL; } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - if ((imo->imo_membership[i] = - in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { - error = ENOBUFS; - return error; + if (imo->imo_membership != NULL) { + FREE(imo->imo_membership, M_IPMOPTS); + imo->imo_membership = NULL; } - ++imo->imo_num_memberships; - - return error; -} + IMO_UNLOCK(imo); -/* - * Drop membership of an IPv4 multicast. 
- */ -__private_extern__ int -ip_dropmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - int error = 0; - struct ifnet* ifp = NULL; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } + lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - */ - if (mreq->imr_interface.s_addr == INADDR_ANY) - ifp = NULL; - else { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - if (ifp == NULL) { - error = EADDRNOTAVAIL; - return error; - } + if (!(imo->imo_debug & IFD_ALLOC)) { + panic("%s: imo %p cannot be freed", __func__, imo); + /* NOTREACHED */ } - /* - * Find the membership in the membership array. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if ((ifp == NULL || - imo->imo_membership[i]->inm_ifp == ifp) && - imo->imo_membership[i]->inm_addr.s_addr == - mreq->imr_multiaddr.s_addr) - break; - } - if (i == imo->imo_num_memberships) { - error = EADDRNOTAVAIL; - return error; - } - /* - * Give up the multicast address record to which the - * membership points. - */ - in_delmulti(&imo->imo_membership[i]); - /* - * Remove the gap in the membership array. - */ - for (++i; i < imo->imo_num_memberships; ++i) - imo->imo_membership[i-1] = imo->imo_membership[i]; - --imo->imo_num_memberships; - - return error; + zfree(imo_zone, imo); } -/* - * Return the IP multicast options in response to user getsockopt(). 
- */ -static int -ip_getmoptions(sopt, imo) - struct sockopt *sopt; - register struct ip_moptions *imo; +static void +imo_trace(struct ip_moptions *imo, int refhold) { - struct in_addr addr; - struct in_ifaddr *ia; - int error, optval; - u_char coptval; - - error = 0; - switch (sopt->sopt_name) { - case IP_MULTICAST_VIF: - if (imo != NULL) - optval = imo->imo_multicast_vif; - else - optval = -1; - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - case IP_MULTICAST_IF: - if (imo == NULL || imo->imo_multicast_ifp == NULL) - addr.s_addr = INADDR_ANY; - else if (imo->imo_multicast_addr.s_addr) { - /* return the value user has set */ - addr = imo->imo_multicast_addr; - } else { - IFP_TO_IA(imo->imo_multicast_ifp, ia); - addr.s_addr = (ia == NULL) ? INADDR_ANY - : IA_SIN(ia)->sin_addr.s_addr; - } - error = sooptcopyout(sopt, &addr, sizeof addr); - break; - - case IP_MULTICAST_TTL: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_TTL; - else - optval = coptval = imo->imo_multicast_ttl; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - case IP_MULTICAST_LOOP: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_LOOP; - else - optval = coptval = imo->imo_multicast_loop; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - default: - error = ENOPROTOOPT; - break; + struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(imo->imo_debug & IFD_DEBUG)) { + panic("%s: imo %p has no debug structure", __func__, imo); + /* NOTREACHED */ + } + if (refhold) { + cnt = &imo_dbg->imo_refhold_cnt; + tr = imo_dbg->imo_refhold; + } else { + cnt = &imo_dbg->imo_refrele_cnt; + tr = imo_dbg->imo_refrele; } - return (error); + + idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); 
} -/* - * Discard the IP multicast options. - */ -void -ip_freemoptions(imo) - register struct ip_moptions *imo; +struct ip_moptions * +ip_allocmoptions(int how) { - register int i; + struct ip_moptions *imo; + imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone); if (imo != NULL) { - for (i = 0; i < imo->imo_num_memberships; ++i) - in_delmulti(&imo->imo_membership[i]); - FREE(imo, M_IPMOPTS); + bzero(imo, imo_size); + lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr); + imo->imo_debug |= IFD_ALLOC; + if (imo_debug != 0) { + imo->imo_debug |= IFD_DEBUG; + imo->imo_trace = imo_trace; + } + IMO_ADDREF(imo); } + + return (imo); } /* @@ -2449,83 +2994,370 @@ ip_mloopback(ifp, m, dst, hlen) { register struct ip *ip; struct mbuf *copym; + int sw_csum = (apple_hwcksum_tx == 0); copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); - if (copym != NULL) { - /* - * We don't bother to fragment if the IP length is greater - * than the interface's MTU. Can this possibly matter? - */ - ip = mtod(copym, struct ip *); - HTONS(ip->ip_len); - HTONS(ip->ip_off); - ip->ip_sum = 0; - ip->ip_sum = in_cksum(copym, hlen); - /* - * NB: - * It's not clear whether there are any lingering - * reentrancy problems in other areas which might - * be exposed by using ip_input directly (in - * particular, everything which modifies the packet - * in-place). Yet another option is using the - * protosw directly to deliver the looped back - * packet. For the moment, we'll err on the side - * of safety by using if_simloop(). - */ + + if (copym == NULL) + return; + + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter? 
+ */ + ip = mtod(copym, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + ip->ip_sum = 0; + ip->ip_sum = in_cksum(copym, hlen); + /* + * NB: + * It's not clear whether there are any lingering + * reentrancy problems in other areas which might + * be exposed by using ip_input directly (in + * particular, everything which modifies the packet + * in-place). Yet another option is using the + * protosw directly to deliver the looped back + * packet. For the moment, we'll err on the side + * of safety by using if_simloop(). + */ #if 1 /* XXX */ - if (dst->sin_family != AF_INET) { - printf("ip_mloopback: bad address family %d\n", - dst->sin_family); - dst->sin_family = AF_INET; - } + if (dst->sin_family != AF_INET) { + printf("ip_mloopback: bad address family %d\n", + dst->sin_family); + dst->sin_family = AF_INET; + } #endif - /* - * Mark checksum as valid or calculate checksum for loopback. - * - * This is done this way because we have to embed the ifp of - * the interface we will send the original copy of the packet - * out on in the mbuf. ip_input will check if_hwassist of the - * embedded ifp and ignore all csum_flags if if_hwassist is 0. - * The UDP checksum has not been calculated yet. - */ - if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - if (IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) { - copym->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR | - CSUM_IP_CHECKED | CSUM_IP_VALID; - copym->m_pkthdr.csum_data = 0xffff; - } else { - NTOHS(ip->ip_len); - in_delayed_cksum(copym); - HTONS(ip->ip_len); - } + * Mark checksum as valid or calculate checksum for loopback. + * + * This is done this way because we have to embed the ifp of + * the interface we will send the original copy of the packet + * out on in the mbuf. ip_input will check if_hwassist of the + * embedded ifp and ignore all csum_flags if if_hwassist is 0. + * The UDP checksum has not been calculated yet. 
+ */ + if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) { + if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) { + copym->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + copym->m_pkthdr.csum_data = 0xffff; + } else { + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); +#endif + + in_delayed_cksum(copym); + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); +#endif + + } } + /* + * TedW: + * We need to send all loopback traffic down to dlil in case + * a filter has tapped-in. + */ + + /* + * Stuff the 'real' ifp into the pkthdr, to be used in matching + * in ip_input(); we need the loopback ifp/dl_tag passed as args + * to make the loopback driver compliant with the data link + * requirements. + */ + if (lo_ifp) { + copym->m_pkthdr.rcvif = ifp; + dlil_output(lo_ifp, PF_INET, copym, 0, + (struct sockaddr *) dst, 0, NULL); + } else { + printf("Warning: ip_output call to dlil_find_dltag failed!\n"); + m_freem(copym); + } +} + +/* + * Given a source IP address (and route, if available), determine the best + * interface to send the packet from. Checking for (and updating) the + * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done + * without any locks based on the assumption that ip_output() is single- + * threaded per-pcb, i.e. for any given pcb there can only be one thread + * performing output at the IP layer. + * + * This routine is analogous to in6_selectroute() for IPv6. 
+ */ +static struct ifaddr * +in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) +{ + struct ifaddr *ifa = NULL; + struct in_addr src = ip->ip_src; + struct in_addr dst = ip->ip_dst; + struct ifnet *rt_ifp; + char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN]; + + if (ip_select_srcif_debug) { + (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src)); + (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst)); + } + + if (ro->ro_rt != NULL) + RT_LOCK(ro->ro_rt); + + rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; + + /* + * Given the source IP address, find a suitable source interface + * to use for transmission; if the caller has specified a scope, + * optimize the search by looking at the addresses only for that + * interface. This is still suboptimal, however, as we need to + * traverse the per-interface list. + */ + if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) { + unsigned int scope = ifscope; /* - * TedW: - * We need to send all loopback traffic down to dlil in case - * a filter has tapped-in. + * If no scope is specified and the route is stale (pointing + * to a defunct interface) use the current primary interface; + * this happens when switching between interfaces configured + * with the same IP address. Otherwise pick up the scope + * information from the route; the ULP may have looked up a + * correct route and we just need to verify it here and mark + * it with the ROF_SRCIF_SELECTED flag below. 
*/ + if (scope == IFSCOPE_NONE) { + scope = rt_ifp->if_index; + if (scope != get_primary_ifscope(AF_INET) && + ro->ro_rt->generation_id != route_generation) + scope = get_primary_ifscope(AF_INET); + } + + ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope); + + if (ifa == NULL && ip->ip_p != IPPROTO_UDP && + ip->ip_p != IPPROTO_TCP && ipforwarding) { + /* + * If forwarding is enabled, and if the packet isn't + * TCP or UDP, check if the source address belongs + * to one of our own interfaces; if so, demote the + * interface scope and do a route lookup right below. + */ + ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); + if (ifa != NULL) { + IFA_REMREF(ifa); + ifa = NULL; + ifscope = IFSCOPE_NONE; + } + } + + if (ip_select_srcif_debug && ifa != NULL) { + if (ro->ro_rt != NULL) { + printf("%s->%s ifscope %d->%d ifa_if %s " + "ro_if %s\n", s_src, s_dst, ifscope, + scope, if_name(ifa->ifa_ifp), + if_name(rt_ifp)); + } else { + printf("%s->%s ifscope %d->%d ifa_if %s\n", + s_src, s_dst, ifscope, scope, + if_name(ifa->ifa_ifp)); + } + } + } + + /* + * Slow path; search for an interface having the corresponding source + * IP address if the scope was not specified by the caller, and: + * + * 1) There currently isn't any route, or, + * 2) The interface used by the route does not own that source + * IP address; in this case, the route will get blown away + * and we'll do a more specific scoped search using the newly + * found interface. + */ + if (ifa == NULL && ifscope == IFSCOPE_NONE) { + ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); /* - * Stuff the 'real' ifp into the pkthdr, to be used in matching - * in ip_input(); we need the loopback ifp/dl_tag passed as args - * to make the loopback driver compliant with the data link - * requirements. + * If we have the IP address, but not the route, we don't + * really know whether or not it belongs to the correct + * interface (it could be shared across multiple interfaces.) 
+ * The only way to find out is to do a route lookup. */ - if (lo_ifp) { - copym->m_pkthdr.rcvif = ifp; - dlil_output(lo_ifp, PF_INET, copym, 0, (struct sockaddr *) dst, 0); - } else { - printf("Warning: ip_output call to dlil_find_dltag failed!\n"); - m_freem(copym); + if (ifa != NULL && ro->ro_rt == NULL) { + struct rtentry *rt; + struct sockaddr_in sin; + struct ifaddr *oifa = NULL; + + bzero(&sin, sizeof (sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof (sin); + sin.sin_addr = dst; + + lck_mtx_lock(rnh_lock); + if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL, + rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) { + RT_LOCK(rt); + /* + * If the route uses a different interface, + * use that one instead. The IP address of + * the ifaddr that we pick up here is not + * relevant. + */ + if (ifa->ifa_ifp != rt->rt_ifp) { + oifa = ifa; + ifa = rt->rt_ifa; + IFA_ADDREF(ifa); + RT_UNLOCK(rt); + } else { + RT_UNLOCK(rt); + } + rtfree_locked(rt); + } + lck_mtx_unlock(rnh_lock); + + if (oifa != NULL) { + struct ifaddr *iifa; + + /* + * See if the interface pointed to by the + * route is configured with the source IP + * address of the packet. + */ + iifa = (struct ifaddr *)ifa_foraddr_scoped( + src.s_addr, ifa->ifa_ifp->if_index); + + if (iifa != NULL) { + /* + * Found it; drop the original one + * as well as the route interface + * address, and use this instead. + */ + IFA_REMREF(oifa); + IFA_REMREF(ifa); + ifa = iifa; + } else if (!ipforwarding || + (rt->rt_flags & RTF_GATEWAY)) { + /* + * This interface doesn't have that + * source IP address; drop the route + * interface address and just use the + * original one, and let the caller + * do a scoped route lookup. 
+ */ + IFA_REMREF(ifa); + ifa = oifa; + } else { + /* + * Forwarding is enabled and the source + * address belongs to one of our own + * interfaces which isn't the outgoing + * interface, and we have a route, and + * the destination is on a network that + * is directly attached (onlink); drop + * the original one and use the route + * interface address instead. + */ + IFA_REMREF(oifa); + } + } + } else if (ifa != NULL && ro->ro_rt != NULL && + !(ro->ro_rt->rt_flags & RTF_GATEWAY) && + ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) { + /* + * Forwarding is enabled and the source address belongs + * to one of our own interfaces which isn't the same + * as the interface used by the known route; drop the + * original one and use the route interface address. + */ + IFA_REMREF(ifa); + ifa = ro->ro_rt->rt_ifa; + IFA_ADDREF(ifa); } -/* if_simloop(ifp, copym, (struct sockaddr *)dst, 0);*/ + if (ip_select_srcif_debug && ifa != NULL) { + printf("%s->%s ifscope %d ifa_if %s\n", + s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); + } } + + if (ro->ro_rt != NULL) + RT_LOCK_ASSERT_HELD(ro->ro_rt); + /* + * If there is a non-loopback route with the wrong interface, or if + * there is no interface configured with such an address, blow it + * away. Except for local/loopback, we look for one with a matching + * interface scope/index. 
+ */ + if (ro->ro_rt != NULL && + (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) || + !(ro->ro_rt->rt_flags & RTF_UP))) { + if (ip_select_srcif_debug) { + if (ifa != NULL) { + printf("%s->%s ifscope %d ro_if %s != " + "ifa_if %s (cached route cleared)\n", + s_src, s_dst, ifscope, if_name(rt_ifp), + if_name(ifa->ifa_ifp)); + } else { + printf("%s->%s ifscope %d ro_if %s " + "(no ifa_if found)\n", + s_src, s_dst, ifscope, if_name(rt_ifp)); + } + } + + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + ro->ro_flags &= ~ROF_SRCIF_SELECTED; + + /* + * If the destination is IPv4 LLA and the route's interface + * doesn't match the source interface, then the source IP + * address is wrong; it most likely belongs to the primary + * interface associated with the IPv4 LL subnet. Drop the + * packet rather than letting it go out and return an error + * to the ULP. This actually applies not only to IPv4 LL + * but other shared subnets; for now we explicitly test only + * for the former case and save the latter for the future. + */ + if (IN_LINKLOCAL(ntohl(dst.s_addr)) && + !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) { + IFA_REMREF(ifa); + ifa = NULL; + } + } + + if (ip_select_srcif_debug && ifa == NULL) { + printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n", + s_src, s_dst, ifscope); + } + + /* + * If there is a route, mark it accordingly. If there isn't one, + * we'll get here again during the next transmit (possibly with a + * route) and the flag will get set at that point. For IPv4 LLA + * destination, mark it only if the route has been fully resolved; + * otherwise we want to come back here again when the route points + * to the interface over which the ARP reply arrives. 
+ */ + if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) || + (ro->ro_rt->rt_gateway->sa_family == AF_LINK && + SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) { + ro->ro_flags |= ROF_SRCIF_SELECTED; + ro->ro_rt->generation_id = route_generation; + } + + if (ro->ro_rt != NULL) + RT_UNLOCK(ro->ro_rt); + + return (ifa); }