X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..316670eb35587141e969394ae8537d66b9211e80:/bsd/netinet/ip_output.c diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index c065797e7..aece80368 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,9 +79,17 @@ #include #include #include +#include + +#include +#include #include +#include +#include #include +#include +#include #include #include @@ -96,8 +104,6 @@ #include #endif -#include "faith.h" - #include #include #include @@ -121,11 +127,16 @@ #include #include +#include #if DUMMYNET #include #endif +#if PF +#include +#endif /* PF */ + #if IPFIREWALL_FORWARD_DEBUG #define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ @@ -133,30 +144,21 @@ (ntohl(a.s_addr))&0xFF); #endif - u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); -static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *, int); -static int ip_getmoptions(struct sockopt *, struct ip_moptions *); static int ip_pcbopts(int, struct mbuf **, struct mbuf *); -static int ip_setmoptions(struct sockopt *, struct ip_moptions **); +static void imo_trace(struct ip_moptions *, int); static void ip_out_cksum_stats(int, u_int32_t); +static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); -int ip_createmoptions(struct ip_moptions **imop); -int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); int ip_optcopy(struct ip *, struct ip *); void in_delayed_cksum_offset(struct mbuf *, int ); void in_cksum_offset(struct mbuf* , size_t ); -extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); - -extern u_long route_generation; - extern struct protosw inetsw[]; extern struct ip_linklocal_stat ip_linklocal_stat; @@ -168,13 +170,50 @@ extern int ipsec_bypass; #endif static int ip_maxchainsent = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, "use dlil_output_list"); #if DEBUG static int forge_ce = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0, "Forge ECN CE"); #endif /* DEBUG */ + +static int ip_select_srcif_debug = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_select_srcif_debug, 0, "log source interface selection debug info"); + +#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE; + +struct ip_moptions_dbg { + struct ip_moptions imo; /* ip_moptions */ + u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */ + u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */ + /* + * Alloc and free callers. + */ + ctrace_t imo_alloc; + ctrace_t imo_free; + /* + * Circular lists of IMO_ADDREF and IMO_REMREF callers. + */ + ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE]; + ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE]; +}; + +#if DEBUG +static unsigned int imo_debug = 1; /* debugging (enabled) */ +#else +static unsigned int imo_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int imo_size; /* size of zone element */ +static struct zone *imo_zone; /* zone for ip_moptions */ + +#define IMO_ZONE_MAX 64 /* maximum elements in zone */ +#define IMO_ZONE_NAME "ip_moptions" /* zone name */ + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). @@ -188,10 +227,10 @@ ip_output( struct route *ro, int flags, struct ip_moptions *imo, - struct ifnet *ifp) + struct ip_out_args *ipoa) { int error; - error = ip_output_list(m0, 0, opt, ro, flags, imo, ifp); + error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa); return error; } @@ -208,7 +247,6 @@ ip_output( * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified] * key_spdacquire:??? [IPSEC] * ipsec4_output:??? [IPSEC] - * :??? [firewall] * ip_dn_io_ptr:??? [dummynet] * dlil_output:??? [DLIL] * dlil_output_list:??? [DLIL] @@ -225,91 +263,119 @@ ip_output_list( struct route *ro, int flags, struct ip_moptions *imo, -#if CONFIG_FORCE_OUT_IFP - struct ifnet *pdp_ifp -#else - __unused struct ifnet *unused_ifp -#endif - ) + struct ip_out_args *ipoa) { - struct ip *ip, *mhip; + struct ip *ip; struct ifnet *ifp = NULL; - struct mbuf *m = m0; + struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt; int hlen = sizeof (struct ip); - int len = 0, off, error = 0; + int len = 0, error = 0; struct sockaddr_in *dst = NULL; - struct in_ifaddr *ia = NULL; + struct in_ifaddr *ia = NULL, *src_ia = NULL; int isbroadcast, sw_csum; struct in_addr pkt_dst; + struct ipf_pktopts *ippo = NULL, ipf_pktopts; #if IPSEC - struct route iproute; + struct ipsec_output_state ipsec_state; + struct route *ipsec_saved_route = NULL; struct socket *so = NULL; struct secpolicy *sp = NULL; #endif #if IPFIREWALL_FORWARD int fwd_rewrite_src = 0; #endif +#if IPFIREWALL + int off; + struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; +#endif +#if IPFIREWALL || DUMMYNET struct ip_fw_args args; + struct m_tag *tag; +#endif int didfilter = 0; ipfilter_t inject_filter_ref = 0; - struct m_tag *tag; +#if DUMMYNET struct route saved_route; + struct ip_out_args saved_ipoa; + struct sockaddr_in dst_buf; +#endif /* DUMMYNET */ struct mbuf * packetlist; - int pktcnt = 0; - + int pktcnt = 0, tso = 0; + u_int32_t bytecnt = 0; + unsigned int ifscope = IFSCOPE_NONE; + unsigned int nocell = 0; + boolean_t select_srcif, srcbound; + struct flowadv *adv = NULL; KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); +#if IPSEC + bzero(&ipsec_state, sizeof(ipsec_state)); +#endif /* IPSEC */ + packetlist = m0; - args.next_hop = NULL; -#if IPFIREWALL - args.eh = NULL; - args.rule = NULL; - args.divert_rule = 0; /* divert cookie */ - +#if IPFIREWALL || DUMMYNET + bzero(&args, sizeof(struct ip_fw_args)); + + if (SLIST_EMPTY(&m0->m_pkthdr.tags)) + goto ipfw_tags_done; + /* Grab info from mtags prepended to the chain */ #if DUMMYNET - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { struct dn_pkt_tag *dn_tag; - + dn_tag = (struct dn_pkt_tag *)(tag+1); - args.rule = dn_tag->rule; + args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; + args.fwa_pf_rule = dn_tag->dn_pf_rule; opt = NULL; - saved_route = dn_tag->ro; + saved_route = dn_tag->dn_ro; ro = &saved_route; - + imo = NULL; - dst = dn_tag->dn_dst; - ifp = dn_tag->ifp; - flags = dn_tag->flags; - + bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf)); + dst = &dst_buf; + ifp = dn_tag->dn_ifp; + flags = dn_tag->dn_flags; + if ((dn_tag->dn_flags & IP_OUTARGS)) { + saved_ipoa = dn_tag->dn_ipoa; + ipoa = &saved_ipoa; + } + m_tag_delete(m0, tag); } #endif /* DUMMYNET */ #if IPDIVERT - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { struct divert_tag *div_tag; - + div_tag = (struct divert_tag *)(tag+1); - args.divert_rule = div_tag->cookie; + args.fwa_divert_rule = div_tag->cookie; m_tag_delete(m0, tag); } #endif /* IPDIVERT */ -#endif /* IPFIREWALL */ - if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { +#if IPFIREWALL + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; - + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); - args.next_hop = ipfwd_tag->next_hop; - + next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; + m_tag_delete(m0, tag); } +#endif /* IPFIREWALL */ + +ipfw_tags_done: +#endif /* IPFIREWALL || DUMMYNET */ m = m0; - + #if DIAGNOSTIC if ( !m || (m->m_flags & M_PKTHDR) != 0) panic("ip_output no HDR"); @@ -318,25 +384,74 @@ ip_output_list( mtod(m, struct ip *)->ip_p); #endif -#if IPFIREWALL - if (args.rule != NULL) { /* dummynet already saw us */ - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; - lck_mtx_lock(rt_mtx); - if (ro->ro_rt != NULL) - ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; - if (ia) - ifaref(&ia->ia_ifa); - lck_mtx_unlock(rt_mtx); + bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); + ippo = &ipf_pktopts; + + if (ip_doscopedroute && (flags & IP_OUTARGS)) { + /* + * In the forwarding case, only the ifscope value is used, + * as source interface selection doesn't take place. + */ + if ((select_srcif = (!(flags & IP_FORWARDING) && + (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) { + ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF; + } + + if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) && + ipoa->ipoa_boundif != IFSCOPE_NONE) { + ifscope = ipoa->ipoa_boundif; + ipf_pktopts.ippo_flags |= + (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE)); + } + + if ((srcbound = (ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR))) + ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR; + } else { + select_srcif = FALSE; + srcbound = FALSE; + ifscope = IFSCOPE_NONE; + } + + if ((flags & IP_OUTARGS) && (ipoa->ipoa_flags & IPOAF_NO_CELLULAR)) { + nocell = 1; + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } + + if (flags & IP_OUTARGS) { + adv = &ipoa->ipoa_flowadv; + adv->code = FADV_SUCCESS; + } + +#if DUMMYNET + if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) { + /* dummynet already saw us */ + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + pkt_dst = ip->ip_dst; + if (ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } + RT_UNLOCK(ro->ro_rt); + } #if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + so = ipsec_getsocket(m); + (void)ipsec_setsocket(m, NULL); } -#endif - goto sendit; +#endif /* IPSEC */ +#if IPFIREWALL + if (args.fwa_ipfw_rule != NULL) + goto skip_ipsec; +#endif /* #if IPFIREWALL */ + if (args.fwa_pf_rule != NULL) + goto sendit; } -#endif /* IPFIREWALL */ +#endif /* DUMMYNET */ #if IPSEC if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { @@ -349,14 +464,45 @@ loopit: * No need to proccess packet twice if we've * already seen it */ - inject_filter_ref = ipf_get_inject_filter(m); + if (!SLIST_EMPTY(&m->m_pkthdr.tags)) + inject_filter_ref = ipf_get_inject_filter(m); + else + inject_filter_ref = 0; if (opt) { m = ip_insertoptions(m, opt, &len); hlen = len; + /* Update the chain */ + if (m != m0) { + if (m0 == packetlist) + packetlist = m; + m0 = m; + } } ip = mtod(m, struct ip *); - pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; +#if IPFIREWALL + /* + * rdar://8542331 + * + * When dealing with a packet chain, we need to reset "next_hop" because + * "dst" may have been changed to the gateway address below for the previous + * packet of the chain. This could cause the route to be inavertandly changed + * to the route to the gateway address (instead of the route to the destination). + */ + args.fwa_next_hop = next_hop_from_ipfwd_tag; + pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst; +#else + pkt_dst = ip->ip_dst; +#endif + + /* + * We must not send if the packet is destined to network zero. + * RFC1122 3.2.1.3 (a) and (b). + */ + if (IN_ZERONET(ntohl(pkt_dst.s_addr))) { + error = EHOSTUNREACH; + goto bad; + } /* * Fill in IP header. @@ -369,11 +515,11 @@ loopit: #else ip->ip_id = htons(ip_id++); #endif - OSAddAtomic(1, (SInt32*)&ipstat.ips_localout); + OSAddAtomic(1, &ipstat.ips_localout); } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } - + #if DEBUG /* For debugging, we let the stack forge congestion */ if (forge_ce != 0 && @@ -386,8 +532,8 @@ loopit: KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - - dst = (struct sockaddr_in *)&ro->ro_dst; + + dst = (struct sockaddr_in *)(void *)&ro->ro_dst; /* * If there is a cached route, @@ -397,23 +543,36 @@ loopit: * cache with IPv6. */ - lck_mtx_lock(rt_mtx); if (ro->ro_rt != NULL) { if (ro->ro_rt->generation_id != route_generation && ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && - (ip->ip_src.s_addr != INADDR_ANY) && - (ifa_foraddr(ip->ip_src.s_addr) == 0)) { - error = EADDRNOTAVAIL; - lck_mtx_unlock(rt_mtx); - goto bad; + (ip->ip_src.s_addr != INADDR_ANY)) { + src_ia = ifa_foraddr(ip->ip_src.s_addr); + if (src_ia == NULL) { + error = EADDRNOTAVAIL; + goto bad; + } + IFA_REMREF(&src_ia->ia_ifa); } + /* + * Test rt_flags without holding rt_lock for performance + * reasons; if the route is down it will hopefully be + * caught by the layer below (since it uses this route + * as a hint) or during the next transmit. + */ if ((ro->ro_rt->rt_flags & RTF_UP) == 0 || dst->sin_family != AF_INET || dst->sin_addr.s_addr != pkt_dst.s_addr) { - rtfree_locked(ro->ro_rt); + rtfree(ro->ro_rt); ro->ro_rt = NULL; } - if (ro->ro_rt && ro->ro_rt->generation_id != route_generation) + /* + * If we're doing source interface selection, we may not + * want to use this route; only synch up the generation + * count otherwise. + */ + if (!select_srcif && ro->ro_rt != NULL && + ro->ro_rt->generation_id != route_generation) ro->ro_rt->generation_id = route_generation; } if (ro->ro_rt == NULL) { @@ -426,38 +585,99 @@ loopit: * If routing to interface only, * short circuit routing lookup. */ -#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) -#define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) { if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { - OSAddAtomic(1, (SInt32*)&ipstat.ips_noroute); + OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; - lck_mtx_unlock(rt_mtx); goto bad; } } ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && + imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. + */ + isbroadcast = 0; + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + + /* Macro takes reference on ia */ + IFP_TO_IA(ifp, ia); } else { + boolean_t cloneok = FALSE; + /* + * Perform source interface selection; the source IP address + * must belong to one of the addresses of the interface used + * by the route. For performance reasons, do this only if + * there is no route, or if the routing table has changed, + * or if we haven't done source interface selection on this + * route (for this PCB instance) before. + */ + if (select_srcif && ip->ip_src.s_addr != INADDR_ANY && + (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) || + ro->ro_rt->generation_id != route_generation || + !(ro->ro_flags & ROF_SRCIF_SELECTED))) { + struct ifaddr *ifa; -#if CONFIG_FORCE_OUT_IFP - /* Check if this packet should be forced out a specific interface */ - if (ro->ro_rt == 0 && pdp_ifp != NULL) { - pdp_context_route_locked(pdp_ifp, ro); - - if (ro->ro_rt == NULL) { - OSAddAtomic(1, (UInt32*)&ipstat.ips_noroute); - error = EHOSTUNREACH; - lck_mtx_unlock(rt_mtx); + /* Find the source interface */ + ifa = in_selectsrcif(ip, ro, ifscope); + + /* + * If the source address belongs to a cellular interface + * and the caller forbids our using interfaces of such + * type, pretend that there is no source address. + */ + if (nocell && ifa != NULL && + ifa->ifa_ifp->if_type == IFT_CELLULAR) { + IFA_REMREF(ifa); + error = EADDRNOTAVAIL; + goto bad; + } + + /* + * If the source address is spoofed (in the case of + * IP_RAWOUTPUT on an unbounded socket), or if this + * is destined for local/loopback, just let it go out + * using the interface of the route. Otherwise, + * there's no interface having such an address, + * so bail out. + */ + if (ifa == NULL && (!(flags & IP_RAWOUTPUT) || + srcbound) && ifscope != lo_ifp->if_index) { + error = EADDRNOTAVAIL; goto bad; } + + /* + * If the caller didn't explicitly specify the scope, + * pick it up from the source interface. If the cached + * route was wrong and was blown away as part of source + * interface selection, don't mask out RTF_PRCLONING + * since that route may have been allocated by the ULP, + * unless the IP header was created by the caller or + * the destination is IPv4 LLA. The check for the + * latter is needed because IPv4 LLAs are never scoped + * in the current implementation, and we don't want to + * replace the resolved IPv4 LLA route with one whose + * gateway points to that of the default gateway on + * the primary interface of the system. + */ + if (ifa != NULL) { + if (ifscope == IFSCOPE_NONE) + ifscope = ifa->ifa_ifp->if_index; + IFA_REMREF(ifa); + cloneok = (!(flags & IP_RAWOUTPUT) && + !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); + } } -#endif - + /* * If this is the case, we probably don't want to allocate * a protocol-cloned route since we didn't get one from the @@ -467,8 +687,7 @@ loopit: * the link layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ - - if (ro->ro_rt == 0) { + if (ro->ro_rt == NULL) { unsigned long ign = RTF_PRCLONING; /* * We make an exception here: if the destination @@ -481,40 +700,84 @@ loopit: * that allocate a route and those that don't. The * RTF_BROADCAST route is important since we'd want * to send out undirected IP broadcast packets using - * link-level broadcast address. + * link-level broadcast address. Another exception + * is for ULP-created routes that got blown away by + * source interface selection (see above). * - * This exception will no longer be necessary when + * These exceptions will no longer be necessary when * the RTF_PRCLONING scheme is no longer present. */ - if (dst->sin_addr.s_addr == INADDR_BROADCAST) + if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) ign &= ~RTF_PRCLONING; - rtalloc_ign_locked(ro, ign); + /* + * Loosen the route lookup criteria if the ifscope + * corresponds to the loopback interface; this is + * needed to support Application Layer Gateways + * listening on loopback, in conjunction with packet + * filter redirection rules. The final source IP + * address will be rewritten by the packet filter + * prior to the RFC1122 loopback check below. + */ + if (ifscope == lo_ifp->if_index) + rtalloc_ign(ro, ign); + else + rtalloc_scoped_ign(ro, ign, ifscope); + + /* + * If the route points to a cellular interface and the + * caller forbids our using interfaces of such type, + * pretend that there is no route. + */ + if (nocell && ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + if (ro->ro_rt->rt_ifp->if_type == + IFT_CELLULAR) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } else { + RT_UNLOCK(ro->ro_rt); + } + } } - if (ro->ro_rt == 0) { - OSAddAtomic(1, (SInt32*)&ipstat.ips_noroute); + + if (ro->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; - lck_mtx_unlock(rt_mtx); goto bad; } - + if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; - if (ro->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; - if (ro->ro_rt->rt_flags & RTF_HOST) + if (ro->ro_rt->rt_flags & RTF_GATEWAY) { + dst = (struct sockaddr_in *)(void *) + ro->ro_rt->rt_gateway; + } + if (ro->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); - else + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + RT_UNLOCK(ro->ro_rt); } - lck_mtx_unlock(rt_mtx); + if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; + u_int32_t vif; + u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; + u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; m->m_flags |= M_MCAST; /* @@ -522,29 +785,35 @@ loopit: * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ - dst = (struct sockaddr_in *)&ro->ro_dst; + dst = (struct sockaddr_in *)(void *)&ro->ro_dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl; - if (imo->imo_multicast_ifp != NULL) { + IMO_LOCK(imo); + vif = imo->imo_multicast_vif; + ttl = imo->imo_multicast_ttl; + loop = imo->imo_multicast_loop; + if ((flags & IP_RAWOUTPUT) == 0) + ip->ip_ttl = ttl; + if (imo->imo_multicast_ifp != NULL) ifp = imo->imo_multicast_ifp; - } + IMO_UNLOCK(imo); #if MROUTING - if (imo->imo_multicast_vif != -1 && - ((flags & IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY)) - ip->ip_src.s_addr = - ip_mcast_src(imo->imo_multicast_vif); + if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 || + ip->ip_src.s_addr == INADDR_ANY)) + ip->ip_src.s_addr = ip_mcast_src(vif); #endif /* MROUTING */ - } else - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + } else if ((flags & IP_RAWOUTPUT) == 0) { + vif = -1; + ip->ip_ttl = ttl; + } /* * Confirm that the outgoing interface supports multicast. */ - if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if (imo == NULL || vif == -1) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { - OSAddAtomic(1, (SInt32*)&ipstat.ips_noroute); + OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; } @@ -554,26 +823,28 @@ loopit: * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { - register struct in_ifaddr *ia1; - lck_mtx_lock(rt_mtx); - TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) + struct in_ifaddr *ia1; + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia1->ia_ifa); if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; - + IFA_UNLOCK(&ia1->ia_ifa); break; } - lck_mtx_unlock(rt_mtx); + IFA_UNLOCK(&ia1->ia_ifa); + } + lck_rw_done(in_ifaddr_rwlock); if (ip->ip_src.s_addr == INADDR_ANY) { error = ENETUNREACH; goto bad; } } - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(pkt_dst, ifp, inm); - ifnet_lock_done(ifp); - if (inm != NULL && - (imo == NULL || imo->imo_multicast_loop)) { + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&pkt_dst, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL && (imo == NULL || loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not @@ -582,21 +853,23 @@ loopit: if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - struct ipf_pktopts *ippo = 0, ipf_pktopts; - if (imo) { - ippo = &ipf_pktopts; - ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp; - ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl; - ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop; + if (imo != NULL) { + ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS; + ipf_pktopts.ippo_mcast_ifnet = ifp; + ipf_pktopts.ippo_mcast_ttl = ttl; + ipf_pktopts.ippo_mcast_loop = loop; } - + ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { if ((struct ipfilter *)inject_filter_ref == filter) @@ -606,20 +879,25 @@ loopit: result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); + INM_REMREF(inm); goto done; } if (result != 0) { ipf_unref(); + INM_REMREF(inm); goto bad; } } } - + /* set back to host byte order */ ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); - +#endif + ipf_unref(); didfilter = 1; } @@ -647,15 +925,19 @@ loopit: * as prescribed by rsvpd. */ if (!rsvp_on) - imo = NULL; + imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); + if (inm != NULL) + INM_REMREF(inm); + OSAddAtomic(1, &ipstat.ips_cantforward); goto done; } } } #endif /* MROUTING */ - + if (inm != NULL) + INM_REMREF(inm); /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. @@ -671,13 +953,14 @@ loopit: goto sendit; } -#ifndef notdef /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { + IFA_LOCK_SPIN(&ia->ia_ifa); ip->ip_src = IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); #if IPFIREWALL_FORWARD /* Keep note that we did this - if the firewall changes * the next-hop, our interface may change, changing the @@ -687,7 +970,6 @@ loopit: fwd_rewrite_src++; #endif /* IPFIREWALL_FORWARD */ } -#endif /* notdef */ /* * Look for broadcast address and @@ -714,6 +996,50 @@ loopit: } sendit: +#if PF + /* Invoke outbound packet filter */ + if (PF_IS_ENABLED) { + int rc; + + m0 = m; /* Save for later */ +#if DUMMYNET + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; + if (flags & IP_OUTARGS) + args.fwa_ipoa = ipoa; + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args); +#else /* DUMMYNET */ + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL); +#endif /* DUMMYNET */ + if (rc != 0 || m == NULL) { + /* Move to the next packet */ + m = *mppn; + + /* Skip ahead if first packet in list got dropped */ + if (packetlist == m0) + packetlist = m; + + if (m != NULL) { + m0 = m; + /* Next packet in the chain */ + goto loopit; + } else if (packetlist != NULL) { + /* No more packet; send down the chain */ + goto sendchain; + } + /* Nothing left; we're done */ + goto done; + } + m0 = m; + ip = mtod(m, struct ip *); + pkt_dst = ip->ip_dst; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + } +#endif /* PF */ /* * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt */ @@ -721,27 +1047,40 @@ sendit: ip_linklocal_stat.iplls_out_total++; if (ip->ip_ttl != MAXTTL) { ip_linklocal_stat.iplls_out_badttl++; - ip->ip_ttl = MAXTTL; + ip->ip_ttl = MAXTTL; } } if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + + /* Check that a TSO frame isn't passed to a filter. + * This could happen if a filter is inserted while + * TCP is sending the TSO packet. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; + } + ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { if ((struct ipfilter *)inject_filter_ref == filter) seen = 1; } else if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -752,12 +1091,15 @@ sendit: } } } - + /* set back to host byte order */ ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); - +#endif + ipf_unref(); } @@ -777,7 +1119,7 @@ sendit: sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); if (sp == NULL) { - IPSEC_STAT_INCREMENT(ipsecstat.out_inval); + IPSEC_STAT_INCREMENT(ipsecstat.out_inval); KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); goto bad; } @@ -800,7 +1142,7 @@ sendit: /* no need to do IPsec. */ KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); goto skip_ipsec; - + case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ @@ -815,15 +1157,12 @@ sendit: printf("ip_output: Invalid policy found. %d\n", sp->policy); } { - struct ipsec_output_state state; - bzero(&state, sizeof(state)); - state.m = m; + ipsec_state.m = m; if (flags & IP_ROUTETOIF) { - state.ro = &iproute; - bzero(&iproute, sizeof(iproute)); + bzero(&ipsec_state.ro, sizeof(ipsec_state.ro)); } else - state.ro = ro; - state.dst = (struct sockaddr *)dst; + route_copyout(&ipsec_state.ro, ro, sizeof(ipsec_state.ro)); + ipsec_state.dst = (struct sockaddr *)dst; ip->ip_sum = 0; @@ -836,26 +1175,35 @@ sendit: m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); + + error = ipsec4_output(&ipsec_state, sp, flags); + + m0 = m = ipsec_state.m; - error = ipsec4_output(&state, sp, flags); - - m0 = m = state.m; - if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore * IP_ROUTETOIF. */ - if (state.ro != &iproute || state.ro->ro_rt != NULL) { + if (ipsec_state.tunneled) { flags &= ~IP_ROUTETOIF; - ro = state.ro; + ipsec_saved_route = ro; + ro = &ipsec_state.ro; } - } else - ro = state.ro; - - dst = (struct sockaddr_in *)state.dst; + } else { + ipsec_saved_route = ro; + ro = &ipsec_state.ro; + } + dst = (struct sockaddr_in *)(void *)ipsec_state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output. */ m0 = NULL; @@ -881,64 +1229,87 @@ sendit: /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); - + #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif /* Check that there wasn't a route change and src is still valid */ - - lck_mtx_lock(rt_mtx); - if (ro->ro_rt && ro->ro_rt->generation_id != route_generation) { - if (ifa_foraddr(ip->ip_src.s_addr) == 0 && ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { - error = EADDRNOTAVAIL; - lck_mtx_unlock(rt_mtx); - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 5,0,0,0,0); + if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) { + if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL && + ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { + error = EADDRNOTAVAIL; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 5,0,0,0,0); goto bad; } - rtfree_locked(ro->ro_rt); + rtfree(ro->ro_rt); ro->ro_rt = NULL; + if (src_ia != NULL) + IFA_REMREF(&src_ia->ia_ifa); } if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { - printf("ip_output: " - "can't update route after IPsec processing\n"); - error = EHOSTUNREACH; /*XXX*/ - lck_mtx_unlock(rt_mtx); - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 6,0,0,0,0); + printf("ip_output: can't update route after " + "IPsec processing\n"); + error = EHOSTUNREACH; /*XXX*/ + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 6,0,0,0,0); goto bad; } } else { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; + RT_UNLOCK(ro->ro_rt); } - lck_mtx_unlock(rt_mtx); /* make it flipped, again. */ + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); +#endif + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); - + /* Pass to filters again */ if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; - + + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + + /* Check that a TSO frame isn't passed to a filter. + * This could happen if a filter is inserted while + * TCP is sending the TSO packet. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; + } + ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - +#endif + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -949,47 +1320,34 @@ sendit: } } } - + /* set back to host byte order */ ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); - +#endif + ipf_unref(); } skip_ipsec: #endif /*IPSEC*/ #if IPFIREWALL - /* - * IpHack's section. - * - Xlate: translate packet's addr/port (NAT). - * - Firewall: deny/allow/etc. - * - Wrap: fake packet's addr/port - * - Encapsulate: put it in another IP and send out. - */ - if (fr_checkp) { - struct mbuf *m1 = m; - - if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) { - goto done; - } - ip = mtod(m0 = m = m1, struct ip *); - } - /* * Check with the firewall... * but not if we are already being fwd'd from a firewall. */ - if (fw_enable && IPFW_LOADED && !args.next_hop) { + if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) { struct sockaddr_in *old = dst; - args.m = m; - args.next_hop = dst; - args.oif = ifp; + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; off = ip_fw_chk_ptr(&args); - m = args.m; - dst = args.next_hop; + m = args.fwa_m; + dst = args.fwa_next_hop; /* * On return we must do the following: @@ -1013,28 +1371,30 @@ skip_ipsec: goto done ; } ip = mtod(m, struct ip *); - + if (off == 0 && dst == old) {/* common case */ goto pass ; } #if DUMMYNET - if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { - /* - * pass the pkt to dummynet. Need to include - * pipe number, m, ifp, ro, dst because these are - * not recomputed in the next pass. - * All other parameters have been already used and - * so they are not needed anymore. - * XXX note: if the ifp or ro entry are deleted - * while a pkt is in dummynet, we are in trouble! - */ - args.ro = ro; - args.dst = dst; - args.flags = flags; - - error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, - &args); - goto done; + if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! + */ + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; + if (flags & IP_OUTARGS) + args.fwa_ipoa = ipoa; + + error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, + &args, DN_CLIENT_IPFW); + goto done; } #endif /* DUMMYNET */ #if IPDIVERT @@ -1055,11 +1415,14 @@ skip_ipsec: } /* Restore packet header fields to original values */ + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif /* Deliver packet to divert input routine */ - divert_packet(m, 0, off & 0xffff, args.divert_rule); + divert_packet(m, 0, off & 0xffff, args.fwa_divert_rule); /* If 'tee', continue with original packet */ if (clone != NULL) { @@ -1105,35 +1468,42 @@ skip_ipsec: * as the packet runs through ip_input() as * it is done through a ISR. */ + lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) { /* * If the addr to forward to is one * of ours, we pretend to * be the destination for this packet. */ + IFA_LOCK_SPIN(&ia_fw->ia_ifa); if (IA_SIN(ia_fw)->sin_addr.s_addr == - dst->sin_addr.s_addr) + dst->sin_addr.s_addr) { + IFA_UNLOCK(&ia_fw->ia_ifa); break; + } + IFA_UNLOCK(&ia_fw->ia_ifa); } - if (ia) { + lck_rw_done(in_ifaddr_rwlock); + if (ia_fw) { /* tell ip_input "dont filter" */ struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof(struct sockaddr_in), M_NOWAIT); + + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, + sizeof (*ipfwd_tag), M_NOWAIT, m); if (fwd_tag == NULL) { error = ENOBUFS; goto bad; } - + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); - ipfwd_tag->next_hop = args.next_hop; + ipfwd_tag->next_hop = args.fwa_next_hop; m_tag_prepend(m, fwd_tag); if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = ifunit("lo0"); + m->m_pkthdr.rcvif = lo_ifp; if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & m->m_pkthdr.csum_flags) == 0) { if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { @@ -1150,15 +1520,18 @@ skip_ipsec: m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; ip->ip_sum = in_cksum(m, hlen); } + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - - +#endif + /* we need to call dlil_output to run filters * and resync to avoid recursion loops. */ if (lo_ifp) { - dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0); + dlil_output(lo_ifp, PF_INET, m, 0, + (struct sockaddr *)dst, 0, adv); } else { printf("ip_output: no loopback ifp for forwarding!!!\n"); @@ -1173,39 +1546,52 @@ skip_ipsec: */ bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); - ro_fwd->ro_rt = 0; - lck_mtx_lock(rt_mtx); - rtalloc_ign_locked(ro_fwd, RTF_PRCLONING); + ro_fwd->ro_rt = NULL; + rtalloc_ign(ro_fwd, RTF_PRCLONING); - if (ro_fwd->ro_rt == 0) { - OSAddAtomic(1, (SInt32*)&ipstat.ips_noroute); + if (ro_fwd->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; - lck_mtx_unlock(rt_mtx); goto bad; } + RT_LOCK_SPIN(ro_fwd->ro_rt); ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); + if (ia_fw != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); + IFA_ADDREF(&ia_fw->ia_ifa); + } ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; - if (ro_fwd->ro_rt->rt_flags & RTF_HOST) + dst = (struct sockaddr_in *)(void *)ro_fwd->ro_rt->rt_gateway; + if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); - else + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); isbroadcast = in_broadcast(dst->sin_addr, ifp); - rtfree_locked(ro->ro_rt); + } + RT_UNLOCK(ro_fwd->ro_rt); + rtfree(ro->ro_rt); ro->ro_rt = ro_fwd->ro_rt; - dst = (struct sockaddr_in *)&ro_fwd->ro_dst; - lck_mtx_unlock(rt_mtx); + dst = (struct sockaddr_in *)(void *)&ro_fwd->ro_dst; /* * If we added a default src ip earlier, * which would have been gotten from the-then * interface, do it again, from the new one. */ - if (fwd_rewrite_src) - ip->ip_src = IA_SIN(ia_fw)->sin_addr; + if (ia_fw != NULL) { + if (fwd_rewrite_src) { + IFA_LOCK_SPIN(&ia_fw->ia_ifa); + ip->ip_src = IA_SIN(ia_fw)->sin_addr; + IFA_UNLOCK(&ia_fw->ia_ifa); + } + IFA_REMREF(&ia_fw->ia_ifa); + } goto pass ; } #endif /* IPFIREWALL_FORWARD */ @@ -1217,15 +1603,15 @@ skip_ipsec: error = EACCES; /* not sure this is the right error msg */ goto done; } -#endif /* IPFIREWALL */ pass: +#endif /* IPFIREWALL */ #if __APPLE__ /* Do not allow loopback address to wind up on a wire */ if ((ifp->if_flags & IFF_LOOPBACK) == 0 && ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { - OSAddAtomic(1, (SInt32*)&ipstat.ips_badaddr); + OSAddAtomic(1, &ipstat.ips_badaddr); m_freem(m); /* * Do not simply drop the packet just like a firewall -- we want the @@ -1239,6 +1625,8 @@ pass: } #endif m->m_pkthdr.csum_flags |= CSUM_IP; + tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4); + sw_csum = m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); @@ -1253,12 +1641,11 @@ pass: /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */ u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */ u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF; - m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ + m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ m->m_pkthdr.csum_data = (csumprev + offset) << 16 ; m->m_pkthdr.csum_data += offset; - sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ - } - else { + sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ + } else { /* let the software handle any UDP or TCP checksums */ sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); } @@ -1266,7 +1653,7 @@ pass: sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) & m->m_pkthdr.csum_flags; } - + if (sw_csum & CSUM_DELAY_DATA) { in_delayed_cksum(m); sw_csum &= ~CSUM_DELAY_DATA; @@ -1284,17 +1671,22 @@ pass: * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ - if ((u_short)ip->ip_len <= ifp->if_mtu || + if ((u_short)ip->ip_len <= ifp->if_mtu || tso || ifp->if_hwassist & CSUM_FRAGMENT) { - struct rtentry *rte; + if (tso) + m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); } - + #ifndef __APPLE__ /* Record statistics for this interface address. */ if (!(flags & IP_FORWARDING) && ia != NULL) { @@ -1309,33 +1701,31 @@ pass: ipsec_delaux(m); #endif if (packetchain == 0) { - lck_mtx_lock(rt_mtx); - if ((rte = ro->ro_rt) != NULL) - rtref(rte); - lck_mtx_unlock(rt_mtx); - error = ifnet_output(ifp, PF_INET, m, rte, - (struct sockaddr *)dst); - if (rte != NULL) - rtfree(rte); + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst, 0, adv); goto done; } else { /* packet chaining allows us to reuse the route for all packets */ + bytecnt += m->m_pkthdr.len; + mppn = &m->m_nextpkt; m = m->m_nextpkt; if (m == NULL) { +#if PF +sendchain: +#endif /* PF */ if (pktcnt > ip_maxchainsent) ip_maxchainsent = pktcnt; - lck_mtx_lock(rt_mtx); - if ((rte = ro->ro_rt) != NULL) - rtref(rte); - lck_mtx_unlock(rt_mtx); + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0); //send - error = ifnet_output(ifp, PF_INET, packetlist, - rte, (struct sockaddr *)dst); - if (rte != NULL) - rtfree(rte); + error = dlil_output(ifp, PF_INET, packetlist, + ro->ro_rt, (struct sockaddr *)dst, 0, adv); pktcnt = 0; + bytecnt = 0; goto done; - + } m0 = m; pktcnt++; @@ -1346,7 +1736,9 @@ pass: * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ - if (ip->ip_off & IP_DF) { + + if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) || + pktcnt > 0) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU @@ -1355,23 +1747,105 @@ pass: * them, there is no way for one to update all its * routes when the MTU is changed. */ - - lck_mtx_lock(rt_mtx); - if (ro->ro_rt && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) - && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) - && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { - ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; - } - lck_mtx_unlock(rt_mtx); - OSAddAtomic(1, (SInt32*)&ipstat.ips_cantfrag); + if (ro->ro_rt) { + RT_LOCK_SPIN(ro->ro_rt); + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + RT_UNLOCK(ro->ro_rt); + } + if (pktcnt > 0) { + m0 = packetlist; + } + OSAddAtomic(1, &ipstat.ips_cantfrag); goto bad; } - len = (ifp->if_mtu - hlen) &~ 7; - if (len < 8) { - error = EMSGSIZE; + + error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum); + if (error != 0) { + m0 = m = NULL; goto bad; } + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; +#if IPSEC + /* clean ipsec history once it goes out of the node */ + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) + ipsec_delaux(m); +#endif + if (error == 0) { +#ifndef __APPLE__ + /* Record statistics for this interface address. */ + if (ia != NULL) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } +#endif + if ((packetchain != 0) && (pktcnt > 0)) + panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst, 0, adv); + } else + m_freem(m); + } + + if (error == 0) + OSAddAtomic(1, &ipstat.ips_fragmented); + +done: + if (ia) { + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + if (ipsec_state.ro.ro_rt) + rtfree(ipsec_state.ro.ro_rt); + if (sp != NULL) { + KEYDEBUG(KEYDEBUG_IPSEC_STAMP, + printf("DP ip_output call free SP:%x\n", sp)); + key_freesp(sp, KEY_SADB_UNLOCKED); + } + } +#endif /* IPSEC */ + + KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); + return (error); +bad: + m_freem(m0); + goto done; +} + +int +ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) +{ + struct ip *ip, *mhip; + int len, hlen, mhlen, firstlen, off, error = 0; + struct mbuf **mnext = &m->m_nextpkt, *m0; + int nfrags = 1; + + ip = mtod(m, struct ip *); +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + + firstlen = len = (mtu - hlen) &~ 7; + if (len < 8) { + m_freem(m); + return (EMSGSIZE); + } + /* * if the interface will not calculate checksums on * fragmented packets, then do it here. @@ -1382,12 +1856,6 @@ pass: m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } - - { - int mhlen, firstlen = len; - struct mbuf **mnext = &m->m_nextpkt; - int nfrags = 1; - /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. @@ -1398,7 +1866,7 @@ pass: MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (m == 0) { error = ENOBUFS; - OSAddAtomic(1, (SInt32*)&ipstat.ips_odropped); + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; @@ -1422,17 +1890,25 @@ pass: if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; /* ??? */ - OSAddAtomic(1, (SInt32*)&ipstat.ips_odropped); + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = 0; m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; + + M_COPY_PFTAG(m, m0); + m_set_service_class(m, m0->m_pkthdr.svc); + #if CONFIG_MACF_NET mac_netinet_fragment(m0, m); #endif + +#if BYTE_ORDER != BIG_ENDIAN HTONS(mhip->ip_off); +#endif + mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { mhip->ip_sum = in_cksum(m, mhlen); @@ -1441,7 +1917,7 @@ pass: mnext = &m->m_nextpkt; nfrags++; } - OSAddAtomic(nfrags, (SInt32*)&ipstat.ips_ofragments); + OSAddAtomic(nfrags, &ipstat.ips_ofragments); /* set first/last markers for fragment chain */ m->m_flags |= M_LASTFRAG; @@ -1457,74 +1933,20 @@ pass: m->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m->m_pkthdr.len); ip->ip_off |= IP_MF; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); } sendorfree: + if (error) + m_freem_list(m0); - KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, - ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - - for (m = m0; m; m = m0) { - m0 = m->m_nextpkt; - m->m_nextpkt = 0; -#if IPSEC - /* clean ipsec history once it goes out of the node */ - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) - ipsec_delaux(m); -#endif - if (error == 0) { - struct rtentry *rte; -#ifndef __APPLE__ - /* Record statistics for this interface address. */ - if (ia != NULL) { - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; - } -#endif - if ((packetchain != 0) && (pktcnt > 0)) - panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); - lck_mtx_lock(rt_mtx); - if ((rte = ro->ro_rt) != NULL) - rtref(rte); - lck_mtx_unlock(rt_mtx); - error = ifnet_output(ifp, PF_INET, m, rte, - (struct sockaddr *)dst); - if (rte != NULL) - rtfree(rte); - } else - m_freem(m); - } - - if (error == 0) - OSAddAtomic(1, (SInt32*)&ipstat.ips_fragmented); - } -done: - if (ia) { - ifafree(&ia->ia_ifa); - ia = NULL; - } -#if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - if (ro == &iproute && ro->ro_rt) { - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - } - if (sp != NULL) { - KEYDEBUG(KEYDEBUG_IPSEC_STAMP, - printf("DP ip_output call free SP:%x\n", sp)); - key_freesp(sp, KEY_SADB_UNLOCKED); - } - } -#endif /* IPSEC */ - - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); return (error); -bad: - m_freem(m0); - goto done; } static void @@ -1549,36 +1971,44 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) struct ip *ip; unsigned char buf[sizeof(struct ip)]; u_short csum, offset, ip_len; - struct mbuf *m = m0; - + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf *m = m0; + int ip_offset_copy = ip_offset; + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; if (m == NULL) { - printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n"); + printf("in_delayed_cksum_withoffset failed - " + "ip_offset wasn't in the packet\n"); return; } } - - /* Sometimes the IP header is not contiguous, yes this can happen! */ - if (ip_offset + sizeof(struct ip) > m->m_len) { -#if DEBUG - printf("delayed m_pullup, m->len: %ld off: %d\n", + + /* + * In case the IP header is not contiguous, or not 32-bit + * aligned, copy it to a local buffer. + */ + if ((ip_offset + sizeof(struct ip) > m->m_len) || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { +#if DEBUG + printf("delayed m_pullup, m->len: %d off: %d\n", m->m_len, ip_offset); #endif m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); - - ip = (struct ip *)buf; + + ip = (struct ip *)(void *)buf; } else { - ip = (struct ip*)(m->m_data + ip_offset); + ip = (struct ip*)(void *)(m->m_data + ip_offset); } - + /* Gross */ if (ip_offset) { m->m_len -= ip_offset; m->m_data += ip_offset; } - + offset = IP_VHL_HL(ip->ip_vhl) << 2 ; /* @@ -1591,12 +2021,12 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) * is bogus and we give up. */ ip_len = ip->ip_len; - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { ip_len = SWAP16(ip_len); - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { printf("in_delayed_cksum_offset: ip_len %d (%d) " "doesn't match actual length %d\n", ip->ip_len, - ip_len, (m0->m_pkthdr.len - ip_offset)); + ip_len, (m0->m_pkthdr.len - ip_offset_copy)); return; } } @@ -1624,15 +2054,18 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) /* Insert the checksum in the existing chain */ if (offset + ip_offset + sizeof(u_short) > m->m_len) { char tmp[2]; - + #if DEBUG - printf("delayed m_copyback, m->len: %ld off: %d p: %d\n", + printf("delayed m_copyback, m->len: %d off: %d p: %d\n", m->m_len, offset + ip_offset, ip->ip_p); #endif - *(u_short *)tmp = csum; + *(u_short *)(void *)tmp = csum; m_copyback(m, offset + ip_offset, 2, tmp); - } else - *(u_short *)(m->m_data + offset + ip_offset) = csum; + } else if (IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { + *(u_short *)(void *)(m->m_data + offset + ip_offset) = csum; + } else { + bcopy(&csum, (m->m_data + offset + ip_offset), sizeof (csum)); + } } void @@ -1648,33 +2081,42 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) int hlen = 0; unsigned char buf[sizeof(struct ip)]; int swapped = 0; - + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf* m0 = m; + size_t ip_offset_copy = ip_offset; + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; if (m == NULL) { - printf("in_cksum_offset failed - ip_offset wasn't in the packet\n"); + printf("in_cksum_offset failed - ip_offset wasn't " + "in the packet\n"); return; } } - - /* Sometimes the IP header is not contiguous, yes this can happen! */ - if (ip_offset + sizeof(struct ip) > m->m_len) { + /* + * In case the IP header is not contiguous, or not 32-bit + * aligned, copy it to a local buffer. + */ + if ((ip_offset + sizeof(struct ip) > m->m_len) || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { #if DEBUG - printf("in_cksum_offset - delayed m_pullup, m->len: %ld off: %lu\n", - m->m_len, ip_offset); -#endif + printf("in_cksum_offset - delayed m_pullup, m->len: %d " + "off: %lu\n", m->m_len, ip_offset); +#endif m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); - ip = (struct ip *)buf; + ip = (struct ip *)(void *)buf; ip->ip_sum = 0; - m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, (caddr_t)&ip->ip_sum); + m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, + (caddr_t)&ip->ip_sum); } else { - ip = (struct ip*)(m->m_data + ip_offset); + ip = (struct ip*)(void *)(m->m_data + ip_offset); ip->ip_sum = 0; } - + /* Gross */ if (ip_offset) { m->m_len -= ip_offset; @@ -1695,15 +2137,15 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) * the length and check again. If it still fails, then the packet * is bogus and we give up. */ - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); swapped = 1; - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); printf("in_cksum_offset: ip_len %d (%d) " "doesn't match actual length %lu\n", ip->ip_len, SWAP16(ip->ip_len), - (m->m_pkthdr.len - ip_offset)); + (m0->m_pkthdr.len - ip_offset_copy)); return; } } @@ -1721,16 +2163,25 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) m->m_data -= ip_offset; } - /* Insert the checksum in the existing chain if IP header not contiguous */ + /* + * Insert the checksum in the existing chain if IP header not + * contiguous, or if it's not 32-bit aligned, i.e. all the cases + * where it was copied to a local buffer. + */ if (ip_offset + sizeof(struct ip) > m->m_len) { char tmp[2]; #if DEBUG - printf("in_cksum_offset m_copyback, m->len: %lu off: %lu p: %d\n", - m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); + printf("in_cksum_offset m_copyback, m->len: %u off: %lu " + "p: %d\n", m->m_len, + ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); #endif - *(u_short *)tmp = ip->ip_sum; + *(u_short *)(void *)tmp = ip->ip_sum; m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp); + } else if (!IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) { + bcopy(&ip->ip_sum, + (m->m_data + ip_offset + offsetof(struct ip, ip_sum)), + sizeof (u_short)); } } @@ -1862,7 +2313,8 @@ ip_ctloutput(so, sopt) error = EMSGSIZE; break; } - MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_HEADER); + MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT, + MT_HEADER); if (m == 0) { error = ENOBUFS; break; @@ -1884,9 +2336,7 @@ ip_ctloutput(so, sopt) case IP_RECVDSTADDR: case IP_RECVIF: case IP_RECVTTL: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif + case IP_RECVPKTINFO: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1926,77 +2376,99 @@ ip_ctloutput(so, sopt) OPTSET(INP_RECVTTL); break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - OPTSET(INP_FAITH); + case IP_RECVPKTINFO: + OPTSET(INP_PKTINFO); break; -#endif } break; #undef OPTSET -#if CONFIG_FORCE_OUT_IFP +#if CONFIG_FORCE_OUT_IFP + /* + * Apple private interface, similar to IP_BOUND_IF, except + * that the parameter is a NULL-terminated string containing + * the name of the network interface; an emptry string means + * unbind. Applications are encouraged to use IP_BOUND_IF + * instead, as that is the current "official" API. + */ case IP_FORCE_OUT_IFP: { - char ifname[IFNAMSIZ]; - ifnet_t ifp; - + char ifname[IFNAMSIZ]; + unsigned int ifscope; + + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + /* Verify interface name parameter is sane */ if (sopt->sopt_valsize > sizeof(ifname)) { error = EINVAL; break; } - + /* Copy the interface name */ if (sopt->sopt_valsize != 0) { - error = sooptcopyin(sopt, ifname, sizeof(ifname), sopt->sopt_valsize); + error = sooptcopyin(sopt, ifname, + sizeof (ifname), sopt->sopt_valsize); if (error) break; } - - if (sopt->sopt_valsize == 0 || ifname[0] == 0) { - // Set pdp_ifp to NULL - inp->pdp_ifp = NULL; - - // Flush the route - if (inp->inp_route.ro_rt) { - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; + + if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { + /* Unbind this socket from any interface */ + ifscope = IFSCOPE_NONE; + } else { + ifnet_t ifp; + + /* Verify name is NULL terminated */ + if (ifname[sopt->sopt_valsize - 1] != '\0') { + error = EINVAL; + break; } - - break; - } - - /* Verify name is NULL terminated */ - if (ifname[sopt->sopt_valsize - 1] != 0) { - error = EINVAL; - break; - } - - if (ifnet_find_by_name(ifname, &ifp) != 0) { - error = ENXIO; - break; - } - - /* Won't actually free. Since we don't release this later, we should do it now. */ - ifnet_release(ifp); - - /* This only works for point-to-point interfaces */ - if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { - error = ENOTSUP; - break; + + /* Bail out if given bogus interface name */ + if (ifnet_find_by_name(ifname, &ifp) != 0) { + error = ENXIO; + break; + } + + /* Bind this socket to this interface */ + ifscope = ifp->if_index; + + /* + * Won't actually free; since we don't release + * this later, we should do it now. + */ + ifnet_release(ifp); } - - inp->pdp_ifp = ifp; + error = inp_bindif(inp, ifscope); } break; #endif + /* + * Multicast socket options are processed by the in_mcast + * module. + */ case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: - error = ip_setmoptions(sopt, &inp->inp_moptions); + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: @@ -2036,16 +2508,11 @@ ip_ctloutput(so, sopt) struct mbuf *m; int optname; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; - priv = (sopt->sopt_p != NULL && - proc_suser(sopt->sopt_p) != 0) ? 0 : 1; + priv = (proc_suser(sopt->sopt_p) == 0); if (m) { req = mtod(m, caddr_t); len = m->m_len; @@ -2065,15 +2532,73 @@ ip_ctloutput(so, sopt) if (error) break; - if (background) - so->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND; - else - so->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BACKGROUND; + if (background) { + socket_set_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); + } else { + socket_clear_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); + } break; } #endif /* TRAFFIC_MGT */ + /* + * On a multihomed system, scoped routing can be used to + * restrict the source interface used for sending packets. + * The socket option IP_BOUND_IF binds a particular AF_INET + * socket to an interface such that data sent on the socket + * is restricted to that interface. This is unlike the + * SO_DONTROUTE option where the routing table is bypassed; + * therefore it allows for a greater flexibility and control + * over the system behavior, and does not place any restriction + * on the destination address type (e.g. unicast, multicast, + * or broadcast if applicable) or whether or not the host is + * directly reachable. Note that in the multicast transmit + * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over + * IP_BOUND_IF, since the former practically bypasses the + * routing table; in this case, IP_BOUND_IF sets the default + * interface used for sending multicast packets in the absence + * of an explicit multicast transmit interface. + */ + case IP_BOUND_IF: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + error = inp_bindif(inp, optval); + break; + + case IP_NO_IFT_CELLULAR: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + error = inp_nocellular(inp, optval); + break; + + case IP_OUT_IF: + /* This option is not settable */ + error = EINVAL; + break; + default: error = ENOPROTOOPT; break; @@ -2101,9 +2626,7 @@ ip_ctloutput(so, sopt) case IP_RECVIF: case IP_RECVTTL: case IP_PORTRANGE: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif + case IP_RECVPKTINFO: switch (sopt->sopt_name) { case IP_TOS: @@ -2145,22 +2668,20 @@ ip_ctloutput(so, sopt) optval = 0; break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - optval = OPTBIT(INP_FAITH); + case IP_RECVPKTINFO: + optval = OPTBIT(INP_PKTINFO); break; -#endif } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - error = ip_getmoptions(sopt, inp->inp_moptions); + case IP_MSFILTER: + error = inp_getmoptions(inp, sopt); break; #if IPSEC @@ -2186,12 +2707,29 @@ ip_ctloutput(so, sopt) #if TRAFFIC_MGT case IP_TRAFFIC_MGT_BACKGROUND: { - unsigned background = so->so_traffic_mgt_flags; + unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); return (sooptcopyout(sopt, &background, sizeof(background))); break; } #endif /* TRAFFIC_MGT */ + case IP_BOUND_IF: + if (inp->inp_flags & INP_BOUND_IF) + optval = inp->inp_boundifp->if_index; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_NO_IFT_CELLULAR: + optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_OUT_IF: + optval = (inp->inp_last_outifp != NULL) ? + inp->inp_last_outifp->if_index : 0; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -2306,466 +2844,138 @@ bad: return (EINVAL); } -/* - * XXX - * The whole multicast option thing needs to be re-thought. - * Several of these options are equally applicable to non-multicast - * transmission, and one (IP_MULTICAST_TTL) totally duplicates a - * standard option (IP_TTL). - */ - -/* - * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. - */ -static struct ifnet * -ip_multicast_if(a, ifindexp) - struct in_addr *a; - int *ifindexp; +void +ip_moptions_init(void) { - int ifindex; - struct ifnet *ifp; + PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug)); - if (ifindexp) - *ifindexp = 0; - if (ntohl(a->s_addr) >> 24 == 0) { - ifindex = ntohl(a->s_addr) & 0xffffff; - ifnet_head_lock_shared(); - if (ifindex < 0 || if_index < ifindex) { - ifnet_head_done(); - return NULL; - } - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - if (ifindexp) - *ifindexp = ifindex; - } else { - INADDR_TO_IFP(*a, ifp); + imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) : + sizeof (struct ip_moptions_dbg); + + imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0, + IMO_ZONE_NAME); + if (imo_zone == NULL) { + panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME); + /* NOTREACHED */ } - return ifp; + zone_change(imo_zone, Z_EXPAND, TRUE); } -/* - * Set the IP multicast options in response to user setsockopt(). - */ -static int -ip_setmoptions(sopt, imop) - struct sockopt *sopt; - struct ip_moptions **imop; +void +imo_addref(struct ip_moptions *imo, int locked) { - int error = 0; - int i; - struct in_addr addr; - struct ip_mreq mreq; - struct ifnet *ifp = NULL; - struct ip_moptions *imo = *imop; - int ifindex; + if (!locked) + IMO_LOCK(imo); + else + IMO_LOCK_ASSERT_HELD(imo); - if (imo == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. - */ - error = ip_createmoptions(imop); - if (error != 0) - return error; - imo = *imop; + if (++imo->imo_refcnt == 0) { + panic("%s: imo %p wraparound refcnt\n", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, TRUE); } - switch (sopt->sopt_name) { - /* store an index number for the vif you wanna use in the send */ -#if MROUTING - case IP_MULTICAST_VIF: - if (legal_vif_num == 0) { - error = EOPNOTSUPP; - break; - } - error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - if (error) - break; - if (!legal_vif_num(i) && (i != -1)) { - error = EINVAL; - break; - } - imo->imo_multicast_vif = i; - break; -#endif /* MROUTING */ - - case IP_MULTICAST_IF: - /* - * Select the interface for outgoing multicast packets. - */ - error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); - if (error) - break; - /* - * INADDR_ANY is used to remove a previous selection. - * When no interface is selected, a default one is - * chosen every time a multicast packet is sent. - */ - if (addr.s_addr == INADDR_ANY) { - imo->imo_multicast_ifp = NULL; - break; - } - /* - * The selected interface is identified by its local - * IP address. Find the interface and confirm that - * it supports multicasting. - */ - ifp = ip_multicast_if(&addr, &ifindex); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - imo->imo_multicast_ifp = ifp; - if (ifindex) - imo->imo_multicast_addr = addr; - else - imo->imo_multicast_addr.s_addr = INADDR_ANY; - break; - - case IP_MULTICAST_TTL: - /* - * Set the IP time-to-live for outgoing multicast packets. - * The original multicast API required a char argument, - * which is inconsistent with the rest of the socket API. - * We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char ttl; - error = sooptcopyin(sopt, &ttl, 1, 1); - if (error) - break; - imo->imo_multicast_ttl = ttl; - } else { - u_int ttl; - error = sooptcopyin(sopt, &ttl, sizeof ttl, - sizeof ttl); - if (error) - break; - if (ttl > 255) - error = EINVAL; - else - imo->imo_multicast_ttl = ttl; - } - break; - - case IP_MULTICAST_LOOP: - /* - * Set the loopback flag for outgoing multicast packets. - * Must be zero or one. The original multicast API required a - * char argument, which is inconsistent with the rest - * of the socket API. We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char loop; - error = sooptcopyin(sopt, &loop, 1, 1); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } else { - u_int loop; - error = sooptcopyin(sopt, &loop, sizeof loop, - sizeof loop); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } - break; - - case IP_ADD_MEMBERSHIP: - /* - * Add a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_addmembership(imo, &mreq); - break; + if (!locked) + IMO_UNLOCK(imo); +} - case IP_DROP_MEMBERSHIP: - /* - * Drop a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_dropmembership(imo, &mreq); - break; +void +imo_remref(struct ip_moptions *imo) +{ + int i; - default: - error = EOPNOTSUPP; - break; + IMO_LOCK(imo); + if (imo->imo_refcnt == 0) { + panic("%s: imo %p negative refcnt", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, FALSE); } - /* - * If all options have default values, no need to keep the mbuf. - */ - if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == (u_long)-1 && - imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && - imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && - imo->imo_num_memberships == 0) { - FREE(*imop, M_IPMOPTS); - *imop = NULL; + --imo->imo_refcnt; + if (imo->imo_refcnt > 0) { + IMO_UNLOCK(imo); + return; } - return (error); -} + for (i = 0; i < imo->imo_num_memberships; ++i) { + struct in_mfilter *imf; -/* - * Set the IP multicast options in response to user setsockopt(). - */ -__private_extern__ int -ip_createmoptions( - struct ip_moptions **imop) -{ - struct ip_moptions *imo; - imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - - if (imo == NULL) - return (ENOBUFS); - *imop = imo; - imo->imo_multicast_ifp = NULL; - imo->imo_multicast_addr.s_addr = INADDR_ANY; - imo->imo_multicast_vif = -1; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - imo->imo_num_memberships = 0; - - return 0; -} + imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL; + if (imf != NULL) + imf_leave(imf); -/* - * Add membership to an IPv4 multicast. - */ -__private_extern__ int -ip_addmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - struct route ro; - struct sockaddr_in *dst; - struct ifnet *ifp = NULL; - int error = 0; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } - /* - * If no interface address was provided, use the interface of - * the route to the given multicast address. - */ - if (mreq->imr_interface.s_addr == INADDR_ANY) { - bzero((caddr_t)&ro, sizeof(ro)); - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_len = sizeof(*dst); - dst->sin_family = AF_INET; - dst->sin_addr = mreq->imr_multiaddr; - lck_mtx_lock(rt_mtx); - rtalloc_ign_locked(&ro, 0UL); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - rtfree_locked(ro.ro_rt); - } - else { - /* If there's no default route, try using loopback */ - mreq->imr_interface.s_addr = INADDR_LOOPBACK; - } - lck_mtx_unlock(rt_mtx); - } - - if (ifp == NULL) { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - } + (void) in_leavegroup(imo->imo_membership[i], imf); - /* - * See if we found an interface, and confirm that it - * supports multicast. - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - return error; - } - /* - * See if the membership already exists or if all the - * membership slots are full. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if (imo->imo_membership[i]->inm_ifp == ifp && - imo->imo_membership[i]->inm_addr.s_addr - == mreq->imr_multiaddr.s_addr) - break; + if (imf != NULL) + imf_purge(imf); + + INM_REMREF(imo->imo_membership[i]); + imo->imo_membership[i] = NULL; } - if (i < imo->imo_num_memberships) { - error = EADDRINUSE; - return error; + imo->imo_num_memberships = 0; + if (imo->imo_mfilters != NULL) { + FREE(imo->imo_mfilters, M_INMFILTER); + imo->imo_mfilters = NULL; } - if (i == IP_MAX_MEMBERSHIPS) { - error = ETOOMANYREFS; - return error; + if (imo->imo_membership != NULL) { + FREE(imo->imo_membership, M_IPMOPTS); + imo->imo_membership = NULL; } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - if ((imo->imo_membership[i] = - in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { - error = ENOBUFS; - return error; - } - ++imo->imo_num_memberships; - - return error; -} + IMO_UNLOCK(imo); -/* - * Drop membership of an IPv4 multicast. - */ -__private_extern__ int -ip_dropmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - int error = 0; - struct ifnet* ifp = NULL; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } + lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - */ - if (mreq->imr_interface.s_addr == INADDR_ANY) - ifp = NULL; - else { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - if (ifp == NULL) { - error = EADDRNOTAVAIL; - return error; - } + if (!(imo->imo_debug & IFD_ALLOC)) { + panic("%s: imo %p cannot be freed", __func__, imo); + /* NOTREACHED */ } - /* - * Find the membership in the membership array. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if ((ifp == NULL || - imo->imo_membership[i]->inm_ifp == ifp) && - imo->imo_membership[i]->inm_addr.s_addr == - mreq->imr_multiaddr.s_addr) - break; - } - if (i == imo->imo_num_memberships) { - error = EADDRNOTAVAIL; - return error; - } - /* - * Give up the multicast address record to which the - * membership points. - */ - in_delmulti(&imo->imo_membership[i]); - /* - * Remove the gap in the membership array. - */ - for (++i; i < imo->imo_num_memberships; ++i) - imo->imo_membership[i-1] = imo->imo_membership[i]; - --imo->imo_num_memberships; - - return error; + zfree(imo_zone, imo); } -/* - * Return the IP multicast options in response to user getsockopt(). - */ -static int -ip_getmoptions(sopt, imo) - struct sockopt *sopt; - register struct ip_moptions *imo; +static void +imo_trace(struct ip_moptions *imo, int refhold) { - struct in_addr addr; - struct in_ifaddr *ia; - int error, optval; - u_char coptval; - - error = 0; - switch (sopt->sopt_name) { -#if MROUTING - case IP_MULTICAST_VIF: - if (imo != NULL) - optval = imo->imo_multicast_vif; - else - optval = -1; - error = sooptcopyout(sopt, &optval, sizeof optval); - break; -#endif /* MROUTING */ - - case IP_MULTICAST_IF: - if (imo == NULL || imo->imo_multicast_ifp == NULL) - addr.s_addr = INADDR_ANY; - else if (imo->imo_multicast_addr.s_addr) { - /* return the value user has set */ - addr = imo->imo_multicast_addr; - } else { - IFP_TO_IA(imo->imo_multicast_ifp, ia); - addr.s_addr = (ia == NULL) ? INADDR_ANY - : IA_SIN(ia)->sin_addr.s_addr; - } - error = sooptcopyout(sopt, &addr, sizeof addr); - break; - - case IP_MULTICAST_TTL: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_TTL; - else - optval = coptval = imo->imo_multicast_ttl; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - case IP_MULTICAST_LOOP: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_LOOP; - else - optval = coptval = imo->imo_multicast_loop; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; - - default: - error = ENOPROTOOPT; - break; + struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(imo->imo_debug & IFD_DEBUG)) { + panic("%s: imo %p has no debug structure", __func__, imo); + /* NOTREACHED */ + } + if (refhold) { + cnt = &imo_dbg->imo_refhold_cnt; + tr = imo_dbg->imo_refhold; + } else { + cnt = &imo_dbg->imo_refrele_cnt; + tr = imo_dbg->imo_refrele; } - return (error); + + idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); } -/* - * Discard the IP multicast options. - */ -void -ip_freemoptions(imo) - register struct ip_moptions *imo; +struct ip_moptions * +ip_allocmoptions(int how) { - register int i; + struct ip_moptions *imo; + imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone); if (imo != NULL) { - for (i = 0; i < imo->imo_num_memberships; ++i) - in_delmulti(&imo->imo_membership[i]); - FREE(imo, M_IPMOPTS); + bzero(imo, imo_size); + lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr); + imo->imo_debug |= IFD_ALLOC; + if (imo_debug != 0) { + imo->imo_debug |= IFD_DEBUG; + imo->imo_trace = imo_trace; + } + IMO_ADDREF(imo); } + + return (imo); } /* @@ -2798,8 +3008,12 @@ ip_mloopback(ifp, m, dst, hlen) * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); /* @@ -2837,9 +3051,17 @@ ip_mloopback(ifp, m, dst, hlen) CSUM_IP_CHECKED | CSUM_IP_VALID; copym->m_pkthdr.csum_data = 0xffff; } else { + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); +#endif + in_delayed_cksum(copym); + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); +#endif + } } @@ -2858,9 +3080,284 @@ ip_mloopback(ifp, m, dst, hlen) if (lo_ifp) { copym->m_pkthdr.rcvif = ifp; dlil_output(lo_ifp, PF_INET, copym, 0, - (struct sockaddr *) dst, 0); + (struct sockaddr *) dst, 0, NULL); } else { printf("Warning: ip_output call to dlil_find_dltag failed!\n"); m_freem(copym); } } + +/* + * Given a source IP address (and route, if available), determine the best + * interface to send the packet from. Checking for (and updating) the + * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done + * without any locks based on the assumption that ip_output() is single- + * threaded per-pcb, i.e. for any given pcb there can only be one thread + * performing output at the IP layer. + * + * This routine is analogous to in6_selectroute() for IPv6. + */ +static struct ifaddr * +in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) +{ + struct ifaddr *ifa = NULL; + struct in_addr src = ip->ip_src; + struct in_addr dst = ip->ip_dst; + struct ifnet *rt_ifp; + char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN]; + + if (ip_select_srcif_debug) { + (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src)); + (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst)); + } + + if (ro->ro_rt != NULL) + RT_LOCK(ro->ro_rt); + + rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; + + /* + * Given the source IP address, find a suitable source interface + * to use for transmission; if the caller has specified a scope, + * optimize the search by looking at the addresses only for that + * interface. This is still suboptimal, however, as we need to + * traverse the per-interface list. + */ + if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) { + unsigned int scope = ifscope; + + /* + * If no scope is specified and the route is stale (pointing + * to a defunct interface) use the current primary interface; + * this happens when switching between interfaces configured + * with the same IP address. Otherwise pick up the scope + * information from the route; the ULP may have looked up a + * correct route and we just need to verify it here and mark + * it with the ROF_SRCIF_SELECTED flag below. + */ + if (scope == IFSCOPE_NONE) { + scope = rt_ifp->if_index; + if (scope != get_primary_ifscope(AF_INET) && + ro->ro_rt->generation_id != route_generation) + scope = get_primary_ifscope(AF_INET); + } + + ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope); + + if (ifa == NULL && ip->ip_p != IPPROTO_UDP && + ip->ip_p != IPPROTO_TCP && ipforwarding) { + /* + * If forwarding is enabled, and if the packet isn't + * TCP or UDP, check if the source address belongs + * to one of our own interfaces; if so, demote the + * interface scope and do a route lookup right below. + */ + ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); + if (ifa != NULL) { + IFA_REMREF(ifa); + ifa = NULL; + ifscope = IFSCOPE_NONE; + } + } + + if (ip_select_srcif_debug && ifa != NULL) { + if (ro->ro_rt != NULL) { + printf("%s->%s ifscope %d->%d ifa_if %s " + "ro_if %s\n", s_src, s_dst, ifscope, + scope, if_name(ifa->ifa_ifp), + if_name(rt_ifp)); + } else { + printf("%s->%s ifscope %d->%d ifa_if %s\n", + s_src, s_dst, ifscope, scope, + if_name(ifa->ifa_ifp)); + } + } + } + + /* + * Slow path; search for an interface having the corresponding source + * IP address if the scope was not specified by the caller, and: + * + * 1) There currently isn't any route, or, + * 2) The interface used by the route does not own that source + * IP address; in this case, the route will get blown away + * and we'll do a more specific scoped search using the newly + * found interface. + */ + if (ifa == NULL && ifscope == IFSCOPE_NONE) { + ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); + + /* + * If we have the IP address, but not the route, we don't + * really know whether or not it belongs to the correct + * interface (it could be shared across multiple interfaces.) + * The only way to find out is to do a route lookup. + */ + if (ifa != NULL && ro->ro_rt == NULL) { + struct rtentry *rt; + struct sockaddr_in sin; + struct ifaddr *oifa = NULL; + + bzero(&sin, sizeof (sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof (sin); + sin.sin_addr = dst; + + lck_mtx_lock(rnh_lock); + if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL, + rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) { + RT_LOCK(rt); + /* + * If the route uses a different interface, + * use that one instead. The IP address of + * the ifaddr that we pick up here is not + * relevant. + */ + if (ifa->ifa_ifp != rt->rt_ifp) { + oifa = ifa; + ifa = rt->rt_ifa; + IFA_ADDREF(ifa); + RT_UNLOCK(rt); + } else { + RT_UNLOCK(rt); + } + rtfree_locked(rt); + } + lck_mtx_unlock(rnh_lock); + + if (oifa != NULL) { + struct ifaddr *iifa; + + /* + * See if the interface pointed to by the + * route is configured with the source IP + * address of the packet. + */ + iifa = (struct ifaddr *)ifa_foraddr_scoped( + src.s_addr, ifa->ifa_ifp->if_index); + + if (iifa != NULL) { + /* + * Found it; drop the original one + * as well as the route interface + * address, and use this instead. + */ + IFA_REMREF(oifa); + IFA_REMREF(ifa); + ifa = iifa; + } else if (!ipforwarding || + (rt->rt_flags & RTF_GATEWAY)) { + /* + * This interface doesn't have that + * source IP address; drop the route + * interface address and just use the + * original one, and let the caller + * do a scoped route lookup. + */ + IFA_REMREF(ifa); + ifa = oifa; + } else { + /* + * Forwarding is enabled and the source + * address belongs to one of our own + * interfaces which isn't the outgoing + * interface, and we have a route, and + * the destination is on a network that + * is directly attached (onlink); drop + * the original one and use the route + * interface address instead. + */ + IFA_REMREF(oifa); + } + } + } else if (ifa != NULL && ro->ro_rt != NULL && + !(ro->ro_rt->rt_flags & RTF_GATEWAY) && + ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) { + /* + * Forwarding is enabled and the source address belongs + * to one of our own interfaces which isn't the same + * as the interface used by the known route; drop the + * original one and use the route interface address. + */ + IFA_REMREF(ifa); + ifa = ro->ro_rt->rt_ifa; + IFA_ADDREF(ifa); + } + + if (ip_select_srcif_debug && ifa != NULL) { + printf("%s->%s ifscope %d ifa_if %s\n", + s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); + } + } + + if (ro->ro_rt != NULL) + RT_LOCK_ASSERT_HELD(ro->ro_rt); + /* + * If there is a non-loopback route with the wrong interface, or if + * there is no interface configured with such an address, blow it + * away. Except for local/loopback, we look for one with a matching + * interface scope/index. + */ + if (ro->ro_rt != NULL && + (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) || + !(ro->ro_rt->rt_flags & RTF_UP))) { + if (ip_select_srcif_debug) { + if (ifa != NULL) { + printf("%s->%s ifscope %d ro_if %s != " + "ifa_if %s (cached route cleared)\n", + s_src, s_dst, ifscope, if_name(rt_ifp), + if_name(ifa->ifa_ifp)); + } else { + printf("%s->%s ifscope %d ro_if %s " + "(no ifa_if found)\n", + s_src, s_dst, ifscope, if_name(rt_ifp)); + } + } + + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + ro->ro_flags &= ~ROF_SRCIF_SELECTED; + + /* + * If the destination is IPv4 LLA and the route's interface + * doesn't match the source interface, then the source IP + * address is wrong; it most likely belongs to the primary + * interface associated with the IPv4 LL subnet. Drop the + * packet rather than letting it go out and return an error + * to the ULP. This actually applies not only to IPv4 LL + * but other shared subnets; for now we explicitly test only + * for the former case and save the latter for future. + */ + if (IN_LINKLOCAL(ntohl(dst.s_addr)) && + !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) { + IFA_REMREF(ifa); + ifa = NULL; + } + } + + if (ip_select_srcif_debug && ifa == NULL) { + printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n", + s_src, s_dst, ifscope); + } + + /* + * If there is a route, mark it accordingly. If there isn't one, + * we'll get here again during the next transmit (possibly with a + * route) and the flag will get set at that point. For IPv4 LLA + * destination, mark it only if the route has been fully resolved; + * otherwise we want to come back here again when the route points + * to the interface over which the ARP reply arrives on. + */ + if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) || + (ro->ro_rt->rt_gateway->sa_family == AF_LINK && + SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) { + ro->ro_flags |= ROF_SRCIF_SELECTED; + ro->ro_rt->generation_id = route_generation; + } + + if (ro->ro_rt != NULL) + RT_UNLOCK(ro->ro_rt); + + return (ifa); +}