X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/ebb1b9f42b62218f29061826217bb0f71cd375a6..490019cf9519204c5fb36b2fba54ceb983bb6b72:/bsd/netinet/ip_output.c diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index c4530e994..2788b0e79 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -58,7 +58,6 @@ * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 - * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $ */ /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce @@ -67,7 +66,7 @@ * Version 2.0. */ -#define _IP_VHL +#define _IP_VHL #include #include @@ -80,9 +79,14 @@ #include #include #include +#include #include #include +#include + +#include +#include #include #include @@ -90,6 +94,8 @@ #include #include #include +#include +#include #include #include @@ -97,25 +103,16 @@ #include #include #include - #include #if CONFIG_MACF_NET #include -#endif - -#include "faith.h" - -#include -#include -#include - -#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) -#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) -#define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1) -#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1) +#endif /* CONFIG_MACF_NET */ -#define SWAP16(v) ((((v) & 0xff) << 8) | ((v) >> 8)) +#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) +#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) +#define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1) +#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1) #if IPSEC #include @@ -123,13 +120,20 @@ #if IPSEC_DEBUG #include #else -#define KEYDEBUG(lev,arg) +#define KEYDEBUG(lev, arg) #endif -#endif /*IPSEC*/ +#endif /* IPSEC */ +#if NECP +#include +#endif /* NECP */ + +#if IPFIREWALL #include +#if IPDIVERT #include -#include +#endif /* IPDIVERT */ +#endif /* IPFIREWALL */ #if DUMMYNET #include @@ -139,53 +143,68 @@ #include #endif /* PF */ -#if IPFIREWALL_FORWARD_DEBUG -#define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\ - (ntohl(a.s_addr)>>16)&0xFF,\ - (ntohl(a.s_addr)>>8)&0xFF,\ - (ntohl(a.s_addr))&0xFF); -#endif - +#if IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG +#define print_ip(a) \ + printf("%ld.%ld.%ld.%ld", (ntohl(a.s_addr) >> 24) & 0xFF, \ + (ntohl(a.s_addr) >> 16) & 0xFF, \ + (ntohl(a.s_addr) >> 8) & 0xFF, \ + (ntohl(a.s_addr)) & 0xFF); +#endif /* IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG */ u_short ip_id; -static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); -static void ip_mloopback(struct ifnet *, struct mbuf *, - struct sockaddr_in *, int); -static int ip_pcbopts(int, struct mbuf **, struct mbuf *); -static void imo_trace(struct ip_moptions *, int); - +static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS; +static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS; +static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS; static void ip_out_cksum_stats(int, u_int32_t); +static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); +static int ip_optcopy(struct ip *, struct ip *); +static int ip_pcbopts(int, struct mbuf **, struct mbuf *); +static void imo_trace(struct ip_moptions *, int); +static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *, + struct sockaddr_in *, int); static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); -int ip_optcopy(struct ip *, struct ip *); -void in_delayed_cksum_offset(struct mbuf *, int ); -void in_cksum_offset(struct mbuf* , size_t ); - -extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); - -extern struct protosw inetsw[]; - extern struct ip_linklocal_stat ip_linklocal_stat; -extern lck_mtx_t *ip_mutex; /* temporary: for testing */ #if IPSEC extern int ipsec_bypass; #endif -static int ip_maxchainsent = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED, - &ip_maxchainsent, 0, "use dlil_output_list"); +static int ip_maxchainsent = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, + "use dlil_output_list"); #if DEBUG static int forge_ce = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED, - &forge_ce, 0, "Forge ECN CE"); +SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, + CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0, + "Forge ECN CE"); #endif /* DEBUG */ static int ip_select_srcif_debug = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED, - &ip_select_srcif_debug, 0, "log source interface selection debug info"); +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0, + "log source interface selection debug info"); + +static int ip_output_measure = 0; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I", + "Do time measurement"); + +static uint64_t ip_output_measure_bins = 0; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0, + sysctl_ip_output_measure_bins, "I", + "bins for chaining performance data histogram"); + +static net_perf_t net_perf; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_ip_output_getperf, "S,net_perf", + "IP output performance data (struct net_perf, net/net_perf.h)"); #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ @@ -226,104 +245,123 @@ static struct zone *imo_zone; /* zone for ip_moptions */ * The mbuf opt, if present, will not be freed. */ int -ip_output( - struct mbuf *m0, - struct mbuf *opt, - struct route *ro, - int flags, - struct ip_moptions *imo, - struct ip_out_args *ipoa) +ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, + struct ip_moptions *imo, struct ip_out_args *ipoa) { - int error; - error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa); - return error; + return (ip_output_list(m0, 0, opt, ro, flags, imo, ipoa)); } /* - * Returns: 0 Success - * ENOMEM - * EADDRNOTAVAIL - * ENETUNREACH - * EHOSTUNREACH - * EACCES - * EMSGSIZE - * ENOBUFS - * ipsec4_getpolicybyaddr:??? [IPSEC 4th argument, contents modified] - * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified] - * key_spdacquire:??? [IPSEC] - * ipsec4_output:??? [IPSEC] - * :??? [firewall] - * ip_dn_io_ptr:??? [dummynet] - * dlil_output:??? [DLIL] - * dlil_output_list:??? [DLIL] + * IP output. The packet in mbuf chain m contains a skeletal IP + * header (with len, off, ttl, proto, tos, src, dst). + * The mbuf chain containing the packet will be freed. + * The mbuf opt, if present, will not be freed. + * + * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be + * skipped and ro->ro_rt would be used. Otherwise the result of route + * lookup is stored in ro->ro_rt. * - * Notes: The ipsec4_getpolicyby{addr|sock} function error returns are - * only used as the error return from this function where one of - * these functions fails to return a policy. + * In the IP forwarding case, the packet will arrive with options already + * inserted, so must have a NULL opt pointer. */ int -ip_output_list( - struct mbuf *m0, - int packetchain, - struct mbuf *opt, - struct route *ro, - int flags, - struct ip_moptions *imo, - struct ip_out_args *ipoa - ) +ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, + struct route *ro, int flags, struct ip_moptions *imo, + struct ip_out_args *ipoa) { struct ip *ip; - struct ifnet *ifp = NULL; - struct mbuf *m = m0, **mppn = NULL; + struct ifnet *ifp = NULL; /* not refcnt'd */ + struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt; int hlen = sizeof (struct ip); int len = 0, error = 0; struct sockaddr_in *dst = NULL; struct in_ifaddr *ia = NULL, *src_ia = NULL; - int isbroadcast, sw_csum; struct in_addr pkt_dst; - struct ipf_pktopts *ippo = NULL, ipf_pktopts; + struct ipf_pktopts *ippo = NULL; + ipfilter_t inject_filter_ref = NULL; + struct mbuf *packetlist; + uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0; + uint32_t packets_processed = 0; + unsigned int ifscope = IFSCOPE_NONE; + struct flowadv *adv = NULL; + struct timeval start_tv; #if IPSEC - struct ipsec_output_state ipsec_state; - struct route *ipsec_saved_route = NULL; struct socket *so = NULL; struct secpolicy *sp = NULL; -#endif -#if IPFIREWALL_FORWARD - int fwd_rewrite_src = 0; -#endif +#endif /* IPSEC */ +#if NECP + necp_kernel_policy_result necp_result = 0; + necp_kernel_policy_result_parameter necp_result_parameter; + necp_kernel_policy_id necp_matched_policy_id = 0; +#endif /* NECP */ #if IPFIREWALL - int off; - struct ip_fw_args args; - struct m_tag *tag; + int ipfwoff; struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; -#endif - int didfilter = 0; - ipfilter_t inject_filter_ref = 0; +#endif /* IPFIREWALL */ +#if IPFIREWALL || DUMMYNET + struct m_tag *tag; +#endif /* IPFIREWALL || DUMMYNET */ #if DUMMYNET - struct route saved_route; struct ip_out_args saved_ipoa; struct sockaddr_in dst_buf; #endif /* DUMMYNET */ - struct mbuf * packetlist; - int pktcnt = 0, tso = 0; - u_int32_t bytecnt = 0; - unsigned int ifscope; - unsigned int nocell; - boolean_t select_srcif; - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); - + struct { #if IPSEC - bzero(&ipsec_state, sizeof(ipsec_state)); + struct ipsec_output_state ipsec_state; #endif /* IPSEC */ +#if NECP + struct route necp_route; +#endif /* NECP */ +#if IPFIREWALL || DUMMYNET + struct ip_fw_args args; +#endif /* IPFIREWALL || DUMMYNET */ +#if IPFIREWALL_FORWARD + struct route sro_fwd; +#endif /* IPFIREWALL_FORWARD */ +#if DUMMYNET + struct route saved_route; +#endif /* DUMMYNET */ + struct ipf_pktopts ipf_pktopts; + } ipobz; +#define ipsec_state ipobz.ipsec_state +#define necp_route ipobz.necp_route +#define args ipobz.args +#define sro_fwd ipobz.sro_fwd +#define saved_route ipobz.saved_route +#define ipf_pktopts ipobz.ipf_pktopts + union { + struct { + boolean_t select_srcif : 1; /* set once */ + boolean_t srcbound : 1; /* set once */ + boolean_t nocell : 1; /* set once */ + boolean_t isbroadcast : 1; + boolean_t didfilter : 1; + boolean_t noexpensive : 1; /* set once */ + boolean_t awdl_unrestricted : 1; /* set once */ +#if IPFIREWALL_FORWARD + boolean_t fwd_rewrite_src : 1; +#endif /* IPFIREWALL_FORWARD */ + }; + uint32_t raw; + } ipobf = { .raw = 0 }; + +#define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \ + (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \ + ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \ + (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp))) + + if (ip_output_measure) + net_perf_start_time(&net_perf, &start_tv); + KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); + VERIFY(m0->m_flags & M_PKTHDR); packetlist = m0; -#if IPFIREWALL - args.next_hop = NULL; - args.eh = NULL; - args.rule = NULL; - args.divert_rule = 0; /* divert cookie */ - args.ipoa = NULL; + /* zero out {ipsec_state, args, sro_fwd, saved_route, ipf_pktops} */ + bzero(&ipobz, sizeof (ipobz)); + ippo = &ipf_pktopts; + +#if IPFIREWALL || DUMMYNET if (SLIST_EMPTY(&m0->m_pkthdr.tags)) goto ipfw_tags_done; @@ -334,18 +372,21 @@ ip_output_list( struct dn_pkt_tag *dn_tag; dn_tag = (struct dn_pkt_tag *)(tag+1); - args.rule = dn_tag->rule; + args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; + args.fwa_pf_rule = dn_tag->dn_pf_rule; opt = NULL; - saved_route = dn_tag->ro; + saved_route = dn_tag->dn_ro; ro = &saved_route; imo = NULL; - bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf)); + bcopy(&dn_tag->dn_dst, &dst_buf, sizeof (dst_buf)); dst = &dst_buf; - ifp = dn_tag->ifp; - flags = dn_tag->flags; - saved_ipoa = dn_tag->ipoa; - ipoa = &saved_ipoa; + ifp = dn_tag->dn_ifp; + flags = dn_tag->dn_flags; + if ((dn_tag->dn_flags & IP_OUTARGS)) { + saved_ipoa = dn_tag->dn_ipoa; + ipoa = &saved_ipoa; + } m_tag_delete(m0, tag); } @@ -357,65 +398,108 @@ ip_output_list( struct divert_tag *div_tag; div_tag = (struct divert_tag *)(tag+1); - args.divert_rule = div_tag->cookie; + args.fwa_divert_rule = div_tag->cookie; m_tag_delete(m0, tag); } #endif /* IPDIVERT */ +#if IPFIREWALL if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; ipfwd_tag = (struct ip_fwd_tag *)(tag+1); next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; - + m_tag_delete(m0, tag); } -ipfw_tags_done: #endif /* IPFIREWALL */ - m = m0; +ipfw_tags_done: +#endif /* IPFIREWALL || DUMMYNET */ -#if DIAGNOSTIC - if ( !m || (m->m_flags & M_PKTHDR) != 0) - panic("ip_output no HDR"); - if (!ro) - panic("ip_output no route, proto = %d", - mtod(m, struct ip *)->ip_p); -#endif + m = m0; + m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP|PKTF_IFAINFO); - bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); - ippo = &ipf_pktopts; +#if IPSEC + if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { + /* If packet is bound to an interface, check bound policies */ + if ((flags & IP_OUTARGS) && (ipoa != NULL) && + (ipoa->ipoa_flags & IPOAF_BOUND_IF) && + ipoa->ipoa_boundif != IFSCOPE_NONE) { + if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND, + &flags, ipoa, &sp) != 0) + goto bad; + } + } +#endif /* IPSEC */ + + VERIFY(ro != NULL); - /* - * At present the IP_OUTARGS flag implies a request for IP to - * perform source interface selection. In the forwarding case, - * only the ifscope value is used, as source interface selection - * doesn't take place. - */ if (ip_doscopedroute && (flags & IP_OUTARGS)) { - select_srcif = !(flags & IP_FORWARDING); - ifscope = ipoa->ipoa_boundif; - ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; - ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); + /* + * In the forwarding case, only the ifscope value is used, + * as source interface selection doesn't take place. + */ + if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) && + (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) { + ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF; + } + + if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) && + ipoa->ipoa_boundif != IFSCOPE_NONE) { + ifscope = ipoa->ipoa_boundif; + ipf_pktopts.ippo_flags |= + (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE)); + } + + /* double negation needed for bool bit field */ + ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR); + if (ipobf.srcbound) + ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR; } else { - select_srcif = FALSE; + ipobf.select_srcif = FALSE; + ipobf.srcbound = FALSE; ifscope = IFSCOPE_NONE; + if (flags & IP_OUTARGS) { + ipoa->ipoa_boundif = IFSCOPE_NONE; + ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF | + IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR); + } } if (flags & IP_OUTARGS) { - nocell = ipoa->ipoa_nocell; - if (nocell) + if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) { + ipobf.nocell = TRUE; ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; - } else { - nocell = 0; + } + if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) { + ipobf.noexpensive = TRUE; + ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE; + } + if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) + ipobf.awdl_unrestricted = TRUE; + adv = &ipoa->ipoa_flowadv; + adv->code = FADV_SUCCESS; + ipoa->ipoa_retflags = 0; + } + +#if IPSEC + if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { + so = ipsec_getsocket(m); + if (so != NULL) { + (void) ipsec_setsocket(m, NULL); + } } +#endif /* IPSEC */ -#if IPFIREWALL - if (args.rule != NULL) { /* dummynet already saw us */ +#if DUMMYNET + if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) { + /* dummynet already saw us */ ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + pkt_dst = ip->ip_dst; if (ro->ro_rt != NULL) { RT_LOCK_SPIN(ro->ro_rt); ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; @@ -426,51 +510,60 @@ ipfw_tags_done: } RT_UNLOCK(ro->ro_rt); } -#if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); - } -#endif - goto sendit; - } -#endif /* IPFIREWALL */ -#if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - so = ipsec_getsocket(m); - (void)ipsec_setsocket(m, NULL); +#if IPFIREWALL + if (args.fwa_ipfw_rule != NULL) + goto skip_ipsec; +#endif /* IPFIREWALL */ + if (args.fwa_pf_rule != NULL) + goto sendit; } -#endif -loopit: +#endif /* DUMMYNET */ + +loopit: + packets_processed++; + ipobf.isbroadcast = FALSE; + ipobf.didfilter = FALSE; +#if IPFIREWALL_FORWARD + ipobf.fwd_rewrite_src = FALSE; +#endif /* IPFIREWALL_FORWARD */ + + VERIFY(m->m_flags & M_PKTHDR); /* - * No need to proccess packet twice if we've - * already seen it + * No need to proccess packet twice if we've already seen it. */ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) inject_filter_ref = ipf_get_inject_filter(m); else - inject_filter_ref = 0; + inject_filter_ref = NULL; if (opt) { m = ip_insertoptions(m, opt, &len); hlen = len; + /* Update the chain */ + if (m != m0) { + if (m0 == packetlist) + packetlist = m; + m0 = m; + } } ip = mtod(m, struct ip *); + #if IPFIREWALL /* * rdar://8542331 * - * When dealing with a packet chain, we need to reset "next_hop" because - * "dst" may have been changed to the gateway address below for the previous - * packet of the chain. This could cause the route to be inavertandly changed - * to the route to the gateway address (instead of the route to the destination). + * When dealing with a packet chain, we need to reset "next_hop" + * because "dst" may have been changed to the gateway address below + * for the previous packet of the chain. This could cause the route + * to be inavertandly changed to the route to the gateway address + * (instead of the route to the destination). */ - args.next_hop = next_hop_from_ipfwd_tag; - pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; -#else + args.fwa_next_hop = next_hop_from_ipfwd_tag; + pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst; +#else /* !IPFIREWALL */ pkt_dst = ip->ip_dst; -#endif +#endif /* !IPFIREWALL */ /* * We must not send if the packet is destined to network zero. @@ -484,33 +577,29 @@ loopit: /* * Fill in IP header. */ - if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { + if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) { ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); ip->ip_off &= IP_DF; -#if RANDOM_IP_ID ip->ip_id = ip_randomid(); -#else - ip->ip_id = htons(ip_id++); -#endif OSAddAtomic(1, &ipstat.ips_localout); } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } - + #if DEBUG /* For debugging, we let the stack forge congestion */ if (forge_ce != 0 && - ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 || - (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) { + ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 || + (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) { ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE; forge_ce--; } #endif /* DEBUG */ - KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, - ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - - dst = (struct sockaddr_in *)&ro->ro_dst; + KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, + ip->ip_p, ip->ip_off, ip->ip_len); + + dst = SIN(&ro->ro_dst); /* * If there is a cached route, @@ -521,15 +610,15 @@ loopit: */ if (ro->ro_rt != NULL) { - if (ro->ro_rt->generation_id != route_generation && - ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && - (ip->ip_src.s_addr != INADDR_ANY)) { + if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY && + !(flags & (IP_ROUTETOIF | IP_FORWARDING))) { src_ia = ifa_foraddr(ip->ip_src.s_addr); if (src_ia == NULL) { error = EADDRNOTAVAIL; goto bad; } IFA_REMREF(&src_ia->ia_ifa); + src_ia = NULL; } /* * Test rt_flags without holding rt_lock for performance @@ -537,38 +626,35 @@ loopit: * caught by the layer below (since it uses this route * as a hint) or during the next transmit. */ - if ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - dst->sin_family != AF_INET || - dst->sin_addr.s_addr != pkt_dst.s_addr) { - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - } + if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET || + dst->sin_addr.s_addr != pkt_dst.s_addr) + ROUTE_RELEASE(ro); + /* * If we're doing source interface selection, we may not * want to use this route; only synch up the generation * count otherwise. */ - if (!select_srcif && ro->ro_rt != NULL && - ro->ro_rt->generation_id != route_generation) - ro->ro_rt->generation_id = route_generation; + if (!ipobf.select_srcif && ro->ro_rt != NULL && + RT_GENID_OUTOFSYNC(ro->ro_rt)) + RT_GENID_SYNC(ro->ro_rt); } if (ro->ro_rt == NULL) { - bzero(dst, sizeof(*dst)); + bzero(dst, sizeof (*dst)); dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); + dst->sin_len = sizeof (*dst); dst->sin_addr = pkt_dst; } /* * If routing to interface only, * short circuit routing lookup. */ -#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) -#define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { - if (ia) + if (ia != NULL) IFA_REMREF(&ia->ia_ifa); - if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) { - if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { + ia = ifatoia(ifa_ifwithnet(sintosa(dst))); + if (ia == NULL) { OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; @@ -576,20 +662,31 @@ loopit: } ifp = ia->ia_ifp; ip->ip_ttl = 1; - isbroadcast = in_broadcast(dst->sin_addr, ifp); + ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp); + /* + * For consistency with other cases below. Loopback + * multicast case is handled separately by ip_mloopback(). + */ + if ((ifp->if_flags & IFF_LOOPBACK) && + !IN_MULTICAST(ntohl(pkt_dst.s_addr))) { + m->m_pkthdr.rcvif = ifp; + ip_setsrcifaddr_info(m, ifp->if_index, NULL); + ip_setdstifaddr_info(m, ifp->if_index, NULL); + } } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ - isbroadcast = 0; + ipobf.isbroadcast = FALSE; if (ia != NULL) IFA_REMREF(&ia->ia_ifa); /* Macro takes reference on ia */ IFP_TO_IA(ifp, ia); } else { + struct ifaddr *ia0 = NULL; boolean_t cloneok = FALSE; /* * Perform source interface selection; the source IP address @@ -599,36 +696,38 @@ loopit: * or if we haven't done source interface selection on this * route (for this PCB instance) before. */ - if (select_srcif && ip->ip_src.s_addr != INADDR_ANY && - (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) || - ro->ro_rt->generation_id != route_generation || + if (ipobf.select_srcif && + ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) || !(ro->ro_flags & ROF_SRCIF_SELECTED))) { - struct ifaddr *ifa; - /* Find the source interface */ - ifa = in_selectsrcif(ip, ro, ifscope); + ia0 = in_selectsrcif(ip, ro, ifscope); /* - * If the source address belongs to a cellular interface - * and the caller forbids our using interfaces of such - * type, pretend that there is no source address. + * If the source address belongs to a restricted + * interface and the caller forbids our using + * interfaces of such type, pretend that there is no + * route. */ - if (nocell && ifa != NULL && - ifa->ifa_ifp->if_type == IFT_CELLULAR) { - IFA_REMREF(ifa); - error = EADDRNOTAVAIL; + if (ia0 != NULL && + IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) { + IFA_REMREF(ia0); + ia0 = NULL; + error = EHOSTUNREACH; + if (flags & IP_OUTARGS) + ipoa->ipoa_retflags |= IPOARF_IFDENIED; goto bad; } /* - * If the source address is spoofed (in the case - * of IP_RAWOUTPUT), or if this is destined for - * local/loopback, just let it go out using the - * interface of the route. Otherwise, there's no - * interface having such an address, so bail out. + * If the source address is spoofed (in the case of + * IP_RAWOUTPUT on an unbounded socket), or if this + * is destined for local/loopback, just let it go out + * using the interface of the route. Otherwise, + * there's no interface having such an address, + * so bail out. */ - if (ifa == NULL && !(flags & IP_RAWOUTPUT) && - ifscope != lo_ifp->if_index) { + if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) || + ipobf.srcbound) && ifscope != lo_ifp->if_index) { error = EADDRNOTAVAIL; goto bad; } @@ -647,10 +746,9 @@ loopit: * gateway points to that of the default gateway on * the primary interface of the system. */ - if (ifa != NULL) { + if (ia0 != NULL) { if (ifscope == IFSCOPE_NONE) - ifscope = ifa->ifa_ifp->if_index; - IFA_REMREF(ifa); + ifscope = ia0->ifa_ifp->if_index; cloneok = (!(flags & IP_RAWOUTPUT) && !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); } @@ -703,17 +801,20 @@ loopit: rtalloc_scoped_ign(ro, ign, ifscope); /* - * If the route points to a cellular interface and the - * caller forbids our using interfaces of such type, + * If the route points to a cellular/expensive interface + * and the caller forbids our using interfaces of such type, * pretend that there is no route. */ - if (nocell && ro->ro_rt != NULL) { + if (ro->ro_rt != NULL) { RT_LOCK_SPIN(ro->ro_rt); - if (ro->ro_rt->rt_ifp->if_type == - IFT_CELLULAR) { + if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp, + ipobf)) { RT_UNLOCK(ro->ro_rt); - rtfree(ro->ro_rt); - ro->ro_rt = NULL; + ROUTE_RELEASE(ro); + if (flags & IP_OUTARGS) { + ipoa->ipoa_retflags |= + IPOARF_IFDENIED; + } } else { RT_UNLOCK(ro->ro_rt); } @@ -723,33 +824,75 @@ loopit: if (ro->ro_rt == NULL) { OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; + if (ia0 != NULL) { + IFA_REMREF(ia0); + ia0 = NULL; + } goto bad; } - if (ia) + if (ia != NULL) IFA_REMREF(&ia->ia_ifa); RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) { + if (ia != NULL) { /* Become a regular mutex */ RT_CONVERT_LOCK(ro->ro_rt); IFA_ADDREF(&ia->ia_ifa); } + /* + * Note: ia_ifp may not be the same as rt_ifp; the latter + * is what we use for determining outbound i/f, mtu, etc. + */ ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; - if (ro->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + if (ro->ro_rt->rt_flags & RTF_GATEWAY) { + dst = SIN(ro->ro_rt->rt_gateway); + } if (ro->ro_rt->rt_flags & RTF_HOST) { - isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); + /* double negation needed for bool bit field */ + ipobf.isbroadcast = + !!(ro->ro_rt->rt_flags & RTF_BROADCAST); } else { /* Become a regular mutex */ RT_CONVERT_LOCK(ro->ro_rt); - isbroadcast = in_broadcast(dst->sin_addr, ifp); + ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + /* + * For consistency with IPv6, as well as to ensure that + * IP_RECVIF is set correctly for packets that are sent + * to one of the local addresses. ia (rt_ifa) would have + * been fixed up by rt_setif for local routes. This + * would make it appear as if the packet arrives on the + * interface which owns the local address. Loopback + * multicast case is handled separately by ip_mloopback(). + */ + if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) && + !IN_MULTICAST(ntohl(pkt_dst.s_addr))) { + uint32_t srcidx; + + m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp; + + if (ia0 != NULL) + srcidx = ia0->ifa_ifp->if_index; + else if ((ro->ro_flags & ROF_SRCIF_SELECTED) && + ro->ro_srcia != NULL) + srcidx = ro->ro_srcia->ifa_ifp->if_index; + else + srcidx = 0; + + ip_setsrcifaddr_info(m, srcidx, NULL); + ip_setdstifaddr_info(m, 0, ia); } RT_UNLOCK(ro->ro_rt); + if (ia0 != NULL) { + IFA_REMREF(ia0); + ia0 = NULL; + } } if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { + struct ifnet *srcifp = NULL; struct in_multi *inm; u_int32_t vif; u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; @@ -761,7 +904,7 @@ loopit: * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ - dst = (struct sockaddr_in *)&ro->ro_dst; + dst = SIN(&ro->ro_dst); /* * See if the caller provided any multicast options */ @@ -770,17 +913,12 @@ loopit: vif = imo->imo_multicast_vif; ttl = imo->imo_multicast_ttl; loop = imo->imo_multicast_loop; - if ((flags & IP_RAWOUTPUT) == 0) + if (!(flags & IP_RAWOUTPUT)) ip->ip_ttl = ttl; if (imo->imo_multicast_ifp != NULL) ifp = imo->imo_multicast_ifp; IMO_UNLOCK(imo); -#if MROUTING - if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 || - ip->ip_src.s_addr == INADDR_ANY)) - ip->ip_src.s_addr = ip_mcast_src(vif); -#endif /* MROUTING */ - } else if ((flags & IP_RAWOUTPUT) == 0) { + } else if (!(flags & IP_RAWOUTPUT)) { vif = -1; ip->ip_ttl = ttl; } @@ -788,7 +926,7 @@ loopit: * Confirm that the outgoing interface supports multicast. */ if (imo == NULL || vif == -1) { - if ((ifp->if_flags & IFF_MULTICAST) == 0) { + if (!(ifp->if_flags & IFF_MULTICAST)) { OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; @@ -805,6 +943,7 @@ loopit: IFA_LOCK_SPIN(&ia1->ia_ifa); if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; + srcifp = ifp; IFA_UNLOCK(&ia1->ia_ifa); break; } @@ -828,10 +967,11 @@ loopit: */ if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; - int seen = (inject_filter_ref == 0); + int seen = (inject_filter_ref == NULL); if (imo != NULL) { - ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS; + ipf_pktopts.ippo_flags |= + IPPOF_MCAST_OPTS; ipf_pktopts.ippo_mcast_ifnet = ifp; ipf_pktopts.ippo_mcast_ttl = ttl; ipf_pktopts.ippo_mcast_loop = loop; @@ -839,20 +979,26 @@ loopit: ipf_ref(); - /* 4135317 - always pass network byte order to filter */ - + /* + * 4135317 - always pass network byte + * order to filter + */ #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); #endif - TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { - if ((struct ipfilter *)inject_filter_ref == filter) + if ((struct ipfilter *) + inject_filter_ref == filter) seen = 1; - } else if (filter->ipf_filter.ipf_output) { + } else if (filter->ipf_filter. + ipf_output != NULL) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); + result = filter->ipf_filter. + ipf_output(filter-> + ipf_filter.cookie, + (mbuf_t *)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); INM_REMREF(inm); @@ -868,49 +1014,15 @@ loopit: /* set back to host byte order */ ip = mtod(m, struct ip *); - #if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); #endif - ipf_unref(); - didfilter = 1; + ipobf.didfilter = TRUE; } - ip_mloopback(ifp, m, dst, hlen); + ip_mloopback(srcifp, ifp, m, dst, hlen); } -#if MROUTING - else { - /* - * If we are acting as a multicast router, perform - * multicast forwarding as if the packet had just - * arrived on the interface to which we are about - * to send. The multicast forwarding function - * recursively calls this function, using the - * IP_FORWARDING flag to prevent infinite recursion. - * - * Multicasts that are looped back by ip_mloopback(), - * above, will be forwarded by the ip_input() routine, - * if necessary. - */ - if (ip_mrouter && (flags & IP_FORWARDING) == 0) { - /* - * Check if rsvp daemon is running. If not, don't - * set ip_moptions. This ensures that the packet - * is multicast and not just sent down one link - * as prescribed by rsvpd. - */ - if (!rsvp_on) - imo = NULL; - if (ip_mforward(ip, ifp, m, imo) != 0) { - m_freem(m); - if (inm != NULL) - INM_REMREF(inm); - goto done; - } - } - } -#endif /* MROUTING */ if (inm != NULL) INM_REMREF(inm); /* @@ -928,7 +1040,6 @@ loopit: goto sendit; } -#ifndef notdef /* * If source address not specified yet, use address * of outgoing interface. @@ -938,27 +1049,27 @@ loopit: ip->ip_src = IA_SIN(ia)->sin_addr; IFA_UNLOCK(&ia->ia_ifa); #if IPFIREWALL_FORWARD - /* Keep note that we did this - if the firewall changes + /* + * Keep note that we did this - if the firewall changes * the next-hop, our interface may change, changing the * default source IP. It's a shame so much effort happens - * twice. Oh well. + * twice. Oh well. */ - fwd_rewrite_src++; + ipobf.fwd_rewrite_src = TRUE; #endif /* IPFIREWALL_FORWARD */ } -#endif /* notdef */ /* * Look for broadcast address and * and verify user is allowed to send * such a packet. */ - if (isbroadcast) { - if ((ifp->if_flags & IFF_BROADCAST) == 0) { + if (ipobf.isbroadcast) { + if (!(ifp->if_flags & IFF_BROADCAST)) { error = EADDRNOTAVAIL; goto bad; } - if ((flags & IP_ALLOWBROADCAST) == 0) { + if (!(flags & IP_ALLOWBROADCAST)) { error = EACCES; goto bad; } @@ -975,14 +1086,31 @@ loopit: sendit: #if PF /* Invoke outbound packet filter */ - if ( PF_IS_ENABLED) { + if (PF_IS_ENABLED) { int rc; - rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE); - if (rc != 0) { - if (packetlist == m0) { + + m0 = m; /* Save for later */ +#if DUMMYNET + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; + if (flags & IP_OUTARGS) + args.fwa_ipoa = ipoa; + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args); +#else /* DUMMYNET */ + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL); +#endif /* DUMMYNET */ + if (rc != 0 || m == NULL) { + /* Move to the next packet */ + m = *mppn; + + /* Skip ahead if first packet in list got dropped */ + if (packetlist == m0) packetlist = m; - mppn = NULL; - } + if (m != NULL) { m0 = m; /* Next packet in the chain */ @@ -1000,23 +1128,25 @@ sendit: hlen = IP_VHL_HL(ip->ip_vhl) << 2; } #endif /* PF */ - /* - * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt - */ - if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { + /* + * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt + */ + if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || + IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { ip_linklocal_stat.iplls_out_total++; if (ip->ip_ttl != MAXTTL) { ip_linklocal_stat.iplls_out_badttl++; - ip->ip_ttl = MAXTTL; + ip->ip_ttl = MAXTTL; } - } + } - if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { + if (!ipobf.didfilter && !TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; - int seen = (inject_filter_ref == 0); + int seen = (inject_filter_ref == NULL); ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; - /* Check that a TSO frame isn't passed to a filter. + /* + * Check that a TSO frame isn't passed to a filter. * This could happen if a filter is inserted while * TCP is sending the TSO packet. */ @@ -1026,21 +1156,22 @@ sendit: } ipf_ref(); - - /* 4135317 - always pass network byte order to filter */ + /* 4135317 - always pass network byte order to filter */ #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); #endif - TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (seen == 0) { - if ((struct ipfilter *)inject_filter_ref == filter) + if ((struct ipfilter *)inject_filter_ref == + filter) seen = 1; } else if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); + result = filter->ipf_filter. + ipf_output(filter->ipf_filter.cookie, + (mbuf_t *)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -1051,37 +1182,95 @@ sendit: } } } - /* set back to host byte order */ ip = mtod(m, struct ip *); - #if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); #endif - ipf_unref(); } -#if IPSEC - /* temporary for testing only: bypass ipsec alltogether */ - - if (ipsec_bypass != 0 || (flags & IP_NOIPSEC) != 0) - goto skip_ipsec; +#if NECP + /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */ + necp_matched_policy_id = necp_ip_output_find_policy_match (m, + flags, (flags & IP_OUTARGS) ? ipoa : NULL, &necp_result, &necp_result_parameter); + if (necp_matched_policy_id) { + necp_mark_packet_from_ip(m, necp_matched_policy_id); + switch (necp_result) { + case NECP_KERNEL_POLICY_RESULT_PASS: + /* Check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, ifp)) { + error = EHOSTUNREACH; + goto bad; + } + goto skip_ipsec; + case NECP_KERNEL_POLICY_RESULT_DROP: + case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT: + /* Flow divert packets should be blocked at the IP layer */ + error = EHOSTUNREACH; + goto bad; + case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: { + /* Verify that the packet is being routed to the tunnel */ + struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter); + if (policy_ifp == ifp) { + /* Check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, ifp)) { + error = EHOSTUNREACH; + goto bad; + } + goto skip_ipsec; + } else { + if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) { + /* Check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) { + error = EHOSTUNREACH; + goto bad; + } - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + /* Set ifp to the tunnel interface, since it is compatible with the packet */ + ifp = policy_ifp; + ro = &necp_route; + goto skip_ipsec; + } else { + error = ENETUNREACH; + goto bad; + } + } + break; + } + default: + break; + } + } + /* Catch-all to check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, ifp)) { + error = EHOSTUNREACH; + goto bad; + } +#endif /* NECP */ +#if IPSEC + if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) + goto skip_ipsec; - /* get SP for this packet */ - if (so == NULL) - sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); - else - sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); if (sp == NULL) { - IPSEC_STAT_INCREMENT(ipsecstat.out_inval); - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); - goto bad; + /* get SP for this packet */ + if (so != NULL) { + sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, + so, &error); + } else { + sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, + flags, &error); + } + if (sp == NULL) { + IPSEC_STAT_INCREMENT(ipsecstat.out_inval); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 0, 0, 0, 0, 0); + goto bad; + } } error = 0; @@ -1094,20 +1283,30 @@ sendit: * This packet is just discarded. */ IPSEC_STAT_INCREMENT(ipsecstat.out_polvio); - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 1, 0, 0, 0, 0); goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 2, 0, 0, 0, 0); goto skip_ipsec; - + case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { /* acquire a policy */ error = key_spdacquire(sp); - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 3, 0, 0, 0, 0); + goto bad; + } + if (sp->ipsec_if) { + /* Verify the redirect to ipsec interface */ + if (sp->ipsec_if == ifp) { + goto skip_ipsec; + } goto bad; } break; @@ -1116,13 +1315,14 @@ sendit: default: printf("ip_output: Invalid policy found. %d\n", sp->policy); } - { + { ipsec_state.m = m; if (flags & IP_ROUTETOIF) { - bzero(&ipsec_state.ro, sizeof(ipsec_state.ro)); - } else - route_copyout(&ipsec_state.ro, ro, sizeof(ipsec_state.ro)); - ipsec_state.dst = (struct sockaddr *)dst; + bzero(&ipsec_state.ro, sizeof (ipsec_state.ro)); + } else { + route_copyout(&ipsec_state.ro, ro, sizeof (ipsec_state.ro)); + } + ipsec_state.dst = SA(dst); ip->ip_sum = 0; @@ -1130,11 +1330,8 @@ sendit: * XXX * delayed checksums are not currently compatible with IPsec */ - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) in_delayed_cksum(m); - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - } - #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); @@ -1142,13 +1339,28 @@ sendit: #endif DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, - struct ip *, ip, struct ifnet *, ifp, - struct ip *, ip, struct ip6_hdr *, NULL); + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); error = ipsec4_output(&ipsec_state, sp, flags); - + if (ipsec_state.tunneled == 6) { + m0 = m = NULL; + error = 0; + goto bad; + } + m0 = m = ipsec_state.m; - + +#if DUMMYNET + /* + * If we're about to use the route in ipsec_state + * and this came from dummynet, cleaup now. + */ + if (ro == &saved_route && + (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) + ROUTE_RELEASE(ro); +#endif /* DUMMYNET */ + if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore @@ -1156,14 +1368,12 @@ sendit: */ if (ipsec_state.tunneled) { flags &= ~IP_ROUTETOIF; - ipsec_saved_route = ro; ro = &ipsec_state.ro; } } else { - ipsec_saved_route = ro; ro = &ipsec_state.ro; } - dst = (struct sockaddr_in *)ipsec_state.dst; + dst = SIN(ipsec_state.dst); if (error) { /* mbuf is already reclaimed in ipsec4_output. */ m0 = NULL; @@ -1176,55 +1386,59 @@ sendit: break; default: printf("ip4_output (ipsec): error code %d\n", error); - /*fall through*/ + /* FALLTHRU */ case ENOENT: /* don't show these error codes to the user */ error = 0; break; } - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 4,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 4, 0, 0, 0, 0); goto bad; } - } + } /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); - + #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; -#else +#else /* !_IP_VHL */ hlen = ip->ip_hl << 2; -#endif +#endif /* !_IP_VHL */ /* Check that there wasn't a route change and src is still valid */ - if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) { - if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL && - ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { + if (ROUTE_UNUSABLE(ro)) { + ROUTE_RELEASE(ro); + VERIFY(src_ia == NULL); + if (ip->ip_src.s_addr != INADDR_ANY && + !(flags & (IP_ROUTETOIF | IP_FORWARDING)) && + (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) { error = EADDRNOTAVAIL; KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, - 5,0,0,0,0); + 5, 0, 0, 0, 0); goto bad; } - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - if (src_ia != NULL) + if (src_ia != NULL) { IFA_REMREF(&src_ia->ia_ifa); + src_ia = NULL; + } } if (ro->ro_rt == NULL) { - if ((flags & IP_ROUTETOIF) == 0) { - printf("ip_output: can't update route after " - "IPsec processing\n"); - error = EHOSTUNREACH; /*XXX*/ + if (!(flags & IP_ROUTETOIF)) { + printf("%s: can't update route after " + "IPsec processing\n", __func__); + error = EHOSTUNREACH; /* XXX */ KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, - 6,0,0,0,0); + 6, 0, 0, 0, 0); goto bad; } } else { - if (ia) + if (ia != NULL) IFA_REMREF(&ia->ia_ifa); RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) { + if (ia != NULL) { /* Become a regular mutex */ RT_CONVERT_LOCK(ro->ro_rt); IFA_ADDREF(&ia->ia_ifa); @@ -1234,21 +1448,21 @@ sendit: } /* make it flipped, again. */ - #if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); #endif - - KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); - + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 7, 0xff, 0xff, 0xff, 0xff); + /* Pass to filters again */ if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; - + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; - /* Check that a TSO frame isn't passed to a filter. + /* + * Check that a TSO frame isn't passed to a filter. * This could happen if a filter is inserted while * TCP is sending the TSO packet. */ @@ -1258,18 +1472,18 @@ sendit: } ipf_ref(); - - /* 4135317 - always pass network byte order to filter */ + /* 4135317 - always pass network byte order to filter */ #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); #endif - TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); + result = filter->ipf_filter. + ipf_output(filter->ipf_filter.cookie, + (mbuf_t *)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -1280,79 +1494,60 @@ sendit: } } } - /* set back to host byte order */ ip = mtod(m, struct ip *); - #if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); NTOHS(ip->ip_off); #endif - ipf_unref(); } skip_ipsec: -#endif /*IPSEC*/ +#endif /* IPSEC */ #if IPFIREWALL - /* - * IpHack's section. - * - Xlate: translate packet's addr/port (NAT). - * - Firewall: deny/allow/etc. - * - Wrap: fake packet's addr/port - * - Encapsulate: put it in another IP and send out. - */ - if (fr_checkp) { - struct mbuf *m1 = m; - - if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) { - goto done; - } - ip = mtod(m0 = m = m1, struct ip *); - } - /* * Check with the firewall... * but not if we are already being fwd'd from a firewall. */ - if (fw_enable && IPFW_LOADED && !args.next_hop) { + if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) { struct sockaddr_in *old = dst; - args.m = m; - args.next_hop = dst; - args.oif = ifp; - off = ip_fw_chk_ptr(&args); - m = args.m; - dst = args.next_hop; - - /* - * On return we must do the following: - * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new) - * 1<=off<= 0xffff -> DIVERT - * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe - * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet - * dst != old -> IPFIREWALL_FORWARD - * off==0, dst==old -> accept - * If some of the above modules is not compiled in, then - * we should't have to check the corresponding condition - * (because the ipfw control socket should not accept - * unsupported rules), but better play safe and drop - * packets in case of doubt. - */ + args.fwa_m = m; + args.fwa_next_hop = dst; + args.fwa_oif = ifp; + ipfwoff = ip_fw_chk_ptr(&args); + m = args.fwa_m; + dst = args.fwa_next_hop; + + /* + * On return we must do the following: + * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new) + * 1<=off<= 0xffff -> DIVERT + * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe + * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet + * dst != old -> IPFIREWALL_FORWARD + * off==0, dst==old -> accept + * If some of the above modules is not compiled in, then + * we should't have to check the corresponding condition + * (because the ipfw control socket should not accept + * unsupported rules), but better play safe and drop + * packets in case of doubt. + */ m0 = m; - if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) { + if ((ipfwoff & IP_FW_PORT_DENY_FLAG) || m == NULL) { if (m) m_freem(m); - error = EACCES ; - goto done ; + error = EACCES; + goto done; } ip = mtod(m, struct ip *); - - if (off == 0 && dst == old) {/* common case */ - goto pass ; + + if (ipfwoff == 0 && dst == old) { /* common case */ + goto pass; } #if DUMMYNET - if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { + if (DUMMYNET_LOADED && (ipfwoff & IP_FW_PORT_DYNT_FLAG) != 0) { /* * pass the pkt to dummynet. Need to include * pipe number, m, ifp, ro, dst because these are @@ -1362,33 +1557,31 @@ skip_ipsec: * XXX note: if the ifp or ro entry are deleted * while a pkt is in dummynet, we are in trouble! */ - args.ro = ro; - args.dst = dst; - args.flags = flags; + args.fwa_ro = ro; + args.fwa_dst = dst; + args.fwa_oflags = flags; if (flags & IP_OUTARGS) - args.ipoa = ipoa; + args.fwa_ipoa = ipoa; - error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, - &args); + error = ip_dn_io_ptr(m, ipfwoff & 0xffff, DN_TO_IP_OUT, + &args, DN_CLIENT_IPFW); goto done; } #endif /* DUMMYNET */ #if IPDIVERT - if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { + if (ipfwoff != 0 && (ipfwoff & IP_FW_PORT_DYNT_FLAG) == 0) { struct mbuf *clone = NULL; /* Clone packet if we're doing a 'tee' */ - if ((off & IP_FW_PORT_TEE_FLAG) != 0) + if ((ipfwoff & IP_FW_PORT_TEE_FLAG) != 0) clone = m_dup(m, M_DONTWAIT); /* * XXX * delayed checksums are not currently compatible * with divert sockets. */ - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) in_delayed_cksum(m); - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - } /* Restore packet header fields to original values */ @@ -1398,7 +1591,8 @@ skip_ipsec: #endif /* Deliver packet to divert input routine */ - divert_packet(m, 0, off & 0xffff, args.divert_rule); + divert_packet(m, 0, ipfwoff & 0xffff, + args.fwa_divert_rule); /* If 'tee', continue with original packet */ if (clone != NULL) { @@ -1408,11 +1602,11 @@ skip_ipsec: } goto done; } -#endif - +#endif /* IPDIVERT */ #if IPFIREWALL_FORWARD - /* Here we check dst to make sure it's directly reachable on the - * interface we previously thought it was. + /* + * Here we check dst to make sure it's directly reachable on + * the interface we previously thought it was. * If it isn't (which may be likely in some situations) we have * to re-route it (ie, find a route for the next-hop and the * associated interface) and set them here. This is nested @@ -1420,27 +1614,25 @@ skip_ipsec: * such control is nigh impossible. So we do it here. * And I'm babbling. */ - if (off == 0 && old != dst) { + if (ipfwoff == 0 && old != dst) { struct in_ifaddr *ia_fw; + struct route *ro_fwd = &sro_fwd; - /* It's changed... */ - /* There must be a better way to do this next line... */ - static struct route sro_fwd, *ro_fwd = &sro_fwd; #if IPFIREWALL_FORWARD_DEBUG printf("IPFIREWALL_FORWARD: New dst ip: "); print_ip(dst->sin_addr); printf("\n"); -#endif +#endif /* IPFIREWALL_FORWARD_DEBUG */ /* * We need to figure out if we have been forwarded - * to a local socket. If so then we should somehow + * to a local socket. If so then we should somehow * "loop back" to ip_input, and get directed to the * PCB as if we had received this packet. This is * because it may be dificult to identify the packets * you want to forward until they are being output * and have selected an interface. (e.g. locally * initiated packets) If we used the loopback inteface, - * we would not be able to control what happens + * we would not be able to control what happens * as the packet runs through ip_input() as * it is done through a ISR. */ @@ -1462,7 +1654,7 @@ skip_ipsec: lck_rw_done(in_ifaddr_rwlock); if (ia_fw) { /* tell ip_input "dont filter" */ - struct m_tag *fwd_tag; + struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, @@ -1474,54 +1666,41 @@ skip_ipsec: } ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); - ipfwd_tag->next_hop = args.next_hop; + ipfwd_tag->next_hop = args.fwa_next_hop; m_tag_prepend(m, fwd_tag); if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = lo_ifp; - if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & - m->m_pkthdr.csum_flags) == 0) { - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - m->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xffff; - } - m->m_pkthdr.csum_flags |= - CSUM_IP_CHECKED | CSUM_IP_VALID; - } - else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - in_delayed_cksum(m); - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - ip->ip_sum = in_cksum(m, hlen); - } #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); -#endif - - /* we need to call dlil_output to run filters - * and resync to avoid recursion loops. +#endif + mbuf_outbound_finalize(m, PF_INET, 0); + + /* + * we need to call dlil_output to run filters + * and resync to avoid recursion loops. */ if (lo_ifp) { - dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0); - } - else { - printf("ip_output: no loopback ifp for forwarding!!!\n"); + dlil_output(lo_ifp, PF_INET, m, NULL, + SA(dst), 0, adv); + } else { + printf("%s: no loopback ifp for " + "forwarding!!!\n", __func__); } goto done; } - /* Some of the logic for this was - * nicked from above. + /* + * Some of the logic for this was nicked from above. * * This rewrites the cached route in a local PCB. * Is this what we want to do? */ - bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); + ROUTE_RELEASE(ro_fwd); + bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst)); - ro_fwd->ro_rt = NULL; rtalloc_ign(ro_fwd, RTF_PRCLONING); if (ro_fwd->ro_rt == NULL) { @@ -1540,19 +1719,22 @@ skip_ipsec: ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; + dst = SIN(ro_fwd->ro_rt->rt_gateway); if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { - isbroadcast = - (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); + /* double negation needed for bool bit field */ + ipobf.isbroadcast = + !!(ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); } else { /* Become a regular mutex */ RT_CONVERT_LOCK(ro_fwd->ro_rt); - isbroadcast = in_broadcast(dst->sin_addr, ifp); + ipobf.isbroadcast = + in_broadcast(dst->sin_addr, ifp); } RT_UNLOCK(ro_fwd->ro_rt); - rtfree(ro->ro_rt); + ROUTE_RELEASE(ro); ro->ro_rt = ro_fwd->ro_rt; - dst = (struct sockaddr_in *)&ro_fwd->ro_dst; + ro_fwd->ro_rt = NULL; + dst = SIN(&ro_fwd->ro_dst); /* * If we added a default src ip earlier, @@ -1560,20 +1742,20 @@ skip_ipsec: * interface, do it again, from the new one. */ if (ia_fw != NULL) { - if (fwd_rewrite_src) { + if (ipobf.fwd_rewrite_src) { IFA_LOCK_SPIN(&ia_fw->ia_ifa); ip->ip_src = IA_SIN(ia_fw)->sin_addr; IFA_UNLOCK(&ia_fw->ia_ifa); } IFA_REMREF(&ia_fw->ia_ifa); } - goto pass ; + goto pass; } #endif /* IPFIREWALL_FORWARD */ - /* - * if we get here, none of the above matches, and - * we have to drop the pkt - */ + /* + * if we get here, none of the above matches, and + * we have to drop the pkt + */ m_freem(m); error = EACCES; /* not sure this is the right error msg */ goto done; @@ -1581,109 +1763,66 @@ skip_ipsec: pass: #endif /* IPFIREWALL */ -#if __APPLE__ - /* Do not allow loopback address to wind up on a wire */ - if ((ifp->if_flags & IFF_LOOPBACK) == 0 && - ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || - (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { - OSAddAtomic(1, &ipstat.ips_badaddr); - m_freem(m); - /* - * Do not simply drop the packet just like a firewall -- we want the - * the application to feel the pain. - * Return ENETUNREACH like ip6_output does in some similar cases. - * This can startle the otherwise clueless process that specifies - * loopback as the source address. - */ - error = ENETUNREACH; - goto done; - } -#endif - m->m_pkthdr.csum_flags |= CSUM_IP; - tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4); - - sw_csum = m->m_pkthdr.csum_flags - & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); - if ((ifp->if_hwassist & CSUM_TCP_SUM16) != 0) { - /* - * Special case code for GMACE - * frames that can be checksumed by GMACE SUM16 HW: - * frame >64, no fragments, no UDP - */ - if (apple_hwcksum_tx && (m->m_pkthdr.csum_flags & CSUM_TCP) - && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu)) { - /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */ - u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */ - u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF; - m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ - m->m_pkthdr.csum_data = (csumprev + offset) << 16 ; - m->m_pkthdr.csum_data += offset; - sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ - } - else { - /* let the software handle any UDP or TCP checksums */ - sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); - } - } else if (apple_hwcksum_tx == 0) { - sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) & - m->m_pkthdr.csum_flags; - } - - if (sw_csum & CSUM_DELAY_DATA) { - in_delayed_cksum(m); - sw_csum &= ~CSUM_DELAY_DATA; - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + /* 127/8 must not appear on wire - RFC1122 */ + if (!(ifp->if_flags & IFF_LOOPBACK) && + ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { + OSAddAtomic(1, &ipstat.ips_badaddr); + error = EADDRNOTAVAIL; + goto bad; } - if (apple_hwcksum_tx != 0) { - m->m_pkthdr.csum_flags &= - IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); - } else { - m->m_pkthdr.csum_flags = 0; - } + ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2), + ip->ip_len, &sw_csum); /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ - if ((u_short)ip->ip_len <= ifp->if_mtu || tso || - ifp->if_hwassist & CSUM_FRAGMENT) { - if (tso) - m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; - - + if ((u_short)ip->ip_len <= ifp->if_mtu || TSO_IPV4_OK(ifp, m) || + (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) { #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); #endif - + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { - ip->ip_sum = in_cksum(m, hlen); + ip->ip_sum = ip_cksum_hdr_out(m, hlen); + sw_csum &= ~CSUM_DELAY_IP; + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; } - -#ifndef __APPLE__ - /* Record statistics for this interface address. */ - if (!(flags & IP_FORWARDING) && ia != NULL) { - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; - } -#endif #if IPSEC /* clean ipsec history once it goes out of the node */ - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) + if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) ipsec_delaux(m); -#endif +#endif /* IPSEC */ + if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) && + (m->m_pkthdr.tso_segsz > 0)) + scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz; + else + scnt++; + if (packetchain == 0) { - if (ro->ro_rt && nstat_collect) - nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); - error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, - (struct sockaddr *)dst); + if (ro->ro_rt != NULL && nstat_collect) + nstat_route_tx(ro->ro_rt, scnt, + m->m_pkthdr.len, 0); + + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + SA(dst), 0, adv); + if (dlil_verbose && error) { + printf("dlil_output error on interface %s: %d\n", + ifp->if_xname, error); + } + scnt = 0; goto done; - } - else { /* packet chaining allows us to reuse the route for all packets */ + } else { + /* + * packet chaining allows us to reuse the + * route for all packets + */ bytecnt += m->m_pkthdr.len; mppn = &m->m_nextpkt; m = m->m_nextpkt; @@ -1693,15 +1832,21 @@ sendchain: #endif /* PF */ if (pktcnt > ip_maxchainsent) ip_maxchainsent = pktcnt; - if (ro->ro_rt && nstat_collect) - nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0); - //send - error = ifnet_output(ifp, PF_INET, packetlist, - ro->ro_rt, (struct sockaddr *)dst); + if (ro->ro_rt != NULL && nstat_collect) + nstat_route_tx(ro->ro_rt, scnt, + bytecnt, 0); + + error = dlil_output(ifp, PF_INET, packetlist, + ro->ro_rt, SA(dst), 0, adv); + if (dlil_verbose && error) { + printf("dlil_output error on interface %s: %d\n", + ifp->if_xname, error); + } pktcnt = 0; + scnt = 0; bytecnt = 0; goto done; - + } m0 = m; pktcnt++; @@ -1711,10 +1856,10 @@ sendchain: /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. + * Balk when DF bit is set or the interface didn't support TSO. */ - - if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) || - pktcnt > 0) { + if ((ip->ip_off & IP_DF) || pktcnt > 0 || + (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU @@ -1725,9 +1870,9 @@ sendchain: */ if (ro->ro_rt) { RT_LOCK_SPIN(ro->ro_rt); - if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) - && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) - && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && + !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && + (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } RT_UNLOCK(ro->ro_rt); @@ -1745,60 +1890,82 @@ sendchain: goto bad; } - KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, - ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; #if IPSEC /* clean ipsec history once it goes out of the node */ - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) + if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) ipsec_delaux(m); -#endif +#endif /* IPSEC */ if (error == 0) { -#ifndef __APPLE__ - /* Record statistics for this interface address. */ - if (ia != NULL) { - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; + if ((packetchain != 0) && (pktcnt > 0)) { + panic("%s: mix of packet in packetlist is " + "wrong=%p", __func__, packetlist); + /* NOTREACHED */ } -#endif - if ((packetchain != 0) && (pktcnt > 0)) - panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); - if (ro->ro_rt && nstat_collect) - nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); - error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, - (struct sockaddr *)dst); - } else + if (ro->ro_rt != NULL && nstat_collect) { + nstat_route_tx(ro->ro_rt, 1, + m->m_pkthdr.len, 0); + } + error = dlil_output(ifp, PF_INET, m, ro->ro_rt, + SA(dst), 0, adv); + if (dlil_verbose && error) { + printf("dlil_output error on interface %s: %d\n", + ifp->if_xname, error); + } + } else { m_freem(m); + } } if (error == 0) OSAddAtomic(1, &ipstat.ips_fragmented); done: - if (ia) { + if (ia != NULL) { IFA_REMREF(&ia->ia_ifa); ia = NULL; } #if IPSEC - if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { - if (ipsec_state.ro.ro_rt) - rtfree(ipsec_state.ro.ro_rt); + ROUTE_RELEASE(&ipsec_state.ro); if (sp != NULL) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, - printf("DP ip_output call free SP:%x\n", sp)); + printf("DP ip_output call free SP:%x\n", sp)); key_freesp(sp, KEY_SADB_UNLOCKED); } - } #endif /* IPSEC */ +#if NECP + ROUTE_RELEASE(&necp_route); +#endif /* NECP */ +#if DUMMYNET + ROUTE_RELEASE(&saved_route); +#endif /* DUMMYNET */ +#if IPFIREWALL_FORWARD + ROUTE_RELEASE(&sro_fwd); +#endif /* IPFIREWALL_FORWARD */ - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0); + if (ip_output_measure) { + net_perf_measure_time(&net_perf, &start_tv, packets_processed); + net_perf_histogram(&net_perf, packets_processed); + } return (error); bad: - m_freem(m0); + if (pktcnt > 0) + m0 = packetlist; + m_freem_list(m0); goto done; + +#undef ipsec_state +#undef args +#undef sro_fwd +#undef saved_route +#undef ipf_pktopts +#undef IP_CHECK_RESTRICTIONS } int @@ -1812,9 +1979,9 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; -#else +#else /* !_IP_VHL */ hlen = ip->ip_hl << 2; -#endif +#endif /* !_IP_VHL */ firstlen = len = (mtu - hlen) &~ 7; if (len < 8) { @@ -1826,11 +1993,9 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) * if the interface will not calculate checksums on * fragmented packets, then do it here. */ - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && - (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { + if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) && + !(ifp->if_hwassist & CSUM_IP_FRAGS)) in_delayed_cksum(m); - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - } /* * Loop through length of segment after first fragment, @@ -1840,7 +2005,7 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) mhlen = sizeof (struct ip); for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ - if (m == 0) { + if (m == NULL) { error = ENOBUFS; OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; @@ -1863,19 +2028,22 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, len); - if (m->m_next == 0) { + if (m->m_next == NULL) { (void) m_free(m); error = ENOBUFS; /* ??? */ OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_pkthdr.len = mhlen + len; - m->m_pkthdr.rcvif = 0; + m->m_pkthdr.rcvif = NULL; m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; - m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; + + M_COPY_CLASSIFIER(m, m0); + M_COPY_PFTAG(m, m0); + #if CONFIG_MACF_NET mac_netinet_fragment(m0, m); -#endif +#endif /* CONFIG_MACF_NET */ #if BYTE_ORDER != BIG_ENDIAN HTONS(mhip->ip_off); @@ -1883,7 +2051,8 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) mhip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { - mhip->ip_sum = in_cksum(m, mhlen); + mhip->ip_sum = ip_cksum_hdr_out(m, mhlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; } *mnext = m; mnext = &m->m_nextpkt; @@ -1909,10 +2078,11 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_off); #endif - + ip->ip_sum = 0; if (sw_csum & CSUM_DELAY_IP) { - ip->ip_sum = in_cksum(m, hlen); + ip->ip_sum = ip_cksum_hdr_out(m, hlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; } sendorfree: if (error) @@ -1937,46 +2107,64 @@ ip_out_cksum_stats(int proto, u_int32_t len) } } -void -in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) +/* + * Process a delayed payload checksum calculation (outbound path.) + * + * hoff is the number of bytes beyond the mbuf data pointer which + * points to the IP header. + * + * Returns a bitmask representing all the work done in software. + */ +uint32_t +in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags) { + unsigned char buf[15 << 2] __attribute__((aligned(8))); struct ip *ip; - unsigned char buf[sizeof(struct ip)]; - u_short csum, offset, ip_len; + uint32_t offset, _hlen, mlen, hlen, len, sw_csum; + uint16_t csum, ip_len; - /* Save copy of first mbuf pointer and the ip_offset before modifying */ - struct mbuf *m = m0; - int ip_offset_copy = ip_offset; + _CASSERT(sizeof (csum) == sizeof (uint16_t)); + VERIFY(m->m_flags & M_PKTHDR); - while (ip_offset >= m->m_len) { - ip_offset -= m->m_len; - m = m->m_next; - if (m == NULL) { - printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n"); - return; - } + sw_csum = (csum_flags & m->m_pkthdr.csum_flags); + + if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) + goto done; + + mlen = m->m_pkthdr.len; /* total mbuf len */ + + /* sanity check (need at least simple IP header) */ + if (mlen < (hoff + sizeof (*ip))) { + panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr " + "(%u+%u)\n", __func__, m, mlen, hoff, + (uint32_t)sizeof (*ip)); + /* NOTREACHED */ } - - /* Sometimes the IP header is not contiguous, yes this can happen! */ - if (ip_offset + sizeof(struct ip) > m->m_len) { -#if DEBUG - printf("delayed m_pullup, m->len: %d off: %d\n", - m->m_len, ip_offset); -#endif - m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); - - ip = (struct ip *)buf; + + /* + * In case the IP header is not contiguous, or not 32-bit aligned, + * or if we're computing the IP header checksum, copy it to a local + * buffer. Copy only the simple IP header here (IP options case + * is handled below.) + */ + if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof (*ip)) > m->m_len || + !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) { + m_copydata(m, hoff, sizeof (*ip), (caddr_t)buf); + ip = (struct ip *)(void *)buf; + _hlen = sizeof (*ip); } else { - ip = (struct ip*)(m->m_data + ip_offset); + ip = (struct ip *)(void *)(m->m_data + hoff); + _hlen = 0; } - - /* Gross */ - if (ip_offset) { - m->m_len -= ip_offset; - m->m_data += ip_offset; + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; /* IP header len */ + + /* sanity check */ + if (mlen < (hoff + hlen)) { + panic("%s: mbuf %p pkt too short (%d) for IP header (%u), " + "hoff %u", __func__, m, mlen, hlen, hoff); + /* NOTREACHED */ } - - offset = IP_VHL_HL(ip->ip_vhl) << 2 ; /* * We could be in the context of an IP or interface filter; in the @@ -1984,155 +2172,116 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) * the latter it would be in network order. Because of this, we * attempt to interpret the length field by comparing it against * the actual packet length. If the comparison fails, byte swap - * the length and check again. If it still fails, then the packet - * is bogus and we give up. + * the length and check again. If it still fails, use the actual + * packet length. This also covers the trailing bytes case. */ ip_len = ip->ip_len; - if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { - ip_len = SWAP16(ip_len); - if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { - printf("in_delayed_cksum_offset: ip_len %d (%d) " - "doesn't match actual length %d\n", ip->ip_len, - ip_len, (m0->m_pkthdr.len - ip_offset_copy)); - return; + if (ip_len != (mlen - hoff)) { + ip_len = OSSwapInt16(ip_len); + if (ip_len != (mlen - hoff)) { + printf("%s: mbuf 0x%llx proto %d IP len %d (%x) " + "[swapped %d (%x)] doesn't match actual packet " + "length; %d is used instead\n", __func__, + (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p, + ip->ip_len, ip->ip_len, ip_len, ip_len, + (mlen - hoff)); + ip_len = mlen - hoff; } } - csum = in_cksum_skip(m, ip_len, offset); + len = ip_len - hlen; /* csum span */ - /* Update stats */ - ip_out_cksum_stats(ip->ip_p, ip_len - offset); + if (sw_csum & CSUM_DELAY_DATA) { + uint16_t ulpoff; + + /* + * offset is added to the lower 16-bit value of csum_data, + * which is expected to contain the ULP offset; therefore + * CSUM_PARTIAL offset adjustment must be undone. + */ + if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL|CSUM_DATA_VALID)) == + (CSUM_PARTIAL|CSUM_DATA_VALID)) { + /* + * Get back the original ULP offset (this will + * undo the CSUM_PARTIAL logic in ip_output.) + */ + m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff - + m->m_pkthdr.csum_tx_start); + } - if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) - csum = 0xffff; - offset += m0->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */ + ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */ + offset = hoff + hlen; /* ULP header */ - /* Gross */ - if (ip_offset) { - if (M_LEADINGSPACE(m) < ip_offset) - panic("in_delayed_cksum_offset - chain modified!\n"); - m->m_len += ip_offset; - m->m_data -= ip_offset; - } + if (mlen < (ulpoff + sizeof (csum))) { + panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP " + "cksum offset (%u) cksum flags 0x%x\n", __func__, + m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags); + /* NOTREACHED */ + } - if (offset > ip_len) /* bogus offset */ - return; + csum = inet_cksum(m, 0, offset, len); - /* Insert the checksum in the existing chain */ - if (offset + ip_offset + sizeof(u_short) > m->m_len) { - char tmp[2]; - -#if DEBUG - printf("delayed m_copyback, m->len: %d off: %d p: %d\n", - m->m_len, offset + ip_offset, ip->ip_p); -#endif - *(u_short *)tmp = csum; - m_copyback(m, offset + ip_offset, 2, tmp); - } else - *(u_short *)(m->m_data + offset + ip_offset) = csum; -} + /* Update stats */ + ip_out_cksum_stats(ip->ip_p, len); -void -in_delayed_cksum(struct mbuf *m) -{ - in_delayed_cksum_offset(m, 0); -} + /* RFC1122 4.1.3.4 */ + if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP)) + csum = 0xffff; -void -in_cksum_offset(struct mbuf* m, size_t ip_offset) -{ - struct ip* ip = NULL; - int hlen = 0; - unsigned char buf[sizeof(struct ip)]; - int swapped = 0; - - /* Save copy of first mbuf pointer and the ip_offset before modifying */ - struct mbuf* m0 = m; - size_t ip_offset_copy = ip_offset; - - while (ip_offset >= m->m_len) { - ip_offset -= m->m_len; - m = m->m_next; - if (m == NULL) { - printf("in_cksum_offset failed - ip_offset wasn't in the packet\n"); - return; + /* Insert the checksum in the ULP csum field */ + offset += ulpoff; + if (offset + sizeof (csum) > m->m_len) { + m_copyback(m, offset, sizeof (csum), &csum); + } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) { + *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum; + } else { + bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); } + m->m_pkthdr.csum_flags &= + ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | CSUM_PARTIAL); } - - /* Sometimes the IP header is not contiguous, yes this can happen! */ - if (ip_offset + sizeof(struct ip) > m->m_len) { -#if DEBUG - printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %lu\n", - m->m_len, ip_offset); -#endif - m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); + if (sw_csum & CSUM_DELAY_IP) { + /* IP header must be in the local buffer */ + VERIFY(_hlen == sizeof (*ip)); + if (_hlen != hlen) { + VERIFY(hlen <= sizeof (buf)); + m_copydata(m, hoff, hlen, (caddr_t)buf); + ip = (struct ip *)(void *)buf; + _hlen = hlen; + } - ip = (struct ip *)buf; - ip->ip_sum = 0; - m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, (caddr_t)&ip->ip_sum); - } else { - ip = (struct ip*)(m->m_data + ip_offset); + /* + * Compute the IP header checksum as if the IP length + * is the length which we believe is "correct"; see + * how ip_len gets calculated above. Note that this + * is done on the local copy and not on the real one. + */ + ip->ip_len = htons(ip_len); ip->ip_sum = 0; - } - - /* Gross */ - if (ip_offset) { - m->m_len -= ip_offset; - m->m_data += ip_offset; - } - -#ifdef _IP_VHL - hlen = IP_VHL_HL(ip->ip_vhl) << 2; -#else - hlen = ip->ip_hl << 2; -#endif - /* - * We could be in the context of an IP or interface filter; in the - * former case, ip_len would be in host order while for the latter - * it would be in network (correct) order. Because of this, we - * attempt to interpret the length field by comparing it against - * the actual packet length. If the comparison fails, byte swap - * the length and check again. If it still fails, then the packet - * is bogus and we give up. - */ - if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { - ip->ip_len = SWAP16(ip->ip_len); - swapped = 1; - if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { - ip->ip_len = SWAP16(ip->ip_len); - printf("in_cksum_offset: ip_len %d (%d) " - "doesn't match actual length %lu\n", - ip->ip_len, SWAP16(ip->ip_len), - (m0->m_pkthdr.len - ip_offset_copy)); - return; - } - } + csum = in_cksum_hdr_opt(ip); - ip->ip_sum = 0; - ip->ip_sum = in_cksum(m, hlen); - if (swapped) - ip->ip_len = SWAP16(ip->ip_len); + /* Update stats */ + ipstat.ips_snd_swcsum++; + ipstat.ips_snd_swcsum_bytes += hlen; - /* Gross */ - if (ip_offset) { - if (M_LEADINGSPACE(m) < ip_offset) - panic("in_cksum_offset - chain modified!\n"); - m->m_len += ip_offset; - m->m_data -= ip_offset; + /* + * Insert only the checksum in the existing IP header + * csum field; all other fields are left unchanged. + */ + offset = hoff + offsetof(struct ip, ip_sum); + if (offset + sizeof (csum) > m->m_len) { + m_copyback(m, offset, sizeof (csum), &csum); + } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) { + *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum; + } else { + bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); + } + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; } - /* Insert the checksum in the existing chain if IP header not contiguous */ - if (ip_offset + sizeof(struct ip) > m->m_len) { - char tmp[2]; - -#if DEBUG - printf("in_cksum_offset m_copyback, m->len: %u off: %lu p: %d\n", - m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); -#endif - *(u_short *)tmp = ip->ip_sum; - m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp); - } +done: + return (sw_csum); } /* @@ -2143,46 +2292,43 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) * XXX This routine assumes that the packet has no options in place. */ static struct mbuf * -ip_insertoptions(m, opt, phlen) - register struct mbuf *m; - struct mbuf *opt; - int *phlen; +ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) { - register struct ipoption *p = mtod(opt, struct ipoption *); + struct ipoption *p = mtod(opt, struct ipoption *); struct mbuf *n; - register struct ip *ip = mtod(m, struct ip *); + struct ip *ip = mtod(m, struct ip *); unsigned optlen; - optlen = opt->m_len - sizeof(p->ipopt_dst); + optlen = opt->m_len - sizeof (p->ipopt_dst); if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) return (m); /* XXX should fail */ if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ - if (n == 0) + if (n == NULL) return (m); n->m_pkthdr.rcvif = 0; #if CONFIG_MACF_NET mac_mbuf_label_copy(m, n); -#endif +#endif /* CONFIG_MACF_NET */ n->m_pkthdr.len = m->m_pkthdr.len + optlen; - m->m_len -= sizeof(struct ip); - m->m_data += sizeof(struct ip); + m->m_len -= sizeof (struct ip); + m->m_data += sizeof (struct ip); n->m_next = m; m = n; - m->m_len = optlen + sizeof(struct ip); + m->m_len = optlen + sizeof (struct ip); m->m_data += max_linkhdr; - (void)memcpy(mtod(m, void *), ip, sizeof(struct ip)); + (void) memcpy(mtod(m, void *), ip, sizeof (struct ip)); } else { m->m_data -= optlen; m->m_len += optlen; m->m_pkthdr.len += optlen; - ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof (struct ip)); } ip = mtod(m, struct ip *); bcopy(p->ipopt_list, ip + 1, optlen); - *phlen = sizeof(struct ip) + optlen; + *phlen = sizeof (struct ip) + optlen; ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); ip->ip_len += optlen; return (m); @@ -2192,11 +2338,10 @@ ip_insertoptions(m, opt, phlen) * Copy options from ip to jp, * omitting those not copied during fragmentation. */ -int -ip_optcopy(ip, jp) - struct ip *ip, *jp; +static int +ip_optcopy(struct ip *ip, struct ip *jp) { - register u_char *cp, *dp; + u_char *cp, *dp; int opt, optlen, cnt; cp = (u_char *)(ip + 1); @@ -2213,13 +2358,17 @@ ip_optcopy(ip, jp) continue; } #if DIAGNOSTIC - if (cnt < IPOPT_OLEN + sizeof(*cp)) + if (cnt < IPOPT_OLEN + sizeof (*cp)) { panic("malformed IPv4 option passed to ip_optcopy"); + /* NOTREACHED */ + } #endif optlen = cp[IPOPT_OLEN]; #if DIAGNOSTIC - if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) { panic("malformed IPv4 option passed to ip_optcopy"); + /* NOTREACHED */ + } #endif /* bogus lengths should have been caught by ip_dooptions */ if (optlen > cnt) @@ -2238,45 +2387,42 @@ ip_optcopy(ip, jp) * IP socket option processing. */ int -ip_ctloutput(so, sopt) - struct socket *so; - struct sockopt *sopt; +ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; error = optval = 0; - if (sopt->sopt_level != IPPROTO_IP) { + if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); - } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { - case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif - { + case IP_OPTIONS: { struct mbuf *m; + if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT, MT_HEADER); - if (m == 0) { + if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; - error = sooptcopyin(sopt, mtod(m, char *), m->m_len, - m->m_len); + error = sooptcopyin(sopt, mtod(m, char *), + m->m_len, m->m_len); if (error) break; - - return (ip_pcbopts(sopt->sopt_name, &inp->inp_options, - m)); + + return (ip_pcbopts(sopt->sopt_name, + &inp->inp_options, m)); } case IP_TOS: @@ -2286,12 +2432,9 @@ ip_ctloutput(so, sopt) case IP_RECVDSTADDR: case IP_RECVIF: case IP_RECVTTL: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif case IP_RECVPKTINFO: - error = sooptcopyin(sopt, &optval, sizeof optval, - sizeof optval); + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); if (error) break; @@ -2329,11 +2472,6 @@ ip_ctloutput(so, sopt) OPTSET(INP_RECVTTL); break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - OPTSET(INP_FAITH); - break; -#endif case IP_RECVPKTINFO: OPTSET(INP_PKTINFO); break; @@ -2360,7 +2498,7 @@ ip_ctloutput(so, sopt) } /* Verify interface name parameter is sane */ - if (sopt->sopt_valsize > sizeof(ifname)) { + if (sopt->sopt_valsize > sizeof (ifname)) { error = EINVAL; break; } @@ -2400,10 +2538,10 @@ ip_ctloutput(so, sopt) */ ifnet_release(ifp); } - inp_bindif(inp, ifscope); + error = inp_bindif(inp, ifscope, NULL); } break; -#endif +#endif /* CONFIG_FORCE_OUT_IFP */ /* * Multicast socket options are processed by the in_mcast * module. @@ -2430,8 +2568,8 @@ ip_ctloutput(so, sopt) break; case IP_PORTRANGE: - error = sooptcopyin(sopt, &optval, sizeof optval, - sizeof optval); + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); if (error) break; @@ -2458,14 +2596,13 @@ ip_ctloutput(so, sopt) break; #if IPSEC - case IP_IPSEC_POLICY: - { + case IP_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; int priv; struct mbuf *m; int optname; - + if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -2480,14 +2617,15 @@ ip_ctloutput(so, sopt) m_freem(m); break; } -#endif /*IPSEC*/ +#endif /* IPSEC */ #if TRAFFIC_MGT - case IP_TRAFFIC_MGT_BACKGROUND: - { - unsigned background = 0; - error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background)); - if (error) + case IP_TRAFFIC_MGT_BACKGROUND: { + unsigned background = 0; + + error = sooptcopyin(sopt, &background, + sizeof (background), sizeof (background)); + if (error) break; if (background) { @@ -2533,7 +2671,7 @@ ip_ctloutput(so, sopt) if (error) break; - inp_bindif(inp, optval); + error = inp_bindif(inp, optval, NULL); break; case IP_NO_IFT_CELLULAR: @@ -2549,7 +2687,14 @@ ip_ctloutput(so, sopt) if (error) break; - error = inp_nocellular(inp, optval); + /* once set, it cannot be unset */ + if (!optval && INP_NO_CELLULAR(inp)) { + error = EINVAL; + break; + } + + error = so_set_restrictions(so, + SO_RESTRICT_DENY_CELLULAR); break; case IP_OUT_IF: @@ -2567,13 +2712,13 @@ ip_ctloutput(so, sopt) switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: - if (inp->inp_options) - error = sooptcopyout(sopt, - mtod(inp->inp_options, - char *), - inp->inp_options->m_len); - else + if (inp->inp_options) { + error = sooptcopyout(sopt, + mtod(inp->inp_options, char *), + inp->inp_options->m_len); + } else { sopt->sopt_valsize = 0; + } break; case IP_TOS: @@ -2584,9 +2729,6 @@ ip_ctloutput(so, sopt) case IP_RECVIF: case IP_RECVTTL: case IP_PORTRANGE: -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: -#endif case IP_RECVPKTINFO: switch (sopt->sopt_name) { @@ -2629,16 +2771,11 @@ ip_ctloutput(so, sopt) optval = 0; break; -#if defined(NFAITH) && NFAITH > 0 - case IP_FAITH: - optval = OPTBIT(INP_FAITH); - break; -#endif case IP_RECVPKTINFO: optval = OPTBIT(INP_PKTINFO); break; } - error = sooptcopyout(sopt, &optval, sizeof optval); + error = sooptcopyout(sopt, &optval, sizeof (optval)); break; case IP_MULTICAST_IF: @@ -2651,47 +2788,36 @@ ip_ctloutput(so, sopt) break; #if IPSEC - case IP_IPSEC_POLICY: - { - struct mbuf *m = NULL; - caddr_t req = NULL; - size_t len = 0; - - if (m != 0) { - req = mtod(m, caddr_t); - len = m->m_len; - } - error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); - if (error == 0) - error = soopt_mcopyout(sopt, m); /* XXX */ - if (error == 0) - m_freem(m); + case IP_IPSEC_POLICY: { + error = 0; /* This option is no longer supported */ break; } -#endif /*IPSEC*/ +#endif /* IPSEC */ #if TRAFFIC_MGT - case IP_TRAFFIC_MGT_BACKGROUND: - { - unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); - return (sooptcopyout(sopt, &background, sizeof(background))); + case IP_TRAFFIC_MGT_BACKGROUND: { + unsigned background = (so->so_traffic_mgt_flags & + TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0; + return (sooptcopyout(sopt, &background, + sizeof (background))); break; } #endif /* TRAFFIC_MGT */ case IP_BOUND_IF: if (inp->inp_flags & INP_BOUND_IF) - optval = inp->inp_boundif; + optval = inp->inp_boundifp->if_index; error = sooptcopyout(sopt, &optval, sizeof (optval)); break; case IP_NO_IFT_CELLULAR: - optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + optval = INP_NO_CELLULAR(inp) ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof (optval)); break; case IP_OUT_IF: - optval = inp->inp_last_outif; + optval = (inp->inp_last_outifp != NULL) ? + inp->inp_last_outifp->if_index : 0; error = sooptcopyout(sopt, &optval, sizeof (optval)); break; @@ -2710,44 +2836,41 @@ ip_ctloutput(so, sopt) * with destination address if source routed. */ static int -ip_pcbopts( - __unused int optname, - struct mbuf **pcbopt, - register struct mbuf *m) +ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m) { - register int cnt, optlen; - register u_char *cp; +#pragma unused(optname) + int cnt, optlen; + u_char *cp; u_char opt; /* turn off any old options */ if (*pcbopt) - (void)m_free(*pcbopt); + (void) m_free(*pcbopt); *pcbopt = 0; if (m == (struct mbuf *)0 || m->m_len == 0) { /* * Only turning off any previous options. */ if (m) - (void)m_free(m); + (void) m_free(m); return (0); } -#ifndef vax - if (m->m_len % sizeof(int32_t)) + if (m->m_len % sizeof (int32_t)) goto bad; -#endif + /* * IP first-hop destination address will be stored before * actual options; move other options back * and clear it when none present. */ - if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) + if (m->m_data + m->m_len + sizeof (struct in_addr) >= &m->m_dat[MLEN]) goto bad; cnt = m->m_len; - m->m_len += sizeof(struct in_addr); - cp = mtod(m, u_char *) + sizeof(struct in_addr); + m->m_len += sizeof (struct in_addr); + cp = mtod(m, u_char *) + sizeof (struct in_addr); ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); - bzero(mtod(m, caddr_t), sizeof(struct in_addr)); + bzero(mtod(m, caddr_t), sizeof (struct in_addr)); for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[IPOPT_OPTVAL]; @@ -2756,10 +2879,10 @@ ip_pcbopts( if (opt == IPOPT_NOP) optlen = 1; else { - if (cnt < IPOPT_OLEN + sizeof(*cp)) + if (cnt < IPOPT_OLEN + sizeof (*cp)) goto bad; optlen = cp[IPOPT_OLEN]; - if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) goto bad; } switch (opt) { @@ -2777,35 +2900,35 @@ ip_pcbopts( * A is first hop destination, which doesn't appear in * actual IP option, but is stored before the options. */ - if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) + if (optlen < IPOPT_MINOFF - 1 + sizeof (struct in_addr)) goto bad; - m->m_len -= sizeof(struct in_addr); - cnt -= sizeof(struct in_addr); - optlen -= sizeof(struct in_addr); + m->m_len -= sizeof (struct in_addr); + cnt -= sizeof (struct in_addr); + optlen -= sizeof (struct in_addr); cp[IPOPT_OLEN] = optlen; /* * Move first hop before start of options. */ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), - sizeof(struct in_addr)); + sizeof (struct in_addr)); /* * Then copy rest of options back * to close up the deleted entry. */ ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + - sizeof(struct in_addr)), + sizeof (struct in_addr)), (caddr_t)&cp[IPOPT_OFFSET+1], - (unsigned)cnt + sizeof(struct in_addr)); + (unsigned)cnt + sizeof (struct in_addr)); break; } } - if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) + if (m->m_len > MAX_IPOPTLEN + sizeof (struct in_addr)) goto bad; *pcbopt = m; return (0); bad: - (void)m_free(m); + (void) m_free(m); return (EINVAL); } @@ -2951,18 +3074,23 @@ ip_allocmoptions(int how) * replicating that code here. */ static void -ip_mloopback(ifp, m, dst, hlen) - struct ifnet *ifp; - register struct mbuf *m; - register struct sockaddr_in *dst; - int hlen; +ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m, + struct sockaddr_in *dst, int hlen) { - register struct ip *ip; struct mbuf *copym; - int sw_csum = (apple_hwcksum_tx == 0); + struct ip *ip; + + if (lo_ifp == NULL) + return; - copym = m_copy(m, 0, M_COPYALL); - if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + /* + * Copy the packet header as it's needed for the checksum + * Make sure to deep-copy IP header portion in case the data + * is in an mbuf cluster, so that we can safely override the IP + * header portion later. + */ + copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR); + if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym == NULL) @@ -2973,83 +3101,65 @@ ip_mloopback(ifp, m, dst, hlen) * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); - #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); #endif - ip->ip_sum = 0; - ip->ip_sum = in_cksum(copym, hlen); + ip->ip_sum = ip_cksum_hdr_out(copym, hlen); + /* - * NB: - * It's not clear whether there are any lingering - * reentrancy problems in other areas which might - * be exposed by using ip_input directly (in - * particular, everything which modifies the packet - * in-place). Yet another option is using the - * protosw directly to deliver the looped back - * packet. For the moment, we'll err on the side - * of safety by using if_simloop(). + * Mark checksum as valid unless receive checksum offload is + * disabled; if so, compute checksum in software. If the + * interface itself is lo0, this will be overridden by if_loop. */ -#if 1 /* XXX */ - if (dst->sin_family != AF_INET) { - printf("ip_mloopback: bad address family %d\n", - dst->sin_family); - dst->sin_family = AF_INET; - } -#endif - - /* - * Mark checksum as valid or calculate checksum for loopback. - * - * This is done this way because we have to embed the ifp of - * the interface we will send the original copy of the packet - * out on in the mbuf. ip_input will check if_hwassist of the - * embedded ifp and ignore all csum_flags if if_hwassist is 0. - * The UDP checksum has not been calculated yet. - */ - if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) { - if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) { - copym->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR | - CSUM_IP_CHECKED | CSUM_IP_VALID; - copym->m_pkthdr.csum_data = 0xffff; - } else { - + if (hwcksum_rx) { + copym->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; + copym->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + copym->m_pkthdr.csum_data = 0xffff; + } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { #if BYTE_ORDER != BIG_ENDIAN - NTOHS(ip->ip_len); + NTOHS(ip->ip_len); #endif - - in_delayed_cksum(copym); - + in_delayed_cksum(copym); #if BYTE_ORDER != BIG_ENDIAN - HTONS(ip->ip_len); + HTONS(ip->ip_len); #endif - - } - } + } /* - * TedW: - * We need to send all loopback traffic down to dlil in case - * a filter has tapped-in. + * Stuff the 'real' ifp into the pkthdr, to be used in matching + * in ip_input(); we need the loopback ifp/dl_tag passed as args + * to make the loopback driver compliant with the data link + * requirements. */ + copym->m_pkthdr.rcvif = origifp; /* - * Stuff the 'real' ifp into the pkthdr, to be used in matching - * in ip_input(); we need the loopback ifp/dl_tag passed as args - * to make the loopback driver compliant with the data link - * requirements. + * Also record the source interface (which owns the source address). + * This is basically a stripped down version of ifa_foraddr(). */ - if (lo_ifp) { - copym->m_pkthdr.rcvif = ifp; - dlil_output(lo_ifp, PF_INET, copym, 0, - (struct sockaddr *) dst, 0); - } else { - printf("Warning: ip_output call to dlil_find_dltag failed!\n"); - m_freem(copym); + if (srcifp == NULL) { + struct in_ifaddr *ia; + + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) { + srcifp = ia->ia_ifp; + IFA_UNLOCK(&ia->ia_ifa); + break; + } + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(in_ifaddr_rwlock); } + if (srcifp != NULL) + ip_setsrcifaddr_info(copym, srcifp->if_index, NULL); + ip_setdstifaddr_info(copym, origifp->if_index, NULL); + + dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL); } /* @@ -3071,6 +3181,8 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) struct ifnet *rt_ifp; char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN]; + VERIFY(src.s_addr != INADDR_ANY); + if (ip_select_srcif_debug) { (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src)); (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst)); @@ -3103,7 +3215,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) if (scope == IFSCOPE_NONE) { scope = rt_ifp->if_index; if (scope != get_primary_ifscope(AF_INET) && - ro->ro_rt->generation_id != route_generation) + ROUTE_UNUSABLE(ro)) scope = get_primary_ifscope(AF_INET); } @@ -3169,7 +3281,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) sin.sin_addr = dst; lck_mtx_lock(rnh_lock); - if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL, + if ((rt = rt_lookup(TRUE, SA(&sin), NULL, rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) { RT_LOCK(rt); /* @@ -3280,9 +3392,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) } RT_UNLOCK(ro->ro_rt); - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - ro->ro_flags &= ~ROF_SRCIF_SELECTED; + ROUTE_RELEASE(ro); /* * If the destination is IPv4 LLA and the route's interface @@ -3317,8 +3427,13 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) || (ro->ro_rt->rt_gateway->sa_family == AF_LINK && SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) { + if (ifa != NULL) + IFA_ADDREF(ifa); /* for route */ + if (ro->ro_srcia != NULL) + IFA_REMREF(ro->ro_srcia); + ro->ro_srcia = ifa; ro->ro_flags |= ROF_SRCIF_SELECTED; - ro->ro_rt->generation_id = route_generation; + RT_GENID_SYNC(ro->ro_rt); } if (ro->ro_rt != NULL) @@ -3326,3 +3441,140 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) return (ifa); } + +void +ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len, + uint32_t *sw_csum) +{ + int tso = TSO_IPV4_OK(ifp, m); + uint32_t hwcap = ifp->if_hwassist; + + m->m_pkthdr.csum_flags |= CSUM_IP; + + if (!hwcksum_tx) { + /* do all in software; hardware checksum offload is disabled */ + *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) & + m->m_pkthdr.csum_flags; + } else { + /* do in software what the hardware cannot */ + *sw_csum = m->m_pkthdr.csum_flags & + ~IF_HWASSIST_CSUM_FLAGS(hwcap); + } + + if (hlen != sizeof (struct ip)) { + *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) & + m->m_pkthdr.csum_flags); + } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) { + /* + * Partial checksum offload, if non-IP fragment, and TCP only + * (no UDP support, as the hardware may not be able to convert + * +0 to -0 (0xffff) per RFC1122 4.1.3.4.) + */ + if (hwcksum_tx && !tso && + (m->m_pkthdr.csum_flags & CSUM_TCP) && + ip_len <= ifp->if_mtu) { + uint16_t start = sizeof (struct ip); + uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff; + m->m_pkthdr.csum_flags |= + (CSUM_DATA_VALID | CSUM_PARTIAL); + m->m_pkthdr.csum_tx_stuff = (ulpoff + start); + m->m_pkthdr.csum_tx_start = start; + /* do IP hdr chksum in software */ + *sw_csum = CSUM_DELAY_IP; + } else { + *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); + } + } + + if (*sw_csum & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + *sw_csum &= ~CSUM_DELAY_DATA; + } + + if (hwcksum_tx) { + /* + * Drop off bits that aren't supported by hardware; + * also make sure to preserve non-checksum related bits. + */ + m->m_pkthdr.csum_flags = + ((m->m_pkthdr.csum_flags & + (IF_HWASSIST_CSUM_FLAGS(hwcap) | CSUM_DATA_VALID)) | + (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK)); + } else { + /* drop all bits; hardware checksum offload is disabled */ + m->m_pkthdr.csum_flags = 0; + } +} + +/* + * GRE protocol output for PPP/PPTP + */ +int +ip_gre_output(struct mbuf *m) +{ + struct route ro; + int error; + + bzero(&ro, sizeof (ro)); + + error = ip_output(m, NULL, &ro, 0, NULL, NULL); + + ROUTE_RELEASE(&ro); + + return (error); +} + +static int +sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, i; + + i = ip_output_measure; + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || req->newptr == USER_ADDR_NULL) + goto done; + /* impose bounds */ + if (i < 0 || i > 1) { + error = EINVAL; + goto done; + } + if (ip_output_measure != i && i == 1) { + net_perf_initialize(&net_perf, ip_output_measure_bins); + } + ip_output_measure = i; +done: + return (error); +} + +static int +sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error; + uint64_t i; + + i = ip_output_measure_bins; + error = sysctl_handle_quad(oidp, &i, 0, req); + if (error || req->newptr == USER_ADDR_NULL) + goto done; + /* validate data */ + if (!net_perf_validate_bins(i)) { + error = EINVAL; + goto done; + } + ip_output_measure_bins = i; +done: + return (error); +} + +static int +sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + if (req->oldptr == USER_ADDR_NULL) + req->oldlen = (size_t)sizeof (struct ipstat); + + return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen))); +} +