X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/7b1edb791d9ca667b95988cb5638c4c88416cd17..d41d1dae2cd00cc08c7982087d1c445180cad9f5:/bsd/netinet/ip_output.c diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index ff9b2da85..07d74f97f 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993 @@ -52,17 +58,17 @@ * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $ + */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. */ #define _IP_VHL -#if ISFB31 -#include "opt_ipfw.h" -#include "opt_ipdn.h" -#include "opt_ipdivert.h" -#include "opt_ipfilter.h" -#endif - #include #include #include @@ -71,63 +77,62 @@ #include #include #include +#include +#include + +#include #include +#include #include #include #include #include -#if INET6 -#include -#include -#endif #include #include #include -#include +#include + +#if CONFIG_MACF_NET +#include +#endif + +#include "faith.h" + +#include #include +#include #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1) +#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1) - -#ifdef vax -#include -#endif - -#if ISFB31 -#include - -static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); -#endif - -//static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); +#define SWAP16(v) ((((v) & 0xff) << 8) | ((v) >> 8)) #if IPSEC #include #include +#if IPSEC_DEBUG #include - -#endif /*IPSEC*/ - -#if !defined(COMPAT_IPFW) || COMPAT_IPFW == 1 -#undef COMPAT_IPFW -#define COMPAT_IPFW 1 #else -#undef COMPAT_IPFW +#define KEYDEBUG(lev,arg) #endif +#endif /*IPSEC*/ -#if COMPAT_IPFW #include -#endif +#include #if DUMMYNET #include #endif +#if PF +#include +#endif /* PF */ + #if IPFIREWALL_FORWARD_DEBUG #define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\ (ntohl(a.s_addr)>>16)&0xFF,\ @@ -135,24 +140,53 @@ static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); (ntohl(a.s_addr))&0xFF); #endif + u_short ip_id; -static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *)); -static void ip_mloopback - __P((struct ifnet *, struct mbuf *, struct sockaddr_in *, int)); -static int ip_getmoptions - __P((struct sockopt *, struct ip_moptions *)); -static int ip_pcbopts __P((int, struct mbuf **, struct mbuf *)); -static int ip_setmoptions - __P((struct sockopt *, struct ip_moptions **)); -static u_long lo_dl_tag = 0; -static int ip_optcopy __P((struct ip *, struct ip *)); +static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); +static struct ifnet *ip_multicast_if(struct in_addr *, int *); +static void ip_mloopback(struct ifnet *, struct mbuf *, + struct sockaddr_in *, int); +static int ip_getmoptions(struct sockopt *, struct ip_moptions *); +static int ip_pcbopts(int, struct mbuf **, struct mbuf *); +static int ip_setmoptions(struct sockopt *, struct ip_moptions **); + +static void ip_out_cksum_stats(int, u_int32_t); +static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); +static void ip_bindif(struct inpcb *, unsigned int); + +int ip_createmoptions(struct ip_moptions **imop); +int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); +int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); +int ip_optcopy(struct ip *, struct ip *); +void in_delayed_cksum_offset(struct mbuf *, int ); +void in_cksum_offset(struct mbuf* , size_t ); -void in_delayed_cksum(struct mbuf *m); -extern int apple_hwcksum_tx; +extern int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **); extern struct protosw inetsw[]; +extern struct ip_linklocal_stat ip_linklocal_stat; +extern lck_mtx_t *ip_mutex; + +/* temporary: for testing */ +#if IPSEC +extern int ipsec_bypass; +#endif + +static int ip_maxchainsent = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW, + &ip_maxchainsent, 0, "use dlil_output_list"); +#if DEBUG +static int forge_ce = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW, + &forge_ce, 0, "Forge ECN CE"); +#endif /* DEBUG */ + +static int ip_select_srcif_debug = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW, + &ip_select_srcif_debug, 0, "log source interface selection debug info"); + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). @@ -160,136 +194,290 @@ extern struct protosw inetsw[]; * The mbuf opt, if present, will not be freed. */ int -ip_output(m0, opt, ro, flags, imo) - struct mbuf *m0; - struct mbuf *opt; - struct route *ro; - int flags; - struct ip_moptions *imo; +ip_output( + struct mbuf *m0, + struct mbuf *opt, + struct route *ro, + int flags, + struct ip_moptions *imo, + struct ip_out_args *ipoa) { - struct ip *ip, *mhip; - struct ifnet *ifp; - u_long dl_tag; - struct mbuf *m = m0; + int error; + error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa); + return error; +} + +/* + * Returns: 0 Success + * ENOMEM + * EADDRNOTAVAIL + * ENETUNREACH + * EHOSTUNREACH + * EACCES + * EMSGSIZE + * ENOBUFS + * ipsec4_getpolicybyaddr:??? [IPSEC 4th argument, contents modified] + * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified] + * key_spdacquire:??? [IPSEC] + * ipsec4_output:??? [IPSEC] + * :??? [firewall] + * ip_dn_io_ptr:??? [dummynet] + * dlil_output:??? [DLIL] + * dlil_output_list:??? [DLIL] + * + * Notes: The ipsec4_getpolicyby{addr|sock} function error returns are + * only used as the error return from this function where one of + * these functions fails to return a policy. + */ +int +ip_output_list( + struct mbuf *m0, + int packetchain, + struct mbuf *opt, + struct route *ro, + int flags, + struct ip_moptions *imo, + struct ip_out_args *ipoa + ) +{ + struct ip *ip; + struct ifnet *ifp = NULL; + struct mbuf *m = m0, **mppn = NULL; int hlen = sizeof (struct ip); - int len, off, error = 0; - struct sockaddr_in *dst; - struct in_ifaddr *ia; + int len = 0, off, error = 0; + struct sockaddr_in *dst = NULL; + struct in_ifaddr *ia = NULL, *src_ia = NULL; int isbroadcast, sw_csum; + struct in_addr pkt_dst; #if IPSEC struct route iproute; - struct socket *so; + struct socket *so = NULL; struct secpolicy *sp = NULL; #endif #if IPFIREWALL_FORWARD int fwd_rewrite_src = 0; #endif - - -#if !IPDIVERT /* dummy variable for the firewall code to play with */ - u_short ip_divert_cookie = 0 ; -#endif -#if COMPAT_IPFW - struct ip_fw_chain *rule = NULL ; +#if IPFIREWALL + struct ip_fw_args args; #endif + int didfilter = 0; + ipfilter_t inject_filter_ref = 0; + struct m_tag *tag; + struct route saved_route; + struct ip_out_args saved_ipoa; + struct mbuf * packetlist; + int pktcnt = 0, tso = 0; + unsigned int ifscope; + boolean_t select_srcif; KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); -#if IPSEC - /* - * NOTE: m->m_pkthdr is NULL cleared below just to prevent ipfw code - * from SEGV. - * ipfw code uses rcvif to determine incoming interface, and - * KAME uses rcvif for ipsec processing. - * ipfw may not be working right with KAME at this moment. - * We need more tests. - */ + packetlist = m0; +#if IPFIREWALL + args.next_hop = NULL; + args.eh = NULL; + args.rule = NULL; + args.divert_rule = 0; /* divert cookie */ + args.ipoa = NULL; + + if (SLIST_EMPTY(&m0->m_pkthdr.tags)) + goto ipfw_tags_done; + + /* Grab info from mtags prepended to the chain */ #if DUMMYNET - if (m->m_type == MT_DUMMYNET) { - if (m->m_next != NULL) { - so = (struct socket *)m->m_next->m_pkthdr.rcvif; - m->m_next->m_pkthdr.rcvif = NULL; - } else - so = NULL; - } else -#endif - { - so = ipsec_getsocket(m); - ipsec_setsocket(m, NULL); + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + struct dn_pkt_tag *dn_tag; + + dn_tag = (struct dn_pkt_tag *)(tag+1); + args.rule = dn_tag->rule; + opt = NULL; + saved_route = dn_tag->ro; + ro = &saved_route; + + imo = NULL; + dst = dn_tag->dn_dst; + ifp = dn_tag->ifp; + flags = dn_tag->flags; + saved_ipoa = dn_tag->ipoa; + ipoa = &saved_ipoa; + + m_tag_delete(m0, tag); } -#endif /*IPSEC*/ +#endif /* DUMMYNET */ +#if IPDIVERT + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { + struct divert_tag *div_tag; -#if IPFIREWALL && DUMMYNET - /* - * dummynet packet are prepended a vestigial mbuf with - * m_type = MT_DUMMYNET and m_data pointing to the matching - * rule. - */ - if (m->m_type == MT_DUMMYNET) { - struct mbuf *tmp_m = m ; - /* - * the packet was already tagged, so part of the - * processing was already done, and we need to go down. - * opt, flags and imo have already been used, and now - * they are used to hold ifp and hlen and NULL, respectively. - */ - rule = (struct ip_fw_chain *)(m->m_data) ; - m = m->m_next ; - FREE(tmp_m, M_IPFW); - ip = mtod(m, struct ip *); - dst = (struct sockaddr_in *)&ro->ro_dst; - ifp = (struct ifnet *)opt; - hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; - opt = NULL ; - flags = 0 ; /* XXX is this correct ? */ - goto sendit; - } else - rule = NULL ; -#endif + div_tag = (struct divert_tag *)(tag+1); + args.divert_rule = div_tag->cookie; + + m_tag_delete(m0, tag); + } +#endif /* IPDIVERT */ + + if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { + struct ip_fwd_tag *ipfwd_tag; + + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); + args.next_hop = ipfwd_tag->next_hop; + + m_tag_delete(m0, tag); + } +ipfw_tags_done: +#endif /* IPFIREWALL */ + + m = m0; #if DIAGNOSTIC - if ((m->m_flags & M_PKTHDR) == 0) + if ( !m || (m->m_flags & M_PKTHDR) != 0) panic("ip_output no HDR"); if (!ro) panic("ip_output no route, proto = %d", mtod(m, struct ip *)->ip_p); #endif + + /* + * At present the IP_OUTARGS flag implies a request for IP to + * perform source interface selection. In the forwarding case, + * only the ifscope value is used, as source interface selection + * doesn't take place. + */ + if (ip_doscopedroute && (flags & IP_OUTARGS)) { + select_srcif = !(flags & IP_FORWARDING); + ifscope = ipoa->ipoa_ifscope; + } else { + select_srcif = FALSE; + ifscope = IFSCOPE_NONE; + } + +#if IPFIREWALL + if (args.rule != NULL) { /* dummynet already saw us */ + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; + if (ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; + if (ia) + ifaref(&ia->ia_ifa); + RT_UNLOCK(ro->ro_rt); + } +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + so = ipsec_getsocket(m); + (void)ipsec_setsocket(m, NULL); + } +#endif + goto sendit; + } +#endif /* IPFIREWALL */ + +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + so = ipsec_getsocket(m); + (void)ipsec_setsocket(m, NULL); + } +#endif +loopit: + /* + * No need to proccess packet twice if we've + * already seen it + */ + if (!SLIST_EMPTY(&m->m_pkthdr.tags)) + inject_filter_ref = ipf_get_inject_filter(m); + else + inject_filter_ref = 0; + if (opt) { m = ip_insertoptions(m, opt, &len); hlen = len; } ip = mtod(m, struct ip *); +#if IPFIREWALL + pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; +#else + pkt_dst = ip->ip_dst; +#endif + /* * Fill in IP header. */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); ip->ip_off &= IP_DF; +#if RANDOM_IP_ID + ip->ip_id = ip_randomid(); +#else ip->ip_id = htons(ip_id++); - ipstat.ips_localout++; +#endif + OSAddAtomic(1, &ipstat.ips_localout); } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; } + +#if DEBUG + /* For debugging, we let the stack forge congestion */ + if (forge_ce != 0 && + ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 || + (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) { + ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE; + forge_ce--; + } +#endif /* DEBUG */ KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - + dst = (struct sockaddr_in *)&ro->ro_dst; + /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. + * The address family should also be checked in case of sharing the + * cache with IPv6. */ - if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { - RTFREE(ro->ro_rt); - ro->ro_rt = (struct rtentry *)0; + + if (ro->ro_rt != NULL) { + if (ro->ro_rt->generation_id != route_generation && + ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) && + (ip->ip_src.s_addr != INADDR_ANY)) { + src_ia = ifa_foraddr(ip->ip_src.s_addr); + if (src_ia == NULL) { + error = EADDRNOTAVAIL; + goto bad; + } + ifafree(&src_ia->ia_ifa); + } + /* + * Test rt_flags without holding rt_lock for performance + * reasons; if the route is down it will hopefully be + * caught by the layer below (since it uses this route + * as a hint) or during the next transmit. + */ + if ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != pkt_dst.s_addr) { + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + /* + * If we're doing source interface selection, we may not + * want to use this route; only synch up the generation + * count otherwise. + */ + if (!select_srcif && ro->ro_rt != NULL && + ro->ro_rt->generation_id != route_generation) + ro->ro_rt->generation_id = route_generation; } - if (ro->ro_rt == 0) { + if (ro->ro_rt == NULL) { + bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); - dst->sin_addr = ip->ip_dst; + dst->sin_addr = pkt_dst; } /* * If routing to interface only, @@ -298,17 +486,86 @@ ip_output(m0, opt, ro, flags, imo) #define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) #define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { - if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && - (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { - ipstat.ips_noroute++; - error = ENETUNREACH; - goto bad; + if (ia) + ifafree(&ia->ia_ifa); + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) { + if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { + OSAddAtomic(1, &ipstat.ips_noroute); + error = ENETUNREACH; + goto bad; + } } ifp = ia->ia_ifp; - dl_tag = ia->ia_ifa.ifa_dlt; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && + imo != NULL && imo->imo_multicast_ifp != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. + */ + ifp = imo->imo_multicast_ifp; + isbroadcast = 0; + if (ia != NULL) + ifafree(&ia->ia_ifa); + + /* Macro takes reference on ia */ + IFP_TO_IA(ifp, ia); } else { + boolean_t cloneok = FALSE; + /* + * Perform source interface selection; the source IP address + * must belong to one of the addresses of the interface used + * by the route. For performance reasons, do this only if + * there is no route, or if the routing table has changed, + * or if we haven't done source interface selection on this + * route (for this PCB instance) before. + */ + if (select_srcif && ip->ip_src.s_addr != INADDR_ANY && + (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) || + ro->ro_rt->generation_id != route_generation || + !(ro->ro_flags & ROF_SRCIF_SELECTED))) { + struct ifaddr *ifa; + + /* Find the source interface */ + ifa = in_selectsrcif(ip, ro, ifscope); + + /* + * If the source address is spoofed (in the case + * of IP_RAWOUTPUT), or if this is destined for + * local/loopback, just let it go out using the + * interface of the route. Otherwise, there's no + * interface having such an address, so bail out. + */ + if (ifa == NULL && !(flags & IP_RAWOUTPUT) && + ifscope != lo_ifp->if_index) { + error = EADDRNOTAVAIL; + goto bad; + } + + /* + * If the caller didn't explicitly specify the scope, + * pick it up from the source interface. If the cached + * route was wrong and was blown away as part of source + * interface selection, don't mask out RTF_PRCLONING + * since that route may have been allocated by the ULP, + * unless the IP header was created by the caller or + * the destination is IPv4 LLA. The check for the + * latter is needed because IPv4 LLAs are never scoped + * in the current implementation, and we don't want to + * replace the resolved IPv4 LLA route with one whose + * gateway points to that of the default gateway on + * the primary interface of the system. + */ + if (ifa != NULL) { + if (ifscope == IFSCOPE_NONE) + ifscope = ifa->ifa_ifp->if_index; + ifafree(ifa); + cloneok = (!(flags & IP_RAWOUTPUT) && + !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); + } + } + /* * If this is the case, we probably don't want to allocate * a protocol-cloned route since we didn't get one from the @@ -318,25 +575,71 @@ ip_output(m0, opt, ro, flags, imo) * the link layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ - if (ro->ro_rt == 0) - rtalloc_ign(ro, RTF_PRCLONING); - if (ro->ro_rt == 0) { - ipstat.ips_noroute++; + if (ro->ro_rt == NULL) { + unsigned long ign = RTF_PRCLONING; + /* + * We make an exception here: if the destination + * address is INADDR_BROADCAST, allocate a protocol- + * cloned host route so that we end up with a route + * marked with the RTF_BROADCAST flag. Otherwise, + * we would end up referring to the default route, + * instead of creating a cloned host route entry. + * That would introduce inconsistencies between ULPs + * that allocate a route and those that don't. The + * RTF_BROADCAST route is important since we'd want + * to send out undirected IP broadcast packets using + * link-level broadcast address. Another exception + * is for ULP-created routes that got blown away by + * source interface selection (see above). + * + * These exceptions will no longer be necessary when + * the RTF_PRCLONING scheme is no longer present. + */ + if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) + ign &= ~RTF_PRCLONING; + + /* + * Loosen the route lookup criteria if the ifscope + * corresponds to the loopback interface; this is + * needed to support Application Layer Gateways + * listening on loopback, in conjunction with packet + * filter redirection rules. The final source IP + * address will be rewritten by the packet filter + * prior to the RFC1122 loopback check below. + */ + if (ifscope == lo_ifp->if_index) + rtalloc_ign(ro, ign); + else + rtalloc_scoped_ign(ro, ign, ifscope); + } + + if (ro->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); error = EHOSTUNREACH; goto bad; } + + if (ia) + ifafree(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia) + ifaref(&ia->ia_ifa); ifp = ro->ro_rt->rt_ifp; - dl_tag = ro->ro_rt->rt_dlt; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; - if (ro->ro_rt->rt_flags & RTF_HOST) + if (ro->ro_rt->rt_flags & RTF_HOST) { isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); - else + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + RT_UNLOCK(ro->ro_rt); } - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + + if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; m->m_flags |= M_MCAST; @@ -350,20 +653,24 @@ ip_output(m0, opt, ro, flags, imo) * See if the caller provided any multicast options */ if (imo != NULL) { - ip->ip_ttl = imo->imo_multicast_ttl; - if (imo->imo_multicast_ifp != NULL) + if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl; + if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; - if (imo->imo_multicast_vif != -1) + } +#if MROUTING + if (imo->imo_multicast_vif != -1 && + ((flags & IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY)) ip->ip_src.s_addr = - ip_mcast_src(imo->imo_multicast_vif); + ip_mcast_src(imo->imo_multicast_vif); +#endif /* MROUTING */ } else - ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { - ipstat.ips_noroute++; + OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; goto bad; } @@ -373,17 +680,23 @@ ip_output(m0, opt, ro, flags, imo) * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { - register struct in_ifaddr *ia1; - - for (ia1 = in_ifaddrhead.tqh_first; ia1; - ia1 = ia1->ia_link.tqe_next) + struct in_ifaddr *ia1; + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; break; } + lck_rw_done(in_ifaddr_rwlock); + if (ip->ip_src.s_addr == INADDR_ANY) { + error = ENETUNREACH; + goto bad; + } } - IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); + ifnet_lock_shared(ifp); + IN_LOOKUP_MULTI(pkt_dst, ifp, inm); + ifnet_lock_done(ifp); if (inm != NULL && (imo == NULL || imo->imo_multicast_loop)) { /* @@ -391,8 +704,59 @@ ip_output(m0, opt, ro, flags, imo) * on the outgoing interface, and the caller did not * forbid loopback, loop back a copy. */ + if (!TAILQ_EMPTY(&ipv4_filters)) { + struct ipfilter *filter; + int seen = (inject_filter_ref == 0); + struct ipf_pktopts *ippo = 0, ipf_pktopts; + + if (imo) { + ippo = &ipf_pktopts; + ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp; + ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl; + ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop; + } + + ipf_ref(); + + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (seen == 0) { + if ((struct ipfilter *)inject_filter_ref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_output) { + errno_t result; + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); + if (result == EJUSTRETURN) { + ipf_unref(); + goto done; + } + if (result != 0) { + ipf_unref(); + goto bad; + } + } + } + + /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); +#endif + + ipf_unref(); + didfilter = 1; + } ip_mloopback(ifp, m, dst, hlen); } +#if MROUTING else { /* * If we are acting as a multicast router, perform @@ -421,6 +785,7 @@ ip_output(m0, opt, ro, flags, imo) } } } +#endif /* MROUTING */ /* * Multicasts with a time-to-live of zero may be looped- @@ -454,15 +819,6 @@ ip_output(m0, opt, ro, flags, imo) #endif /* IPFIREWALL_FORWARD */ } #endif /* notdef */ - /* - * Verify that we have any chance at all of being able to queue - * the packet or packet fragments - */ - if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= - ifp->if_snd.ifq_maxlen) { - error = ENOBUFS; - goto bad; - } /* * Look for broadcast address and @@ -489,248 +845,136 @@ ip_output(m0, opt, ro, flags, imo) } sendit: - /* - * IpHack's section. - * - Xlate: translate packet's addr/port (NAT). - * - Firewall: deny/allow/etc. - * - Wrap: fake packet's addr/port - * - Encapsulate: put it in another IP and send out. - */ -#if COMPAT_IPFW - if (ip_nat_ptr && !(*ip_nat_ptr)(&ip, &m, ifp, IP_NAT_OUT)) { - error = EACCES; +#if PF + /* Invoke outbound packet filter */ + if (pf_af_hook(ifp, mppn, &m, AF_INET, FALSE) != 0) { + if (packetlist == m0) { + packetlist = m; + mppn = NULL; + } + if (m != NULL) { + m0 = m; + /* Next packet in the chain */ + goto loopit; + } else if (packetlist != NULL) { + /* No more packet; send down the chain */ + goto sendchain; + } + /* Nothing left; we're done */ goto done; } - - /* - * Check with the firewall... - */ - if (ip_fw_chk_ptr) { - struct sockaddr_in *old = dst; - - off = (*ip_fw_chk_ptr)(&ip, - hlen, ifp, &ip_divert_cookie, &m, &rule, &dst); - /* - * On return we must do the following: - * m == NULL -> drop the pkt - * 1<=off<= 0xffff -> DIVERT - * (off & 0x10000) -> send to a DUMMYNET pipe - * dst != old -> IPFIREWALL_FORWARD - * off==0, dst==old -> accept - * If some of the above modules is not compiled in, then - * we should't have to check the corresponding condition - * (because the ipfw control socket should not accept - * unsupported rules), but better play safe and drop - * packets in case of doubt. - */ - if (!m) { /* firewall said to reject */ - error = EACCES; - goto done; + m0 = m; + ip = mtod(m, struct ip *); + pkt_dst = ip->ip_dst; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#endif /* PF */ + /* + * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt + */ + if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { + ip_linklocal_stat.iplls_out_total++; + if (ip->ip_ttl != MAXTTL) { + ip_linklocal_stat.iplls_out_badttl++; + ip->ip_ttl = MAXTTL; } - if (off == 0 && dst == old) /* common case */ - goto pass ; -#if DUMMYNET - if (off & 0x10000) { - /* - * pass the pkt to dummynet. Need to include - * pipe number, m, ifp, ro, hlen because these are - * not recomputed in the next pass. - * All other parameters have been already used and - * so they are not needed anymore. - * XXX note: if the ifp or ro entry are deleted - * while a pkt is in dummynet, we are in trouble! - */ - dummynet_io(off & 0xffff, DN_TO_IP_OUT, m,ifp,ro,hlen,rule); - goto done; + } + + if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { + struct ipfilter *filter; + int seen = (inject_filter_ref == 0); + + /* Check that a TSO frame isn't passed to a filter. + * This could happen if a filter is inserted while + * TCP is sending the TSO packet. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; } -#endif -#if IPDIVERT - if (off > 0 && off < 0x10000) { /* Divert packet */ - - /* - * delayed checksums are not currently compatible - * with divert sockets. - */ - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - in_delayed_cksum(m); - if (m == NULL) - return(ENOMEM); - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - } - /* Restore packet header fields to original values */ - ip->ip_len = htons((u_short)ip->ip_len); - ip->ip_off = htons((u_short)ip->ip_off); + ipf_ref(); + + /* 4135317 - always pass network byte order to filter */ - ip_divert_port = off & 0xffff ; - (*ip_protox[IPPROTO_DIVERT]->pr_input)(m, 0); - goto done; +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (seen == 0) { + if ((struct ipfilter *)inject_filter_ref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_output) { + errno_t result; + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + if (result == EJUSTRETURN) { + ipf_unref(); + goto done; + } + if (result != 0) { + ipf_unref(); + goto bad; + } + } } + + /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); #endif -#if IPFIREWALL_FORWARD - /* Here we check dst to make sure it's directly reachable on the - * interface we previously thought it was. - * If it isn't (which may be likely in some situations) we have - * to re-route it (ie, find a route for the next-hop and the - * associated interface) and set them here. This is nested - * forwarding which in most cases is undesirable, except where - * such control is nigh impossible. So we do it here. - * And I'm babbling. + ipf_unref(); + } + +#if IPSEC + /* temporary for testing only: bypass ipsec alltogether */ + + if (ipsec_bypass != 0 || (flags & IP_NOIPSEC) != 0) + goto skip_ipsec; + + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); + + + /* get SP for this packet */ + if (so == NULL) + sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); + else + sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); + + if (sp == NULL) { + IPSEC_STAT_INCREMENT(ipsecstat.out_inval); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); + goto bad; + } + + error = 0; + + /* check policy */ + switch (sp->policy) { + case IPSEC_POLICY_DISCARD: + case IPSEC_POLICY_GENERATE: + /* + * This packet is just discarded. */ - if (off == 0 && old != dst) { - struct in_ifaddr *ia; - - /* It's changed... */ - /* There must be a better way to do this next line... */ - static struct route sro_fwd, *ro_fwd = &sro_fwd; -#if IPFIREWALL_FORWARD_DEBUG - printf("IPFIREWALL_FORWARD: New dst ip: "); - print_ip(dst->sin_addr); - printf("\n"); -#endif - /* - * We need to figure out if we have been forwarded - * to a local socket. If so then we should somehow - * "loop back" to ip_input, and get directed to the - * PCB as if we had received this packet. This is - * because it may be dificult to identify the packets - * you want to forward until they are being output - * and have selected an interface. (e.g. locally - * initiated packets) If we used the loopback inteface, - * we would not be able to control what happens - * as the packet runs through ip_input() as - * it is done through a ISR. - */ - for (ia = TAILQ_FIRST(&in_ifaddrhead); ia; - ia = TAILQ_NEXT(ia, ia_link)) { - /* - * If the addr to forward to is one - * of ours, we pretend to - * be the destination for this packet. - */ - if (IA_SIN(ia)->sin_addr.s_addr == - dst->sin_addr.s_addr) - break; - } - if (ia) { - /* tell ip_input "dont filter" */ - ip_fw_fwd_addr = dst; - if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = ifunit("lo0"); - - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - m->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m0->m_pkthdr.csum_data = 0xffff; - } - m->m_pkthdr.csum_flags |= - CSUM_IP_CHECKED | CSUM_IP_VALID; - ip->ip_len = htons((u_short)ip->ip_len); - ip->ip_off = htons((u_short)ip->ip_off); - - - ip_input(m); - goto done; - } - /* Some of the logic for this was - * nicked from above. - * - * This rewrites the cached route in a local PCB. - * Is this what we want to do? - */ - bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); - - ro_fwd->ro_rt = 0; - rtalloc_ign(ro_fwd, RTF_PRCLONING); - - if (ro_fwd->ro_rt == 0) { - ipstat.ips_noroute++; - error = EHOSTUNREACH; - goto bad; - } - - ia = ifatoia(ro_fwd->ro_rt->rt_ifa); - ifp = ro_fwd->ro_rt->rt_ifp; - dl_tag = ro->ro_rt->rt_dlt; - ro_fwd->ro_rt->rt_use++; - if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; - if (ro_fwd->ro_rt->rt_flags & RTF_HOST) - isbroadcast = - (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); - else - isbroadcast = in_broadcast(dst->sin_addr, ifp); - RTFREE(ro->ro_rt); - ro->ro_rt = ro_fwd->ro_rt; - dst = (struct sockaddr_in *)&ro_fwd->ro_dst; - - /* - * If we added a default src ip earlier, - * which would have been gotten from the-then - * interface, do it again, from the new one. - */ - if (fwd_rewrite_src) - ip->ip_src = IA_SIN(ia)->sin_addr; - goto pass ; - } -#endif /* IPFIREWALL_FORWARD */ - /* - * if we get here, none of the above matches, and - * we have to drop the pkt - */ - m_freem(m); - error = EACCES; /* not sure this is the right error msg */ - goto done; - } -#endif /* COMPAT_IPFW */ - -pass: - -#if defined(PM) - /* - * Processing IP filter/NAT. - * Return TRUE iff this packet is discarded. - * Return FALSE iff this packet is accepted. - */ - - if (doNatFil && pm_out(ro->ro_rt->rt_ifp, ip, m)) - goto done; -#endif - -#if IPSEC - /* get SP for this packet */ - if (so == NULL) - sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); - else - sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); - - if (sp == NULL) { - ipsecstat.out_inval++; - goto bad; - } - - error = 0; - - /* check policy */ - switch (sp->policy) { - case IPSEC_POLICY_DISCARD: - /* - * This packet is just discarded. - */ - ipsecstat.out_polvio++; - goto bad; + IPSEC_STAT_INCREMENT(ipsecstat.out_polvio); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0); + goto bad; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: /* no need to do IPsec. */ + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0); goto skip_ipsec; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) { - /* XXX should be panic ? */ - printf("ip_output: No IPsec request specified.\n"); - error = EINVAL; + /* acquire a policy */ + error = key_spdacquire(sp); + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0); goto bad; } break; @@ -739,8 +983,6 @@ pass: default: printf("ip_output: Invalid policy found. %d\n", sp->policy); } - - { struct ipsec_output_state state; bzero(&state, sizeof(state)); @@ -752,24 +994,27 @@ pass: state.ro = ro; state.dst = (struct sockaddr *)dst; - ip->ip_sum = 0; - - /* - * delayed checksums are not currently compatible with IPsec - */ - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - in_delayed_cksum(m); - if (m == NULL) - return(ENOMEM); - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - } + ip->ip_sum = 0; + + /* + * XXX + * delayed checksums are not currently compatible with IPsec + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } - ip->ip_len = htons((u_short)ip->ip_len); - ip->ip_off = htons((u_short)ip->ip_off); - error = ipsec4_output(&state, sp, flags); +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif - m = state.m; + error = ipsec4_output(&state, sp, flags); + + m0 = m = state.m; + if (flags & IP_ROUTETOIF) { /* * if we have tunnel mode SA, we may need to ignore @@ -781,6 +1026,7 @@ pass: } } else ro = state.ro; + dst = (struct sockaddr_in *)state.dst; if (error) { /* mbuf is already reclaimed in ipsec4_output. */ @@ -800,130 +1046,629 @@ pass: error = 0; break; } + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 4,0,0,0,0); goto bad; } } /* be sure to update variables that are affected by ipsec4_output() */ ip = mtod(m, struct ip *); + #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif + /* Check that there wasn't a route change and src is still valid */ + if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) { + if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL && + ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) { + error = EADDRNOTAVAIL; + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 5,0,0,0,0); + goto bad; + } + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + if (src_ia != NULL) + ifafree(&src_ia->ia_ifa); + } + if (ro->ro_rt == NULL) { if ((flags & IP_ROUTETOIF) == 0) { - printf("ip_output: " - "can't update route after IPsec processing\n"); + printf("ip_output: can't update route after " + "IPsec processing\n"); error = EHOSTUNREACH; /*XXX*/ + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, + 6,0,0,0,0); goto bad; } } else { - /* nobody uses ia beyond here */ + if (ia) + ifafree(&ia->ia_ifa); + RT_LOCK_SPIN(ro->ro_rt); + ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia) + ifaref(&ia->ia_ifa); ifp = ro->ro_rt->rt_ifp; + RT_UNLOCK(ro->ro_rt); } /* make it flipped, again. */ - ip->ip_len = ntohs((u_short)ip->ip_len); - ip->ip_off = ntohs((u_short)ip->ip_off); + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); +#endif + + KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff); + + /* Pass to filters again */ + if (!TAILQ_EMPTY(&ipv4_filters)) { + struct ipfilter *filter; + + /* Check that a TSO frame isn't passed to a filter. + * This could happen if a filter is inserted while + * TCP is sending the TSO packet. + */ + if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { + error = EMSGSIZE; + goto bad; + } + + ipf_ref(); + + /* 4135317 - always pass network byte order to filter */ + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (filter->ipf_filter.ipf_output) { + errno_t result; + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + if (result == EJUSTRETURN) { + ipf_unref(); + goto done; + } + if (result != 0) { + ipf_unref(); + goto bad; + } + } + } + + /* set back to host byte order */ + ip = mtod(m, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); +#endif + + ipf_unref(); + } skip_ipsec: #endif /*IPSEC*/ +#if IPFIREWALL + /* + * IpHack's section. + * - Xlate: translate packet's addr/port (NAT). + * - Firewall: deny/allow/etc. + * - Wrap: fake packet's addr/port + * - Encapsulate: put it in another IP and send out. + */ + if (fr_checkp) { + struct mbuf *m1 = m; - sw_csum = m->m_pkthdr.csum_flags | CSUM_IP; + if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) { + goto done; + } + ip = mtod(m0 = m = m1, struct ip *); + } + /* + * Check with the firewall... + * but not if we are already being fwd'd from a firewall. + */ + if (fw_enable && IPFW_LOADED && !args.next_hop) { + struct sockaddr_in *old = dst; - /* frames that can be checksumed by GMACE SUM16 HW: frame >64, no fragments, no UDP odd length */ + args.m = m; + args.next_hop = dst; + args.oif = ifp; + off = ip_fw_chk_ptr(&args); + m = args.m; + dst = args.next_hop; - if (apple_hwcksum_tx && (sw_csum & CSUM_DELAY_DATA) && (ifp->if_hwassist & CSUM_TCP_SUM16) - && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu) - && !((ip->ip_len & 0x1) && (sw_csum & CSUM_UDP)) ) { + /* + * On return we must do the following: + * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new) + * 1<=off<= 0xffff -> DIVERT + * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe + * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet + * dst != old -> IPFIREWALL_FORWARD + * off==0, dst==old -> accept + * If some of the above modules is not compiled in, then + * we should't have to check the corresponding condition + * (because the ipfw control socket should not accept + * unsupported rules), but better play safe and drop + * packets in case of doubt. + */ + m0 = m; + if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) { + if (m) + m_freem(m); + error = EACCES ; + goto done ; + } + ip = mtod(m, struct ip *); + + if (off == 0 && dst == old) {/* common case */ + goto pass ; + } +#if DUMMYNET + if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! + */ + args.ro = ro; + args.dst = dst; + args.flags = flags; + if (flags & IP_OUTARGS) + args.ipoa = ipoa; + + error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, + &args); + goto done; + } +#endif /* DUMMYNET */ +#if IPDIVERT + if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { + struct mbuf *clone = NULL; - /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */ - u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */ - u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF; - m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ - m->m_pkthdr.csum_data = (csumprev + offset) << 16 ; - m->m_pkthdr.csum_data += offset; - sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ - } - else { - if (ifp->if_hwassist & CSUM_TCP_SUM16) /* force SW checksuming */ - m->m_pkthdr.csum_flags = 0; - else { /* not Apple enet */ - m->m_pkthdr.csum_flags = sw_csum & ifp->if_hwassist; - sw_csum &= ~ifp->if_hwassist; + /* Clone packet if we're doing a 'tee' */ + if ((off & IP_FW_PORT_TEE_FLAG) != 0) + clone = m_dup(m, M_DONTWAIT); + /* + * XXX + * delayed checksums are not currently compatible + * with divert sockets. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + /* Restore packet header fields to original values */ + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + /* Deliver packet to divert input routine */ + divert_packet(m, 0, off & 0xffff, args.divert_rule); + + /* If 'tee', continue with original packet */ + if (clone != NULL) { + m0 = m = clone; + ip = mtod(m, struct ip *); + goto pass; + } + goto done; } +#endif + +#if IPFIREWALL_FORWARD + /* Here we check dst to make sure it's directly reachable on the + * interface we previously thought it was. + * If it isn't (which may be likely in some situations) we have + * to re-route it (ie, find a route for the next-hop and the + * associated interface) and set them here. This is nested + * forwarding which in most cases is undesirable, except where + * such control is nigh impossible. So we do it here. + * And I'm babbling. + */ + if (off == 0 && old != dst) { + struct in_ifaddr *ia_fw; + + /* It's changed... */ + /* There must be a better way to do this next line... */ + static struct route sro_fwd, *ro_fwd = &sro_fwd; +#if IPFIREWALL_FORWARD_DEBUG + printf("IPFIREWALL_FORWARD: New dst ip: "); + print_ip(dst->sin_addr); + printf("\n"); +#endif + /* + * We need to figure out if we have been forwarded + * to a local socket. If so then we should somehow + * "loop back" to ip_input, and get directed to the + * PCB as if we had received this packet. This is + * because it may be dificult to identify the packets + * you want to forward until they are being output + * and have selected an interface. (e.g. locally + * initiated packets) If we used the loopback inteface, + * we would not be able to control what happens + * as the packet runs through ip_input() as + * it is done through a ISR. + */ + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) { + /* + * If the addr to forward to is one + * of ours, we pretend to + * be the destination for this packet. + */ + if (IA_SIN(ia_fw)->sin_addr.s_addr == + dst->sin_addr.s_addr) + break; + } + lck_rw_done(in_ifaddr_rwlock); + if (ia_fw) { + /* tell ip_input "dont filter" */ + struct m_tag *fwd_tag; + struct ip_fwd_tag *ipfwd_tag; + + fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, + sizeof (*ipfwd_tag), M_NOWAIT); + if (fwd_tag == NULL) { + error = ENOBUFS; + goto bad; + } + + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); + ipfwd_tag->next_hop = args.next_hop; + + m_tag_prepend(m, fwd_tag); - if (sw_csum & CSUM_DELAY_DATA) { /* perform TCP/UDP checksuming now */ - in_delayed_cksum(m); - if (m == NULL) - return(ENOMEM); - sw_csum &= ~CSUM_DELAY_DATA; + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = ifunit("lo0"); + if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & + m->m_pkthdr.csum_flags) == 0) { + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; + } + else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + ip->ip_sum = in_cksum(m, hlen); + } + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + /* we need to call dlil_output to run filters + * and resync to avoid recursion loops. + */ + if (lo_ifp) { + dlil_output(lo_ifp, PF_INET, m, 0, (struct sockaddr *)dst, 0); + } + else { + printf("ip_output: no loopback ifp for forwarding!!!\n"); + } + goto done; + } + /* Some of the logic for this was + * nicked from above. + * + * This rewrites the cached route in a local PCB. + * Is this what we want to do? + */ + bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); + + ro_fwd->ro_rt = NULL; + rtalloc_ign(ro_fwd, RTF_PRCLONING); + + if (ro_fwd->ro_rt == NULL) { + OSAddAtomic(1, &ipstat.ips_noroute); + error = EHOSTUNREACH; + goto bad; + } + + RT_LOCK_SPIN(ro_fwd->ro_rt); + ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); + if (ia_fw != NULL) + ifaref(&ia_fw->ia_ifa); + ifp = ro_fwd->ro_rt->rt_ifp; + ro_fwd->ro_rt->rt_use++; + if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; + if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { + isbroadcast = + (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); + } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + RT_UNLOCK(ro_fwd->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = ro_fwd->ro_rt; + dst = (struct sockaddr_in *)&ro_fwd->ro_dst; + + /* + * If we added a default src ip earlier, + * which would have been gotten from the-then + * interface, do it again, from the new one. + */ + if (ia_fw != NULL) { + if (fwd_rewrite_src) + ip->ip_src = IA_SIN(ia_fw)->sin_addr; + ifafree(&ia_fw->ia_ifa); + } + goto pass ; } - } +#endif /* IPFIREWALL_FORWARD */ + /* + * if we get here, none of the above matches, and + * we have to drop the pkt + */ + m_freem(m); + error = EACCES; /* not sure this is the right error msg */ + goto done; + } +#endif /* IPFIREWALL */ + +pass: +#if __APPLE__ + /* Do not allow loopback address to wind up on a wire */ + if ((ifp->if_flags & IFF_LOOPBACK) == 0 && + ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { + OSAddAtomic(1, &ipstat.ips_badaddr); + m_freem(m); + /* + * Do not simply drop the packet just like a firewall -- we want the + * the application to feel the pain. + * Return ENETUNREACH like ip6_output does in some similar cases. + * This can startle the otherwise clueless process that specifies + * loopback as the source address. + */ + error = ENETUNREACH; + goto done; + } +#endif + m->m_pkthdr.csum_flags |= CSUM_IP; + tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4); + + sw_csum = m->m_pkthdr.csum_flags + & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + + if ((ifp->if_hwassist & CSUM_TCP_SUM16) != 0) { + /* + * Special case code for GMACE + * frames that can be checksumed by GMACE SUM16 HW: + * frame >64, no fragments, no UDP + */ + if (apple_hwcksum_tx && (m->m_pkthdr.csum_flags & CSUM_TCP) + && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu)) { + /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */ + u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */ + u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF; + m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */ + m->m_pkthdr.csum_data = (csumprev + offset) << 16 ; + m->m_pkthdr.csum_data += offset; + sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */ + } + else { + /* let the software handle any UDP or TCP checksums */ + sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); + } + } else if (apple_hwcksum_tx == 0) { + sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) & + m->m_pkthdr.csum_flags; + } + + if (sw_csum & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + sw_csum &= ~CSUM_DELAY_DATA; + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + if (apple_hwcksum_tx != 0) { + m->m_pkthdr.csum_flags &= + IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + } else { + m->m_pkthdr.csum_flags = 0; + } /* - * If small enough for interface, or the interface will take + * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ - if ((u_short)ip->ip_len <= ifp->if_mtu || - ifp->if_hwassist & CSUM_FRAGMENT) { + if ((u_short)ip->ip_len <= ifp->if_mtu || tso || + ifp->if_hwassist & CSUM_FRAGMENT) { + if (tso) + m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; + - ip->ip_len = htons((u_short)ip->ip_len); - ip->ip_off = htons((u_short)ip->ip_off); +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); - error = dlil_output(dl_tag, m, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); - goto done; + } + +#ifndef __APPLE__ + /* Record statistics for this interface address. */ + if (!(flags & IP_FORWARDING) && ia != NULL) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } +#endif + +#if IPSEC + /* clean ipsec history once it goes out of the node */ + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) + ipsec_delaux(m); +#endif + if (packetchain == 0) { + error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst); + goto done; + } + else { /* packet chaining allows us to reuse the route for all packets */ + mppn = &m->m_nextpkt; + m = m->m_nextpkt; + if (m == NULL) { +#if PF +sendchain: +#endif /* PF */ + if (pktcnt > ip_maxchainsent) + ip_maxchainsent = pktcnt; + //send + error = ifnet_output(ifp, PF_INET, packetlist, + ro->ro_rt, (struct sockaddr *)dst); + pktcnt = 0; + goto done; + + } + m0 = m; + pktcnt++; + goto loopit; + } } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ - if (ip->ip_off & IP_DF) { + + if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU + * * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. */ - if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + RT_LOCK_SPIN(ro->ro_rt); + if (ro->ro_rt && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } - ipstat.ips_cantfrag++; + RT_UNLOCK(ro->ro_rt); + OSAddAtomic(1, &ipstat.ips_cantfrag); goto bad; } - len = (ifp->if_mtu - hlen) &~ 7; - if (len < 8) { - error = EMSGSIZE; + + error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum); + if (error != 0) { + m0 = m = NULL; goto bad; } - /* - * if the interface will not calculate checksums on - * fragmented packets, then do it here. - */ - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && - (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { - in_delayed_cksum(m); - if (m == NULL) - return(ENOMEM); - m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - } + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; +#if IPSEC + /* clean ipsec history once it goes out of the node */ + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) + ipsec_delaux(m); +#endif + if (error == 0) { +#ifndef __APPLE__ + /* Record statistics for this interface address. */ + if (ia != NULL) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } +#endif + if ((packetchain != 0) && (pktcnt > 0)) + panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); + error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, + (struct sockaddr *)dst); + } else + m_freem(m); + } + if (error == 0) + OSAddAtomic(1, &ipstat.ips_fragmented); - { - int mhlen, firstlen = len; - struct mbuf **mnext = &m->m_nextpkt; - int nfrags = 1; - +done: + if (ia) { + ifafree(&ia->ia_ifa); + ia = NULL; + } +#if IPSEC + if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) { + if (ro == &iproute && ro->ro_rt) { + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + if (sp != NULL) { + KEYDEBUG(KEYDEBUG_IPSEC_STAMP, + printf("DP ip_output call free SP:%x\n", sp)); + key_freesp(sp, KEY_SADB_UNLOCKED); + } + } +#endif /* IPSEC */ + + KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); + return (error); +bad: + m_freem(m0); + goto done; +} + +int +ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) +{ + struct ip *ip, *mhip; + int len, hlen, mhlen, firstlen, off, error = 0; + struct mbuf **mnext = &m->m_nextpkt, *m0; + int nfrags = 1; + + ip = mtod(m, struct ip *); +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + + firstlen = len = (mtu - hlen) &~ 7; + if (len < 8) { + m_freem(m); + return (EMSGSIZE); + } + + /* + * if the interface will not calculate checksums on + * fragmented packets, then do it here. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && + (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } /* * Loop through length of segment after first fragment, @@ -932,10 +1677,10 @@ skip_ipsec: m0 = m; mhlen = sizeof (struct ip); for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { - MGETHDR(m, M_DONTWAIT, MT_HEADER); + MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (m == 0) { error = ENOBUFS; - ipstat.ips_odropped++; + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; @@ -959,24 +1704,34 @@ skip_ipsec: if (m->m_next == 0) { (void) m_free(m); error = ENOBUFS; /* ??? */ - ipstat.ips_odropped++; + OSAddAtomic(1, &ipstat.ips_odropped); goto sendorfree; } m->m_pkthdr.len = mhlen + len; - m->m_pkthdr.rcvif = (struct ifnet *)0; - m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; - mhip->ip_off = htons((u_short)mhip->ip_off); + m->m_pkthdr.rcvif = 0; + m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; + m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id; +#if CONFIG_MACF_NET + mac_netinet_fragment(m0, m); +#endif + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(mhip->ip_off); +#endif + mhip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (sw_csum & CSUM_DELAY_IP) { mhip->ip_sum = in_cksum(m, mhlen); + } *mnext = m; mnext = &m->m_nextpkt; nfrags++; } - ipstat.ips_ofragments += nfrags; + OSAddAtomic(nfrags, &ipstat.ips_ofragments); /* set first/last markers for fragment chain */ - m0->m_flags |= M_FRAG; + m->m_flags |= M_LASTFRAG; + m0->m_flags |= M_FIRSTFRAG | M_FRAG; m0->m_pkthdr.csum_data = nfrags; /* @@ -987,84 +1742,228 @@ skip_ipsec: m_adj(m, hlen + firstlen - (u_short)ip->ip_len); m->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m->m_pkthdr.len); - ip->ip_off = htons((u_short)(ip->ip_off | IP_MF)); + ip->ip_off |= IP_MF; + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (sw_csum & CSUM_DELAY_IP) { ip->ip_sum = in_cksum(m, hlen); - + } sendorfree: + if (error) + m_freem_list(m0); - KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, - ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + return (error); +} - for (m = m0; m; m = m0) { - m0 = m->m_nextpkt; - m->m_nextpkt = 0; - if (error == 0) - error = dlil_output(dl_tag, m, (void *) ro->ro_rt, - (struct sockaddr *)dst, 0); - else - m_freem(m); +static void +ip_out_cksum_stats(int proto, u_int32_t len) +{ + switch (proto) { + case IPPROTO_TCP: + tcp_out_cksum_stats(len); + break; + case IPPROTO_UDP: + udp_out_cksum_stats(len); + break; + default: + /* keep only TCP or UDP stats for now */ + break; } +} - if (error == 0) - ipstat.ips_fragmented++; - } -done: -#if IPSEC - if (ro == &iproute && ro->ro_rt) { - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; +void +in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) +{ + struct ip *ip; + unsigned char buf[sizeof(struct ip)]; + u_short csum, offset, ip_len; + struct mbuf *m = m0; + + while (ip_offset >= m->m_len) { + ip_offset -= m->m_len; + m = m->m_next; + if (m == NULL) { + printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n"); + return; + } } - if (sp != NULL) { - KEYDEBUG(KEYDEBUG_IPSEC_STAMP, - printf("DP ip_output call free SP:%x\n", sp)); - key_freesp(sp); + + /* Sometimes the IP header is not contiguous, yes this can happen! */ + if (ip_offset + sizeof(struct ip) > m->m_len) { +#if DEBUG + printf("delayed m_pullup, m->len: %d off: %d\n", + m->m_len, ip_offset); +#endif + m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); + + ip = (struct ip *)buf; + } else { + ip = (struct ip*)(m->m_data + ip_offset); } -#endif /* IPSEC */ + + /* Gross */ + if (ip_offset) { + m->m_len -= ip_offset; + m->m_data += ip_offset; + } + + offset = IP_VHL_HL(ip->ip_vhl) << 2 ; - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0); - return (error); -bad: - m_freem(m0); - goto done; -} + /* + * We could be in the context of an IP or interface filter; in the + * former case, ip_len would be in host (correct) order while for + * the latter it would be in network order. Because of this, we + * attempt to interpret the length field by comparing it against + * the actual packet length. If the comparison fails, byte swap + * the length and check again. If it still fails, then the packet + * is bogus and we give up. + */ + ip_len = ip->ip_len; + if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + ip_len = SWAP16(ip_len); + if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + printf("in_delayed_cksum_offset: ip_len %d (%d) " + "doesn't match actual length %d\n", ip->ip_len, + ip_len, (m0->m_pkthdr.len - ip_offset)); + return; + } + } + + csum = in_cksum_skip(m, ip_len, offset); + + /* Update stats */ + ip_out_cksum_stats(ip->ip_p, ip_len - offset); + + if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) + csum = 0xffff; + offset += m0->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */ + + /* Gross */ + if (ip_offset) { + if (M_LEADINGSPACE(m) < ip_offset) + panic("in_delayed_cksum_offset - chain modified!\n"); + m->m_len += ip_offset; + m->m_data -= ip_offset; + } + + if (offset > ip_len) /* bogus offset */ + return; -extern u_short in_chksum_skip(struct mbuf *, int, int); + /* Insert the checksum in the existing chain */ + if (offset + ip_offset + sizeof(u_short) > m->m_len) { + char tmp[2]; + +#if DEBUG + printf("delayed m_copyback, m->len: %d off: %d p: %d\n", + m->m_len, offset + ip_offset, ip->ip_p); +#endif + *(u_short *)tmp = csum; + m_copyback(m, offset + ip_offset, 2, tmp); + } else + *(u_short *)(m->m_data + offset + ip_offset) = csum; +} void in_delayed_cksum(struct mbuf *m) { - struct ip *ip; - u_short csum, csum2, offset; + in_delayed_cksum_offset(m, 0); +} - ip = mtod(m, struct ip *); - offset = IP_VHL_HL(ip->ip_vhl) << 2 ; +void +in_cksum_offset(struct mbuf* m, size_t ip_offset) +{ + struct ip* ip = NULL; + int hlen = 0; + unsigned char buf[sizeof(struct ip)]; + int swapped = 0; + + while (ip_offset >= m->m_len) { + ip_offset -= m->m_len; + m = m->m_next; + if (m == NULL) { + printf("in_cksum_offset failed - ip_offset wasn't in the packet\n"); + return; + } + } + + /* Sometimes the IP header is not contiguous, yes this can happen! */ + if (ip_offset + sizeof(struct ip) > m->m_len) { - csum = in_cksum_skip(m, ip->ip_len, offset); +#if DEBUG + printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %lu\n", + m->m_len, ip_offset); +#endif + m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf); - if ((m->m_pkthdr.csum_flags & CSUM_UDP) && csum == 0) - csum = 0xffff; + ip = (struct ip *)buf; + ip->ip_sum = 0; + m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, (caddr_t)&ip->ip_sum); + } else { + ip = (struct ip*)(m->m_data + ip_offset); + ip->ip_sum = 0; + } + + /* Gross */ + if (ip_offset) { + m->m_len -= ip_offset; + m->m_data += ip_offset; + } - offset += m->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */ +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + /* + * We could be in the context of an IP or interface filter; in the + * former case, ip_len would be in host order while for the latter + * it would be in network (correct) order. Because of this, we + * attempt to interpret the length field by comparing it against + * the actual packet length. If the comparison fails, byte swap + * the length and check again. If it still fails, then the packet + * is bogus and we give up. + */ + if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + ip->ip_len = SWAP16(ip->ip_len); + swapped = 1; + if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + ip->ip_len = SWAP16(ip->ip_len); + printf("in_cksum_offset: ip_len %d (%d) " + "doesn't match actual length %lu\n", + ip->ip_len, SWAP16(ip->ip_len), + (m->m_pkthdr.len - ip_offset)); + return; + } + } - if (offset > ip->ip_len) /* bogus offset */ - return; + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, hlen); + if (swapped) + ip->ip_len = SWAP16(ip->ip_len); + + /* Gross */ + if (ip_offset) { + if (M_LEADINGSPACE(m) < ip_offset) + panic("in_cksum_offset - chain modified!\n"); + m->m_len += ip_offset; + m->m_data -= ip_offset; + } - if (offset + sizeof(u_short) > m->m_len) { - printf("delayed m_pullup, m->len: %d off: %d p: %d\n", - m->m_len, offset, ip->ip_p); - /* - * XXX - * this shouldn't happen, but if it does, the - * correct behavior may be to insert the checksum - * in the existing chain instead of rearranging it. - */ - if (m = m_pullup(m, offset + sizeof(u_short)) == 0) - return; - } + /* Insert the checksum in the existing chain if IP header not contiguous */ + if (ip_offset + sizeof(struct ip) > m->m_len) { + char tmp[2]; - *(u_short *)(m->m_data + offset) = csum; +#if DEBUG + printf("in_cksum_offset m_copyback, m->len: %u off: %lu p: %d\n", + m->m_len, ip_offset + offsetof(struct ip, ip_sum), ip->ip_p); +#endif + *(u_short *)tmp = ip->ip_sum; + m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp); + } } /* @@ -1091,9 +1990,13 @@ ip_insertoptions(m, opt, phlen) if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { - MGETHDR(n, M_DONTWAIT, MT_HEADER); + MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (n == 0) return (m); + n->m_pkthdr.rcvif = 0; +#if CONFIG_MACF_NET + mac_mbuf_label_copy(m, n); +#endif n->m_pkthdr.len = m->m_pkthdr.len + optlen; m->m_len -= sizeof(struct ip); m->m_data += sizeof(struct ip); @@ -1139,8 +2042,16 @@ ip_optcopy(ip, jp) *dp++ = IPOPT_NOP; optlen = 1; continue; - } else - optlen = cp[IPOPT_OLEN]; + } +#if DIAGNOSTIC + if (cnt < IPOPT_OLEN + sizeof(*cp)) + panic("malformed IPv4 option passed to ip_optcopy"); +#endif + optlen = cp[IPOPT_OLEN]; +#if DIAGNOSTIC + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + panic("malformed IPv4 option passed to ip_optcopy"); +#endif /* bogus lengths should have been caught by ip_dooptions */ if (optlen > cnt) optlen = cnt; @@ -1183,7 +2094,8 @@ ip_ctloutput(so, sopt) error = EMSGSIZE; break; } - MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_HEADER); + MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT, + MT_HEADER); if (m == 0) { error = ENOBUFS; break; @@ -1204,7 +2116,10 @@ ip_ctloutput(so, sopt) case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: + case IP_RECVTTL: +#if defined(NFAITH) && NFAITH > 0 case IP_FAITH: +#endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1240,13 +2155,82 @@ ip_ctloutput(so, sopt) OPTSET(INP_RECVIF); break; + case IP_RECVTTL: + OPTSET(INP_RECVTTL); + break; + +#if defined(NFAITH) && NFAITH > 0 case IP_FAITH: OPTSET(INP_FAITH); break; +#endif } break; #undef OPTSET +#if CONFIG_FORCE_OUT_IFP + /* + * Apple private interface, similar to IP_BOUND_IF, except + * that the parameter is a NULL-terminated string containing + * the name of the network interface; an emptry string means + * unbind. Applications are encouraged to use IP_BOUND_IF + * instead, as that is the current "official" API. + */ + case IP_FORCE_OUT_IFP: { + char ifname[IFNAMSIZ]; + unsigned int ifscope; + + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + /* Verify interface name parameter is sane */ + if (sopt->sopt_valsize > sizeof(ifname)) { + error = EINVAL; + break; + } + + /* Copy the interface name */ + if (sopt->sopt_valsize != 0) { + error = sooptcopyin(sopt, ifname, + sizeof (ifname), sopt->sopt_valsize); + if (error) + break; + } + + if (sopt->sopt_valsize == 0 || ifname[0] == NULL) { + /* Unbind this socket from any interface */ + ifscope = IFSCOPE_NONE; + } else { + ifnet_t ifp; + + /* Verify name is NULL terminated */ + if (ifname[sopt->sopt_valsize - 1] != NULL) { + error = EINVAL; + break; + } + + /* Bail out if given bogus interface name */ + if (ifnet_find_by_name(ifname, &ifp) != 0) { + error = ENXIO; + break; + } + + /* Bind this socket to this interface */ + ifscope = ifp->if_index; + + /* + * Won't actually free; since we don't release + * this later, we should do it now. + */ + ifnet_release(ifp); + } + ip_bindif(inp, ifscope); + } + break; +#endif case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: @@ -1293,13 +2277,15 @@ ip_ctloutput(so, sopt) struct mbuf *m; int optname; - if (error = sooptgetm(sopt, &m)) /* XXX */ + if (sopt->sopt_valsize > MCLBYTES) { + error = EMSGSIZE; + break; + } + if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; - if (error = sooptmcopyin(sopt, m)) /* XXX */ + if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; - priv = (sopt->sopt_p != NULL && - suser(sopt->sopt_p->p_ucred, - &sopt->sopt_p->p_acflag) != 0) ? 0 : 1; + priv = (proc_suser(sopt->sopt_p) == 0); if (m) { req = mtod(m, caddr_t); len = m->m_len; @@ -1311,6 +2297,62 @@ ip_ctloutput(so, sopt) } #endif /*IPSEC*/ +#if TRAFFIC_MGT + case IP_TRAFFIC_MGT_BACKGROUND: + { + unsigned background = 0; + error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background)); + if (error) + break; + + if (background) { + socket_set_traffic_mgt_flags(so, + TRAFFIC_MGT_SO_BACKGROUND | + TRAFFIC_MGT_SO_BG_REGULATE); + } else { + socket_clear_traffic_mgt_flags(so, + TRAFFIC_MGT_SO_BACKGROUND | + TRAFFIC_MGT_SO_BG_REGULATE); + } + + break; + } +#endif /* TRAFFIC_MGT */ + + /* + * On a multihomed system, scoped routing can be used to + * restrict the source interface used for sending packets. + * The socket option IP_BOUND_IF binds a particular AF_INET + * socket to an interface such that data sent on the socket + * is restricted to that interface. This is unlike the + * SO_DONTROUTE option where the routing table is bypassed; + * therefore it allows for a greater flexibility and control + * over the system behavior, and does not place any restriction + * on the destination address type (e.g. unicast, multicast, + * or broadcast if applicable) or whether or not the host is + * directly reachable. Note that in the multicast transmit + * case, IP_MULTICAST_IF takes precedence over IP_BOUND_IF, + * since the former practically bypasses the routing table; + * in this case, IP_BOUND_IF sets the default interface used + * for sending multicast packets in the absence of an explicit + * transmit interface set via IP_MULTICAST_IF. + */ + case IP_BOUND_IF: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + ip_bindif(inp, optval); + break; + default: error = ENOPROTOOPT; break; @@ -1336,8 +2378,11 @@ ip_ctloutput(so, sopt) case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVIF: + case IP_RECVTTL: case IP_PORTRANGE: +#if defined(NFAITH) && NFAITH > 0 case IP_FAITH: +#endif switch (sopt->sopt_name) { case IP_TOS: @@ -1366,6 +2411,10 @@ ip_ctloutput(so, sopt) optval = OPTBIT(INP_RECVIF); break; + case IP_RECVTTL: + optval = OPTBIT(INP_RECVTTL); + break; + case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; @@ -1375,9 +2424,11 @@ ip_ctloutput(so, sopt) optval = 0; break; +#if defined(NFAITH) && NFAITH > 0 case IP_FAITH: optval = OPTBIT(INP_FAITH); break; +#endif } error = sooptcopyout(sopt, &optval, sizeof optval); break; @@ -1395,29 +2446,37 @@ ip_ctloutput(so, sopt) case IP_IPSEC_POLICY: { struct mbuf *m = NULL; - size_t len = 0; caddr_t req = NULL; + size_t len = 0; - if (error = sooptgetm(sopt, &m)) /* XXX */ - break; - if (error = sooptmcopyin(sopt, m)) /* XXX */ - break; - if (m) { + if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } - error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) - error = sooptmcopyout(sopt, m); /* XXX */ - - /* if error, m_freem called at soopt_mcopyout(). */ + error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /*IPSEC*/ +#if TRAFFIC_MGT + case IP_TRAFFIC_MGT_BACKGROUND: + { + unsigned background = so->so_traffic_mgt_flags; + return (sooptcopyout(sopt, &background, sizeof(background))); + break; + } +#endif /* TRAFFIC_MGT */ + + case IP_BOUND_IF: + if (inp->inp_flags & INP_BOUND_IF) + optval = inp->inp_boundif; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -1433,10 +2492,10 @@ ip_ctloutput(so, sopt) * with destination address if source routed. */ static int -ip_pcbopts(optname, pcbopt, m) - int optname; - struct mbuf **pcbopt; - register struct mbuf *m; +ip_pcbopts( + __unused int optname, + struct mbuf **pcbopt, + register struct mbuf *m) { register int cnt, optlen; register u_char *cp; @@ -1539,6 +2598,37 @@ bad: * transmission, and one (IP_MULTICAST_TTL) totally duplicates a * standard option (IP_TTL). */ + +/* + * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. + */ +static struct ifnet * +ip_multicast_if(a, ifindexp) + struct in_addr *a; + int *ifindexp; +{ + int ifindex; + struct ifnet *ifp; + + if (ifindexp) + *ifindexp = 0; + if (ntohl(a->s_addr) >> 24 == 0) { + ifindex = ntohl(a->s_addr) & 0xffffff; + ifnet_head_lock_shared(); + if (ifindex < 0 || if_index < ifindex) { + ifnet_head_done(); + return NULL; + } + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifindexp) + *ifindexp = ifindex; + } else { + INADDR_TO_IFP(*a, ifp); + } + return ifp; +} + /* * Set the IP multicast options in response to user setsockopt(). */ @@ -1548,49 +2638,44 @@ ip_setmoptions(sopt, imop) struct ip_moptions **imop; { int error = 0; - int i; struct in_addr addr; struct ip_mreq mreq; - struct ifnet *ifp; + struct ifnet *ifp = NULL; struct ip_moptions *imo = *imop; - struct route ro; - struct sockaddr_in *dst; - int s; + int ifindex; if (imo == NULL) { /* * No multicast option buffer attached to the pcb; * allocate one and initialize to default values. */ - imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - - if (imo == NULL) - return (ENOBUFS); - *imop = imo; - imo->imo_multicast_ifp = NULL; - imo->imo_multicast_vif = -1; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - imo->imo_num_memberships = 0; + error = ip_createmoptions(imop); + if (error != 0) + return error; + imo = *imop; } switch (sopt->sopt_name) { /* store an index number for the vif you wanna use in the send */ - case IP_MULTICAST_VIF: - if (legal_vif_num == 0) { - error = EOPNOTSUPP; - break; - } - error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - if (error) - break; - if (!legal_vif_num(i) && (i != -1)) { - error = EINVAL; +#if MROUTING + case IP_MULTICAST_VIF: + { + int i; + if (legal_vif_num == 0) { + error = EOPNOTSUPP; + break; + } + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + break; + if (!legal_vif_num(i) && (i != -1)) { + error = EINVAL; + break; + } + imo->imo_multicast_vif = i; break; } - imo->imo_multicast_vif = i; - break; +#endif /* MROUTING */ case IP_MULTICAST_IF: /* @@ -1613,15 +2698,16 @@ ip_setmoptions(sopt, imop) * IP address. Find the interface and confirm that * it supports multicasting. */ - s = splimp(); - INADDR_TO_IFP(addr, ifp); + ifp = ip_multicast_if(&addr, &ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - splx(s); error = EADDRNOTAVAIL; break; } imo->imo_multicast_ifp = ifp; - splx(s); + if (ifindex) + imo->imo_multicast_addr = addr; + else + imo->imo_multicast_addr.s_addr = INADDR_ANY; break; case IP_MULTICAST_TTL: @@ -1681,76 +2767,8 @@ ip_setmoptions(sopt, imop) error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; - - if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { - error = EINVAL; - break; - } - s = splimp(); - /* - * If no interface address was provided, use the interface of - * the route to the given multicast address. - */ - if (mreq.imr_interface.s_addr == INADDR_ANY) { - bzero((caddr_t)&ro, sizeof(ro)); - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_len = sizeof(*dst); - dst->sin_family = AF_INET; - dst->sin_addr = mreq.imr_multiaddr; - rtalloc(&ro); - if (ro.ro_rt == NULL) { - error = EADDRNOTAVAIL; - splx(s); - break; - } - ifp = ro.ro_rt->rt_ifp; - rtfree(ro.ro_rt); - } - else { - INADDR_TO_IFP(mreq.imr_interface, ifp); - } - - /* - * See if we found an interface, and confirm that it - * supports multicast. - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - splx(s); - break; - } - /* - * See if the membership already exists or if all the - * membership slots are full. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if (imo->imo_membership[i]->inm_ifp == ifp && - imo->imo_membership[i]->inm_addr.s_addr - == mreq.imr_multiaddr.s_addr) - break; - } - if (i < imo->imo_num_memberships) { - error = EADDRINUSE; - splx(s); - break; - } - if (i == IP_MAX_MEMBERSHIPS) { - error = ETOOMANYREFS; - splx(s); - break; - } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - if ((imo->imo_membership[i] = - in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { - error = ENOBUFS; - splx(s); - break; - } - ++imo->imo_num_memberships; - splx(s); + + error = ip_addmembership(imo, &mreq); break; case IP_DROP_MEMBERSHIP: @@ -1761,54 +2779,8 @@ ip_setmoptions(sopt, imop) error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); if (error) break; - - if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { - error = EINVAL; - break; - } - - s = splimp(); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - */ - if (mreq.imr_interface.s_addr == INADDR_ANY) - ifp = NULL; - else { - INADDR_TO_IFP(mreq.imr_interface, ifp); - if (ifp == NULL) { - error = EADDRNOTAVAIL; - splx(s); - break; - } - } - /* - * Find the membership in the membership array. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if ((ifp == NULL || - imo->imo_membership[i]->inm_ifp == ifp) && - imo->imo_membership[i]->inm_addr.s_addr == - mreq.imr_multiaddr.s_addr) - break; - } - if (i == imo->imo_num_memberships) { - error = EADDRNOTAVAIL; - splx(s); - break; - } - /* - * Give up the multicast address record to which the - * membership points. - */ - in_delmulti(imo->imo_membership[i]); - /* - * Remove the gap in the membership array. - */ - for (++i; i < imo->imo_num_memberships; ++i) - imo->imo_membership[i-1] = imo->imo_membership[i]; - --imo->imo_num_memberships; - splx(s); + + error = ip_dropmembership(imo, &mreq); break; default: @@ -1820,7 +2792,7 @@ ip_setmoptions(sopt, imop) * If all options have default values, no need to keep the mbuf. */ if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == -1 && + imo->imo_multicast_vif == (u_int32_t)-1 && imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && imo->imo_num_memberships == 0) { @@ -1831,6 +2803,175 @@ ip_setmoptions(sopt, imop) return (error); } +/* + * Set the IP multicast options in response to user setsockopt(). + */ +__private_extern__ int +ip_createmoptions( + struct ip_moptions **imop) +{ + struct ip_moptions *imo; + imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, + M_WAITOK); + + if (imo == NULL) + return (ENOBUFS); + *imop = imo; + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_addr.s_addr = INADDR_ANY; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + imo->imo_num_memberships = 0; + + return 0; +} + +/* + * Add membership to an IPv4 multicast. + */ +__private_extern__ int +ip_addmembership( + struct ip_moptions *imo, + struct ip_mreq *mreq) +{ + struct route ro; + struct sockaddr_in *dst; + struct ifnet *ifp = NULL; + int error = 0; + int i; + + bzero((caddr_t)&ro, sizeof(ro)); + + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + goto done; + } + /* + * If no interface address was provided, use the interface of + * the route to the given multicast address. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) { + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_addr = mreq->imr_multiaddr; + rtalloc_ign(&ro, 0); + if (ro.ro_rt != NULL) { + ifp = ro.ro_rt->rt_ifp; + } else { + /* If there's no default route, try using loopback */ + mreq->imr_interface.s_addr = htonl(INADDR_LOOPBACK); + } + } + + if (ifp == NULL) { + ifp = ip_multicast_if(&mreq->imr_interface, NULL); + } + + /* + * See if we found an interface, and confirm that it + * supports multicast. + */ + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + goto done; + } + /* + * See if the membership already exists or if all the + * membership slots are full. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if (imo->imo_membership[i]->inm_ifp == ifp && + imo->imo_membership[i]->inm_addr.s_addr + == mreq->imr_multiaddr.s_addr) + break; + } + if (i < imo->imo_num_memberships) { + error = EADDRINUSE; + goto done; + } + if (i == IP_MAX_MEMBERSHIPS) { + error = ETOOMANYREFS; + goto done; + } + /* + * Everything looks good; add a new record to the multicast + * address list for the given interface. + */ + if ((imo->imo_membership[i] = + in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; + goto done; + } + ++imo->imo_num_memberships; + +done: + if (ro.ro_rt != NULL) + rtfree(ro.ro_rt); + + return error; +} + +/* + * Drop membership of an IPv4 multicast. + */ +__private_extern__ int +ip_dropmembership( + struct ip_moptions *imo, + struct ip_mreq *mreq) +{ + int error = 0; + struct ifnet* ifp = NULL; + int i; + + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + return error; + } + + /* + * If an interface address was specified, get a pointer + * to its ifnet structure. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) + ifp = NULL; + else { + ifp = ip_multicast_if(&mreq->imr_interface, NULL); + if (ifp == NULL) { + error = EADDRNOTAVAIL; + return error; + } + } + /* + * Find the membership in the membership array. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if ((ifp == NULL || + imo->imo_membership[i]->inm_ifp == ifp) && + imo->imo_membership[i]->inm_addr.s_addr == + mreq->imr_multiaddr.s_addr) + break; + } + if (i == imo->imo_num_memberships) { + error = EADDRNOTAVAIL; + return error; + } + /* + * Give up the multicast address record to which the + * membership points. + */ + in_delmulti(&imo->imo_membership[i]); + /* + * Remove the gap in the membership array. + */ + for (++i; i < imo->imo_num_memberships; ++i) + imo->imo_membership[i-1] = imo->imo_membership[i]; + --imo->imo_num_memberships; + + return error; +} + /* * Return the IP multicast options in response to user getsockopt(). */ @@ -1846,6 +2987,7 @@ ip_getmoptions(sopt, imo) error = 0; switch (sopt->sopt_name) { +#if MROUTING case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; @@ -1853,14 +2995,20 @@ ip_getmoptions(sopt, imo) optval = -1; error = sooptcopyout(sopt, &optval, sizeof optval); break; +#endif /* MROUTING */ case IP_MULTICAST_IF: if (imo == NULL || imo->imo_multicast_ifp == NULL) addr.s_addr = INADDR_ANY; - else { + else if (imo->imo_multicast_addr.s_addr) { + /* return the value user has set */ + addr = imo->imo_multicast_addr; + } else { IFP_TO_IA(imo->imo_multicast_ifp, ia); addr.s_addr = (ia == NULL) ? INADDR_ANY : IA_SIN(ia)->sin_addr.s_addr; + if (ia != NULL) + ifafree(&ia->ia_ifa); } error = sooptcopyout(sopt, &addr, sizeof addr); break; @@ -1905,7 +3053,7 @@ ip_freemoptions(imo) if (imo != NULL) { for (i = 0; i < imo->imo_num_memberships; ++i) - in_delmulti(imo->imo_membership[i]); + in_delmulti(&imo->imo_membership[i]); FREE(imo, M_IPMOPTS); } } @@ -1926,84 +3074,401 @@ ip_mloopback(ifp, m, dst, hlen) { register struct ip *ip; struct mbuf *copym; + int sw_csum = (apple_hwcksum_tx == 0); copym = m_copy(m, 0, M_COPYALL); if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) copym = m_pullup(copym, hlen); - if (copym != NULL) { - /* - * We don't bother to fragment if the IP length is greater - * than the interface's MTU. Can this possibly matter? - */ - ip = mtod(copym, struct ip *); - ip->ip_len = htons((u_short)ip->ip_len); - ip->ip_off = htons((u_short)ip->ip_off); - ip->ip_sum = 0; - ip->ip_sum = in_cksum(copym, hlen); + + if (copym == NULL) + return; + + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter? + */ + ip = mtod(copym, struct ip *); + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + + ip->ip_sum = 0; + ip->ip_sum = in_cksum(copym, hlen); + /* + * NB: + * It's not clear whether there are any lingering + * reentrancy problems in other areas which might + * be exposed by using ip_input directly (in + * particular, everything which modifies the packet + * in-place). Yet another option is using the + * protosw directly to deliver the looped back + * packet. For the moment, we'll err on the side + * of safety by using if_simloop(). + */ +#if 1 /* XXX */ + if (dst->sin_family != AF_INET) { + printf("ip_mloopback: bad address family %d\n", + dst->sin_family); + dst->sin_family = AF_INET; + } +#endif + + /* + * Mark checksum as valid or calculate checksum for loopback. + * + * This is done this way because we have to embed the ifp of + * the interface we will send the original copy of the packet + * out on in the mbuf. ip_input will check if_hwassist of the + * embedded ifp and ignore all csum_flags if if_hwassist is 0. + * The UDP checksum has not been calculated yet. + */ + if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) { + if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) { + copym->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + copym->m_pkthdr.csum_data = 0xffff; + } else { + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); +#endif + + in_delayed_cksum(copym); + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); +#endif + + } + } + + /* + * TedW: + * We need to send all loopback traffic down to dlil in case + * a filter has tapped-in. + */ + + /* + * Stuff the 'real' ifp into the pkthdr, to be used in matching + * in ip_input(); we need the loopback ifp/dl_tag passed as args + * to make the loopback driver compliant with the data link + * requirements. + */ + if (lo_ifp) { + copym->m_pkthdr.rcvif = ifp; + dlil_output(lo_ifp, PF_INET, copym, 0, + (struct sockaddr *) dst, 0); + } else { + printf("Warning: ip_output call to dlil_find_dltag failed!\n"); + m_freem(copym); + } +} + +/* + * Given a source IP address (and route, if available), determine the best + * interface to send the packet from. Checking for (and updating) the + * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done + * without any locks based on the assumption that ip_output() is single- + * threaded per-pcb, i.e. for any given pcb there can only be one thread + * performing output at the IP layer. + */ +static struct ifaddr * +in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) +{ + struct ifaddr *ifa = NULL; + struct in_addr src = ip->ip_src; + struct in_addr dst = ip->ip_dst; + struct ifnet *rt_ifp; + char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN]; + + if (ip_select_srcif_debug) { + (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src)); + (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst)); + } + + if (ro->ro_rt != NULL) + RT_LOCK(ro->ro_rt); + + rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; + + /* + * Given the source IP address, find a suitable source interface + * to use for transmission; if the caller has specified a scope, + * optimize the search by looking at the addresses only for that + * interface. This is still suboptimal, however, as we need to + * traverse the per-interface list. + */ + if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) { + unsigned int scope = ifscope; /* - * NB: - * It's not clear whether there are any lingering - * reentrancy problems in other areas which might - * be exposed by using ip_input directly (in - * particular, everything which modifies the packet - * in-place). Yet another option is using the - * protosw directly to deliver the looped back - * packet. For the moment, we'll err on the side - * of safety by using if_simloop(). + * If no scope is specified and the route is stale (pointing + * to a defunct interface) use the current primary interface; + * this happens when switching between interfaces configured + * with the same IP address. Otherwise pick up the scope + * information from the route; the ULP may have looked up a + * correct route and we just need to verify it here and mark + * it with the ROF_SRCIF_SELECTED flag below. */ -#if 1 /* XXX */ - if (dst->sin_family != AF_INET) { - printf("ip_mloopback: bad address family %d\n", - dst->sin_family); - dst->sin_family = AF_INET; + if (scope == IFSCOPE_NONE) { + scope = rt_ifp->if_index; + if (scope != get_primary_ifscope() && + ro->ro_rt->generation_id != route_generation) + scope = get_primary_ifscope(); } -#endif + ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope); - /* - * Mark checksum as valid or calculate checksum for loopback. - * - * This is done this way because we have to embed the ifp of - * the interface we will send the original copy of the packet - * out on in the mbuf. ip_input will check if_hwassist of the - * embedded ifp and ignore all csum_flags if if_hwassist is 0. - * The UDP checksum has not been calculated yet. - */ - if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - if (ifp->if_hwassist) { - copym->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR | - CSUM_IP_CHECKED | CSUM_IP_VALID; - copym->m_pkthdr.csum_data = 0xffff; - } else - in_delayed_cksum(copym); - } + if (ifa == NULL && ip->ip_p != IPPROTO_UDP && + ip->ip_p != IPPROTO_TCP && ipforwarding) { + /* + * If forwarding is enabled, and if the packet isn't + * TCP or UDP, check if the source address belongs + * to one of our own interfaces; if so, demote the + * interface scope and do a route lookup right below. + */ + ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); + if (ifa != NULL) { + ifafree(ifa); + ifa = NULL; + ifscope = IFSCOPE_NONE; + } + } + + if (ip_select_srcif_debug && ifa != NULL) { + if (ro->ro_rt != NULL) { + printf("%s->%s ifscope %d->%d ifa_if %s%d " + "ro_if %s%d\n", s_src, s_dst, ifscope, + scope, ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit, rt_ifp->if_name, + rt_ifp->if_unit); + } else { + printf("%s->%s ifscope %d->%d ifa_if %s%d\n", + s_src, s_dst, ifscope, scope, + ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit); + } + } + } + /* + * Slow path; search for an interface having the corresponding source + * IP address if the scope was not specified by the caller, and: + * + * 1) There currently isn't any route, or, + * 2) The interface used by the route does not own that source + * IP address; in this case, the route will get blown away + * and we'll do a more specific scoped search using the newly + * found interface. + */ + if (ifa == NULL && ifscope == IFSCOPE_NONE) { + ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); /* - * TedW: - * We need to send all loopback traffic down to dlil in case - * a filter has tapped-in. + * If we have the IP address, but not the route, we don't + * really know whether or not it belongs to the correct + * interface (it could be shared across multiple interfaces.) + * The only way to find out is to do a route lookup. */ + if (ifa != NULL && ro->ro_rt == NULL) { + struct rtentry *rt; + struct sockaddr_in sin; + struct ifaddr *oifa = NULL; + + bzero(&sin, sizeof (sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof (sin); + sin.sin_addr = dst; + + lck_mtx_lock(rnh_lock); + if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL, + rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) { + RT_LOCK(rt); + /* + * If the route uses a different interface, + * use that one instead. The IP address of + * the ifaddr that we pick up here is not + * relevant. + */ + if (ifa->ifa_ifp != rt->rt_ifp) { + oifa = ifa; + ifa = rt->rt_ifa; + ifaref(ifa); + RT_UNLOCK(rt); + } else { + RT_UNLOCK(rt); + } + rtfree_locked(rt); + } + lck_mtx_unlock(rnh_lock); + + if (oifa != NULL) { + struct ifaddr *iifa; + + /* + * See if the interface pointed to by the + * route is configured with the source IP + * address of the packet. + */ + iifa = (struct ifaddr *)ifa_foraddr_scoped( + src.s_addr, ifa->ifa_ifp->if_index); + + if (iifa != NULL) { + /* + * Found it; drop the original one + * as well as the route interface + * address, and use this instead. + */ + ifafree(oifa); + ifafree(ifa); + ifa = iifa; + } else if (!ipforwarding || + (rt->rt_flags & RTF_GATEWAY)) { + /* + * This interface doesn't have that + * source IP address; drop the route + * interface address and just use the + * original one, and let the caller + * do a scoped route lookup. + */ + ifafree(ifa); + ifa = oifa; + } else { + /* + * Forwarding is enabled and the source + * address belongs to one of our own + * interfaces which isn't the outgoing + * interface, and we have a route, and + * the destination is on a network that + * is directly attached (onlink); drop + * the original one and use the route + * interface address instead. + */ + ifafree(oifa); + } + } + } else if (ifa != NULL && ro->ro_rt != NULL && + !(ro->ro_rt->rt_flags & RTF_GATEWAY) && + ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) { + /* + * Forwarding is enabled and the source address belongs + * to one of our own interfaces which isn't the same + * as the interface used by the known route; drop the + * original one and use the route interface address. + */ + ifafree(ifa); + ifa = ro->ro_rt->rt_ifa; + ifaref(ifa); + } + + if (ip_select_srcif_debug && ifa != NULL) { + printf("%s->%s ifscope %d ifa_if %s%d\n", + s_src, s_dst, ifscope, ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit); + } + } + + if (ro->ro_rt != NULL) + RT_LOCK_ASSERT_HELD(ro->ro_rt); + /* + * If there is a non-loopback route with the wrong interface, or if + * there is no interface configured with such an address, blow it + * away. Except for local/loopback, we look for one with a matching + * interface scope/index. + */ + if (ro->ro_rt != NULL && + (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) || + !(ro->ro_rt->rt_flags & RTF_UP))) { + if (ip_select_srcif_debug) { + if (ifa != NULL) { + printf("%s->%s ifscope %d ro_if %s%d != " + "ifa_if %s%d (cached route cleared)\n", + s_src, s_dst, ifscope, rt_ifp->if_name, + rt_ifp->if_unit, ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit); + } else { + printf("%s->%s ifscope %d ro_if %s%d " + "(no ifa_if found)\n", + s_src, s_dst, ifscope, rt_ifp->if_name, + rt_ifp->if_unit); + } + } - if (lo_dl_tag == 0) - dlil_find_dltag(APPLE_IF_FAM_LOOPBACK, 0, PF_INET, &lo_dl_tag); + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + ro->ro_flags &= ~ROF_SRCIF_SELECTED; /* - * Stuff the 'real' ifp into the pkthdr, to be used in matching - * in ip_input(); we need the loopback ifp/dl_tag passed as args - * to make the loopback driver compliant with the data link - * requirements. + * If the destination is IPv4 LLA and the route's interface + * doesn't match the source interface, then the source IP + * address is wrong; it most likely belongs to the primary + * interface associated with the IPv4 LL subnet. Drop the + * packet rather than letting it go out and return an error + * to the ULP. This actually applies not only to IPv4 LL + * but other shared subnets; for now we explicitly test only + * for the former case and save the latter for future. */ - if (lo_dl_tag) - { copym->m_pkthdr.rcvif = ifp; - dlil_output(lo_dl_tag, copym, 0, (struct sockaddr *) dst, 0); - } else { - printf("Warning: ip_output call to dlil_find_dltag failed!\n"); - m_freem(copym); + if (IN_LINKLOCAL(ntohl(dst.s_addr)) && + !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) { + ifafree(ifa); + ifa = NULL; } + } + + if (ip_select_srcif_debug && ifa == NULL) { + printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n", + s_src, s_dst, ifscope); + } + + /* + * If there is a route, mark it accordingly. If there isn't one, + * we'll get here again during the next transmit (possibly with a + * route) and the flag will get set at that point. For IPv4 LLA + * destination, mark it only if the route has been fully resolved; + * otherwise we want to come back here again when the route points + * to the interface over which the ARP reply arrives on. + */ + if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) || + (ro->ro_rt->rt_gateway->sa_family == AF_LINK && + SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) { + ro->ro_flags |= ROF_SRCIF_SELECTED; + ro->ro_rt->generation_id = route_generation; + } + + if (ro->ro_rt != NULL) + RT_UNLOCK(ro->ro_rt); + + return (ifa); +} + +/* + * Handler for setting IP_FORCE_OUT_IFP or IP_BOUND_IF socket option. + */ +static void +ip_bindif(struct inpcb *inp, unsigned int ifscope) +{ + /* + * A zero interface scope value indicates an "unbind". + * Otherwise, take in whatever value the app desires; + * the app may already know the scope (or force itself + * to such a scope) ahead of time before the interface + * gets attached. It doesn't matter either way; any + * route lookup from this point on will require an + * exact match for the embedded interface scope. + */ + inp->inp_boundif = ifscope; + if (inp->inp_boundif == IFSCOPE_NONE) + inp->inp_flags &= ~INP_BOUND_IF; + else + inp->inp_flags |= INP_BOUND_IF; -/* if_simloop(ifp, copym, (struct sockaddr *)dst, 0);*/ + /* Blow away any cached route in the PCB */ + if (inp->inp_route.ro_rt != NULL) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; } }