X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/8f6c56a50524aa785f7e596d52dddfb331e18961..7ee9d059c4eecf68ae4f8b0fb99ae2471eda79af:/bsd/netinet/ip_input.c diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index d84dac759..761b4b40c 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,6 +60,12 @@ * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 * $FreeBSD: src/sys/netinet/ip_input.c,v 1.130.2.25 2001/08/29 21:41:37 jesper Exp $ */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. + */ #define _IP_VHL @@ -74,19 +80,27 @@ #include #include #include +#include +#include + +#include #include #include +#include + #include #include #include #include #include +#include #include #include #include +#include #include #include #include @@ -102,8 +116,14 @@ #include #include #include +#include + +#if CONFIG_MACF_NET +#include +#endif #include +#include #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 0) #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 2) @@ -124,58 +144,75 @@ #include #endif +#if PF +#include +#endif /* PF */ + #if IPSEC extern int ipsec_bypass; extern lck_mtx_t *sadb_mutex; + +lck_grp_t *sadb_stat_mutex_grp; +lck_grp_attr_t *sadb_stat_mutex_grp_attr; +lck_attr_t *sadb_stat_mutex_attr; +lck_mtx_t *sadb_stat_mutex; + #endif int rsvp_on = 0; static int ip_rsvp_on; struct socket *ip_rsvpd; +static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS; + int ipforwarding = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, - &ipforwarding, 0, "Enable IP forwarding between interfaces"); +SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0, + sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces"); static int ipsendredirects = 1; /* XXX */ -SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0, "Enable sending IP redirects"); int ip_defttl = IPDEFTTL; -SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_defttl, 0, "Maximum TTL on IP packets"); static int ip_dosourceroute = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); static int ip_acceptsourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, - CTLFLAG_RW, &ip_acceptsourceroute, 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0, "Enable accepting source routed IP packets"); static int ip_keepfaith = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static int nipq = 0; /* total # of reass queues */ static int maxnipq; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, "Maximum number of IPv4 fragment reassembly queue entries"); static int maxfragsperpacket; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); static int maxfrags; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfrags, 0, "Maximum number of IPv4 fragments allowed"); static int currentfrags = 0; +int ip_doscopedroute = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, + &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); + /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table @@ -190,26 +227,41 @@ static int currentfrags = 0; * packets for those addresses are received. */ static int ip_checkinterface = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); + #if DIAGNOSTIC static int ipprintfs = 0; #endif +extern int in_proto_count; extern struct domain inetdomain; extern struct protosw inetsw[]; struct protosw *ip_protox[IPPROTO_MAX]; static int ipqmaxlen = IFQ_MAXLEN; -struct in_ifaddrhead in_ifaddrhead; /* first inet address */ + +static lck_grp_attr_t *in_ifaddr_rwlock_grp_attr; +static lck_grp_t *in_ifaddr_rwlock_grp; +static lck_attr_t *in_ifaddr_rwlock_attr; +lck_rw_t *in_ifaddr_rwlock; + +/* Protected by in_ifaddr_rwlock */ +struct in_ifaddrhead in_ifaddrhead; /* first inet address */ +struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ + +#define INADDR_NHASH 61 +static u_int32_t inaddr_nhash; /* hash table size */ +static u_int32_t inaddr_hashp; /* next largest prime */ + struct ifqueue ipintrq; -SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW | CTLFLAG_LOCKED, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); -SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD | CTLFLAG_LOCKED, &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); struct ipstat ipstat; -SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); /* Packet reassembly stuff */ @@ -223,47 +275,50 @@ static struct ipq ipq[IPREASS_NHASH]; static TAILQ_HEAD(ipq_list, ipq) ipq_list = TAILQ_HEAD_INITIALIZER(ipq_list); const int ipintrq_present = 1; -lck_mtx_t *ip_mutex; +lck_mtx_t *ip_mutex; lck_attr_t *ip_mutex_attr; -lck_grp_t *ip_mutex_grp; -lck_grp_attr_t *ip_mutex_grp_attr; +lck_grp_t *ip_mutex_grp; +lck_grp_attr_t *ip_mutex_grp_attr; lck_mtx_t *inet_domain_mutex; -extern lck_mtx_t *domain_proto_mtx; +extern lck_mtx_t *domain_proto_mtx; #if IPCTL_DEFMTU -SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_mtu, 0, "Default MTU"); #endif #if IPSTEALTH static int ipstealth = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED, &ipstealth, 0, ""); #endif /* Firewall hooks */ +#if IPFIREWALL ip_fw_chk_t *ip_fw_chk_ptr; -int fw_enable = 1 ; -int fw_one_pass = 1; +int fw_enable = 1; +int fw_bypass = 1; +int fw_one_pass = 0; #if DUMMYNET ip_dn_io_t *ip_dn_io_ptr; #endif int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **) = NULL; +#endif /* IPFIREWALL */ -SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW, 0, "link local"); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local"); struct ip_linklocal_stat ip_linklocal_stat; -SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat, "Number of link local packets with TTL less than 255"); -SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, CTLFLAG_RW, 0, "link local input"); +SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input"); int ip_linklocal_in_allowbadttl = 1; -SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0, "Allow incoming link local packets with TTL less than 255"); @@ -283,13 +338,10 @@ static struct ip_srcrt { struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; } ip_srcrt; -#ifdef __APPLE__ -extern struct mbuf* m_dup(register struct mbuf *m, int how); -#endif - +static void in_ifaddrhashtbl_init(void); static void save_rte(u_char *, struct in_addr); -static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *, struct route *ipforward_rt); -static void ip_forward(struct mbuf *, int, struct sockaddr_in *, struct route *ipforward_rt); +static int ip_dooptions(struct mbuf *, int, struct sockaddr_in *); +static void ip_forward(struct mbuf *, int, struct sockaddr_in *); static void ip_freef(struct ipq *); #if IPDIVERT #ifdef IPDIVERT_44 @@ -302,38 +354,60 @@ static struct mbuf *ip_reass(struct mbuf *, #else static struct mbuf *ip_reass(struct mbuf *, struct ipq *, struct ipq *); #endif +static void ip_fwd_route_copyout(struct ifnet *, struct route *); +static void ip_fwd_route_copyin(struct ifnet *, struct route *); void ipintr(void); +void in_dinit(void); #if RANDOM_IP_ID extern u_short ip_id; + +int ip_use_randomid = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_use_randomid, 0, "Randomize IP packets IDs"); #endif -extern u_long route_generation; -extern int apple_hwcksum_rx; +#define satosin(sa) ((struct sockaddr_in *)(sa)) +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void -ip_init() +ip_init(void) { - register struct protosw *pr; - register int i; - static ip_initialized = 0; - struct timeval timenow; - + struct protosw *pr; + int i; + static int ip_initialized = 0; if (!ip_initialized) { + PE_parse_boot_argn("net.inet.ip.scopedroute", + &ip_doscopedroute, sizeof (ip_doscopedroute)); + + in_ifaddr_init(); + + in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); + in_ifaddr_rwlock_grp = lck_grp_alloc_init("in_ifaddr_rwlock", + in_ifaddr_rwlock_grp_attr); + in_ifaddr_rwlock_attr = lck_attr_alloc_init(); + in_ifaddr_rwlock = lck_rw_alloc_init(in_ifaddr_rwlock_grp, + in_ifaddr_rwlock_attr); + TAILQ_INIT(&in_ifaddrhead); + in_ifaddrhashtbl_init(); + + ip_moptions_init(); + pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip_init"); for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr; - for (pr = inetdomain.dom_protosw; pr; pr = pr->pr_next) - { if(!((unsigned int)pr->pr_domain)) continue; /* If uninitialized, skip */ + for (pr = inetdomain.dom_protosw; pr; pr = pr->pr_next) { + if (pr->pr_domain == NULL) + continue; /* If uninitialized, skip */ if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) ip_protox[pr->pr_protocol] = pr; @@ -346,51 +420,127 @@ ip_init() maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */ #if RANDOM_IP_ID - getmicrouptime(&timenow); - ip_id = timenow.tv_sec & 0xffff; + { + struct timeval timenow; + getmicrotime(&timenow); + ip_id = timenow.tv_sec & 0xffff; + } #endif ipintrq.ifq_maxlen = ipqmaxlen; ipf_init(); ip_mutex_grp_attr = lck_grp_attr_alloc_init(); - lck_grp_attr_setdefault(ip_mutex_grp_attr); ip_mutex_grp = lck_grp_alloc_init("ip", ip_mutex_grp_attr); ip_mutex_attr = lck_attr_alloc_init(); - lck_attr_setdefault(ip_mutex_attr); - if ((ip_mutex = lck_mtx_alloc_init(ip_mutex_grp, ip_mutex_attr)) == NULL) { printf("ip_init: can't alloc ip_mutex\n"); return; } +#if IPSEC + + sadb_stat_mutex_grp_attr = lck_grp_attr_alloc_init(); + sadb_stat_mutex_grp = lck_grp_alloc_init("sadb_stat", sadb_stat_mutex_grp_attr); + sadb_stat_mutex_attr = lck_attr_alloc_init(); + + if ((sadb_stat_mutex = lck_mtx_alloc_init(sadb_stat_mutex_grp, sadb_stat_mutex_attr)) == NULL) { + printf("ip_init: can't alloc sadb_stat_mutex\n"); + return; + } + +#endif + arp_init(); + ip_initialized = 1; } } +/* + * Initialize IPv4 source address hash table. + */ +static void +in_ifaddrhashtbl_init(void) +{ + int i, k, p; + + if (in_ifaddrhashtbl != NULL) + return; + + PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash, sizeof (inaddr_nhash)); + if (inaddr_nhash == 0) + inaddr_nhash = INADDR_NHASH; + + MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *, + inaddr_nhash * sizeof (*in_ifaddrhashtbl), + M_IFADDR, M_WAITOK | M_ZERO); + if (in_ifaddrhashtbl == NULL) + panic("in_ifaddrhashtbl_init allocation failed"); + + /* + * Generate the next largest prime greater than inaddr_nhash. + */ + k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2; + for (;;) { + p = 1; + for (i = 3; i * i <= k; i += 2) { + if (k % i == 0) + p = 0; + } + if (p == 1) + break; + k += 2; + } + inaddr_hashp = k; +} + +u_int32_t +inaddr_hashval(u_int32_t key) +{ + /* + * The hash index is the computed prime times the key modulo + * the hash size, as documented in "Introduction to Algorithms" + * (Cormen, Leiserson, Rivest). + */ + if (inaddr_nhash > 1) + return ((key * inaddr_hashp) % inaddr_nhash); + else + return (0); +} + static void ip_proto_input( - protocol_family_t protocol, - mbuf_t packet) + protocol_family_t __unused protocol, + mbuf_t packet_list) { - ip_input(packet); + mbuf_t packet; + int how_many = 0 ; + + /* ip_input should handle a list of packets but does not yet */ + + for (packet = packet_list; packet; packet = packet_list) { + how_many++; + packet_list = mbuf_nextpkt(packet); + mbuf_setnextpkt(packet, NULL); + ip_input(packet); + } } /* Initialize the PF_INET domain, and add in the pre-defined protos */ void -in_dinit() -{ register int i; - register struct protosw *pr; - register struct domain *dp; - static inetdomain_initted = 0; - extern int in_proto_count; +in_dinit(void) +{ + int i; + struct protosw *pr; + struct domain *dp; + static int inetdomain_initted = 0; if (!inetdomain_initted) { - kprintf("Initing %d protosw entries\n", in_proto_count); + /* kprintf("Initing %d protosw entries\n", in_proto_count); */ dp = &inetdomain; dp->dom_flags = DOM_REENTRANT; @@ -400,7 +550,7 @@ in_dinit() inetdomain_initted = 1; lck_mtx_unlock(domain_proto_mtx); - proto_register_input(PF_INET, ip_proto_input, NULL); + proto_register_input(PF_INET, ip_proto_input, NULL, 1); lck_mtx_lock(domain_proto_mtx); } } @@ -416,6 +566,7 @@ ip_proto_dispatch_in( int seen = (inject_ipfref == 0); int changed_header = 0; struct ip *ip; + void (*pr_input)(struct mbuf *, int len); if (!TAILQ_EMPTY(&ipv4_filters)) { ipf_ref(); @@ -455,28 +606,23 @@ ip_proto_dispatch_in( * otherwise let the protocol deal with its own locking */ ip = mtod(m, struct ip *); - + if (changed_header) { ip->ip_len = ntohs(ip->ip_len) - hlen; ip->ip_off = ntohs(ip->ip_off); } - - if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) { + + if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) { + m_freem(m); + } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) { lck_mtx_lock(inet_domain_mutex); - (*ip_protox[ip->ip_p]->pr_input)(m, hlen); + pr_input(m, hlen); lck_mtx_unlock(inet_domain_mutex); - } - else - (*ip_protox[ip->ip_p]->pr_input)(m, hlen); - + } else { + pr_input(m, hlen); + } } -/* - * ipforward_rt cleared in in_addroute() - * when a new route is successfully created. - */ -static struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; - /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. @@ -487,88 +633,109 @@ ip_input(struct mbuf *m) struct ip *ip; struct ipq *fp; struct in_ifaddr *ia = NULL; - int i, hlen, mff, checkif; + int hlen, checkif; u_short sum; struct in_addr pkt_dst; +#if IPFIREWALL + int i; u_int32_t div_info = 0; /* packet divert/tee info */ struct ip_fw_args args; - ipfilter_t inject_filter_ref = 0; struct m_tag *tag; - struct route ipforward_rt = { 0 }; - - lck_mtx_lock(ip_mutex); +#endif + ipfilter_t inject_filter_ref = 0; + /* Check if the mbuf is still valid after interface filter processing */ + MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); + +#if IPFIREWALL args.eh = NULL; args.oif = NULL; args.rule = NULL; args.divert_rule = 0; /* divert cookie */ args.next_hop = NULL; + /* + * Don't bother searching for tag(s) if there's none. + */ + if (SLIST_EMPTY(&m->m_pkthdr.tags)) + goto ipfw_tags_done; + /* Grab info from mtags prepended to the chain */ #if DUMMYNET - if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { struct dn_pkt_tag *dn_tag; - + dn_tag = (struct dn_pkt_tag *)(tag+1); args.rule = dn_tag->rule; - + m_tag_delete(m, tag); } #endif /* DUMMYNET */ - if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { +#if IPDIVERT + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { struct divert_tag *div_tag; - + div_tag = (struct divert_tag *)(tag+1); args.divert_rule = div_tag->cookie; m_tag_delete(m, tag); } - if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { +#endif + + if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { struct ip_fwd_tag *ipfwd_tag; - + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); args.next_hop = ipfwd_tag->next_hop; m_tag_delete(m, tag); } - + #if DIAGNOSTIC if (m == NULL || (m->m_flags & M_PKTHDR) == 0) panic("ip_input no HDR"); #endif if (args.rule) { /* dummynet already filtered us */ - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2; - inject_filter_ref = ipf_get_inject_filter(m); - goto iphack ; + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + inject_filter_ref = ipf_get_inject_filter(m); + goto iphack ; } - +ipfw_tags_done: +#endif /* IPFIREWALL */ + /* - * No need to proccess packet twice if we've - * already seen it + * No need to proccess packet twice if we've already seen it. */ - inject_filter_ref = ipf_get_inject_filter(m); + if (!SLIST_EMPTY(&m->m_pkthdr.tags)) + inject_filter_ref = ipf_get_inject_filter(m); if (inject_filter_ref != 0) { - lck_mtx_unlock(ip_mutex); ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + ip->ip_len = ntohs(ip->ip_len) - hlen; ip->ip_off = ntohs(ip->ip_off); ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref); return; } - ipstat.ips_total++; + OSAddAtomic(1, &ipstat.ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == 0) { - ipstat.ips_toosmall++; - lck_mtx_unlock(ip_mutex); + OSAddAtomic(1, &ipstat.ips_toosmall); return; } ip = mtod(m, struct ip *); @@ -577,19 +744,18 @@ ip_input(struct mbuf *m) ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { - ipstat.ips_badvers++; + OSAddAtomic(1, &ipstat.ips_badvers); goto bad; } hlen = IP_VHL_HL(ip->ip_vhl) << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ - ipstat.ips_badhlen++; + OSAddAtomic(1, &ipstat.ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == 0) { - ipstat.ips_badhlen++; - lck_mtx_unlock(ip_mutex); + OSAddAtomic(1, &ipstat.ips_badhlen); return; } ip = mtod(m, struct ip *); @@ -599,7 +765,7 @@ ip_input(struct mbuf *m) if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { - ipstat.ips_badaddr++; + OSAddAtomic(1, &ipstat.ips_badaddr); goto bad; } } @@ -609,7 +775,7 @@ ip_input(struct mbuf *m) IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) { ip_linklocal_stat.iplls_in_total++; if (ip->ip_ttl != MAXTTL) { - ip_linklocal_stat.iplls_in_badttl++; + OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl); /* Silently drop link local traffic with bad TTL */ if (!ip_linklocal_in_allowbadttl) goto bad; @@ -623,24 +789,72 @@ ip_input(struct mbuf *m) if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); - } else { + } else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || + apple_hwcksum_tx == 0) { + /* + * Either this is not loopback packet coming from an interface + * that does not support checksum offloading, or it is loopback + * packet that has undergone software checksumming at the send + * side because apple_hwcksum_tx was set to 0. In this case, + * calculate the checksum in software to validate the packet. + */ sum = in_cksum(m, hlen); + } else { + /* + * This is a loopback packet without any valid checksum since + * the send side has bypassed it (apple_hwcksum_tx set to 1). + * We get here because apple_hwcksum_rx was set to 0, and so + * we pretend that all is well. + */ + sum = 0; + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR | + CSUM_IP_CHECKED | CSUM_IP_VALID; + m->m_pkthdr.csum_data = 0xffff; } if (sum) { - ipstat.ips_badsum++; + OSAddAtomic(1, &ipstat.ips_badsum); goto bad; } + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + + /* + * Naively assume we can attribute inbound data to the route we would + * use to send to this destination. Asymetric routing breaks this + * assumption, but it still allows us to account for traffic from + * a remote node in the routing table. + * this has a very significant performance impact so we bypass + * if nstat_collect is disabled. We may also bypass if the + * protocol is tcp in the future because tcp will have a route that + * we can use to attribute the data to. That does mean we would not + * account for forwarded tcp traffic. + */ + if (nstat_collect) { + struct rtentry *rt = + ifnet_cached_rtlookup_inet(m->m_pkthdr.rcvif, ip->ip_src); + if (rt != NULL) { + nstat_route_rx(rt, 1, m->m_pkthdr.len, 0); + rtfree(rt); + } + } + /* * Convert fields to host representation. */ +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_len); +#endif + if (ip->ip_len < hlen) { - ipstat.ips_badlen++; + OSAddAtomic(1, &ipstat.ips_badlen); goto bad; } - NTOHS(ip->ip_off); +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_off); +#endif /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. @@ -649,7 +863,7 @@ ip_input(struct mbuf *m) */ if (m->m_pkthdr.len < ip->ip_len) { tooshort: - ipstat.ips_tooshort++; + OSAddAtomic(1, &ipstat.ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip->ip_len) { @@ -664,26 +878,33 @@ tooshort: m_adj(m, ip->ip_len - m->m_pkthdr.len); } +#if PF + /* Invoke inbound packet filter */ + if (PF_IS_ENABLED) { + int error; + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE); + if (error != 0) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + return; + } + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + } +#endif /* PF */ + #if IPSEC if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) goto pass; #endif - /* - * IpHack's section. - * Right now when no processing on packet has done - * and it is still fresh out of network we do our black - * deals with it. - * - Firewall: deny/allow/divert - * - Xlate: translate packet's addr/port (NAT). - * - Pipe: pass pkt through dummynet. - * - Wrap: fake packet's addr/port - * - Encapsulate: put it in another IP and send out. - */ - -#if defined(IPFIREWALL) && defined(DUMMYNET) +#if IPFIREWALL +#if DUMMYNET iphack: -#endif +#endif /* DUMMYNET */ /* * Check if we want to allow this packet to be processed. * Consider it to be bad if not. @@ -692,7 +913,6 @@ iphack: struct mbuf *m1 = m; if (fr_checkp(ip, hlen, m->m_pkthdr.rcvif, 0, &m1) || !m1) { - lck_mtx_unlock(ip_mutex); return; } ip = mtod(m = m1, struct ip *); @@ -708,7 +928,6 @@ iphack: #endif /* IPFIREWALL_FORWARD */ args.m = m; - lck_mtx_unlock(ip_mutex); i = ip_fw_chk_ptr(&args); m = args.m; @@ -719,8 +938,8 @@ iphack: return; } ip = mtod(m, struct ip *); /* just in case m changed */ + if (i == 0 && args.next_hop == NULL) { /* common case */ - lck_mtx_lock(ip_mutex); goto pass; } #if DUMMYNET @@ -733,14 +952,12 @@ iphack: #if IPDIVERT if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { /* Divert or tee packet */ - lck_mtx_lock(ip_mutex); div_info = i; goto ours; } #endif #if IPFIREWALL_FORWARD if (i == 0 && args.next_hop != NULL) { - lck_mtx_lock(ip_mutex); goto pass; } #endif @@ -750,6 +967,7 @@ iphack: m_freem(m); return; } +#endif /* IPFIREWALL */ pass: /* @@ -759,8 +977,11 @@ pass: * to be sent and the original packet to be freed). */ ip_nhops = 0; /* for source routed packets */ - if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop, &ipforward_rt)) { - lck_mtx_unlock(ip_mutex); +#if IPFIREWALL + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop)) { +#else + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL)) { +#endif return; } @@ -787,8 +1008,12 @@ pass: * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ +#if IPFIREWALL pkt_dst = args.next_hop == NULL ? ip->ip_dst : args.next_hop->sin_addr; +#else + pkt_dst = ip->ip_dst; +#endif /* * Enable a consistency check between the destination address @@ -805,51 +1030,70 @@ pass: * the packets are received. */ checkif = ip_checkinterface && (ipforwarding == 0) && - ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && - (args.next_hop == NULL); - - lck_mtx_lock(rt_mtx); - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { -#define satosin(sa) ((struct sockaddr_in *)(sa)) + ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) +#if IPFIREWALL + && (args.next_hop == NULL); +#else + ; +#endif - if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { - lck_mtx_unlock(rt_mtx); - goto ours; - } - + /* + * Check for exact addresses in the hash bucket. + */ + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ + IFA_LOCK_SPIN(&ia->ia_ifa); if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) { - lck_mtx_unlock(rt_mtx); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); goto ours; } - /* - * Only accept broadcast packets that arrive via the - * matching interface. Reception of forwarded directed - * broadcasts would be handled via ip_forward() and - * ether_output() with the loopback into the stack for - * SIMPLEX interfaces handled by ether_output(). - */ - if ((!checkif || ia->ia_ifp == m->m_pkthdr.rcvif) && - ia->ia_ifp && ia->ia_ifp->if_flags & IFF_BROADCAST) { + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(in_ifaddr_rwlock); + + /* + * Check for broadcast addresses. + * + * Only accept broadcast packets that arrive via the matching + * interface. Reception of forwarded directed broadcasts would be + * handled via ip_forward() and ether_frameout() with the loopback + * into the stack for SIMPLEX interfaces handled by ether_frameout(). + */ + if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { + struct ifaddr *ifa; + struct ifnet *ifp = m->m_pkthdr.rcvif; + + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); + continue; + } + ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + pkt_dst.s_addr || ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) { - lck_mtx_unlock(rt_mtx); - goto ours; - } - if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) { - lck_mtx_unlock(rt_mtx); + IFA_UNLOCK(ifa); + ifnet_lock_done(ifp); goto ours; } + IFA_UNLOCK(ifa); } + ifnet_lock_done(ifp); } - lck_mtx_unlock(rt_mtx); + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; + struct ifnet *ifp = m->m_pkthdr.rcvif; +#if MROUTING if (ip_mrouter) { /* * If we are acting as a multicast router, all @@ -859,9 +1103,9 @@ pass: * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ - if (ip_mforward && - ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { - ipstat.ips_cantforward++; + lck_mtx_lock(ip_mutex); + if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { + OSAddAtomic(1, &ipstat.ips_cantforward); m_freem(m); lck_mtx_unlock(ip_mutex); return; @@ -874,22 +1118,25 @@ pass: */ if (ip->ip_p == IPPROTO_IGMP) goto ours; - ipstat.ips_forward++; + OSAddAtomic(1, &ipstat.ips_forward); } +#endif /* MROUTING */ /* * See if we belong to the destination multicast group on the * arrival interface. */ - IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&ip->ip_dst, ifp, inm); + in_multihead_lock_done(); if (inm == NULL) { - ipstat.ips_notmember++; + OSAddAtomic(1, &ipstat.ips_notmember); m_freem(m); - lck_mtx_unlock(ip_mutex); return; } + INM_REMREF(inm); goto ours; } - if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) + if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; @@ -902,8 +1149,7 @@ pass: struct udpiphdr *ui; if (m->m_len < sizeof(struct udpiphdr) && (m = m_pullup(m, sizeof(struct udpiphdr))) == 0) { - udpstat.udps_hdrops++; - lck_mtx_unlock(ip_mutex); + OSAddAtomic(1, &udpstat.udps_hdrops); return; } ui = mtod(m, struct udpiphdr *); @@ -923,32 +1169,25 @@ pass: goto ours; } m_freem(m); - lck_mtx_unlock(ip_mutex); return; } #endif - lck_mtx_unlock(ip_mutex); /* * Not for us; forward if possible and desirable. */ if (ipforwarding == 0) { - ipstat.ips_cantforward++; + OSAddAtomic(1, &ipstat.ips_cantforward); m_freem(m); } else { - ip_forward(m, 0, args.next_hop, &ipforward_rt); +#if IPFIREWALL + ip_forward(m, 0, args.next_hop); +#else + ip_forward(m, 0, NULL); +#endif } return; ours: -#ifndef __APPLE__ - /* Darwin does not have an if_data in ifaddr */ - /* Count the packet in the ip address stats */ - if (ia != NULL) { - ia->ia_ifa.if_ipackets++; - ia->ia_ifa.if_ibytes += m->m_pkthdr.len; - } -#endif - /* * If offset or IP_MF are set, must reassemble. * Otherwise, nothing need be done. @@ -960,8 +1199,9 @@ ours: /* If maxnipq is 0, never accept fragments. */ if (maxnipq == 0) { - ipstat.ips_fragments++; - ipstat.ips_fragdropped++; + + OSAddAtomic(1, &ipstat.ips_fragments); + OSAddAtomic(1, &ipstat.ips_fragdropped); goto bad; } @@ -969,9 +1209,10 @@ ours: * If we will exceed the number of fragments in queues, timeout the * oldest fragemented packet to make space. */ + lck_mtx_lock(ip_mutex); if (currentfrags >= maxfrags) { fp = TAILQ_LAST(&ipq_list, ipq_list); - ipstat.ips_fragtimeout += fp->ipq_nfrags; + OSAddAtomic(fp->ipq_nfrags, &ipstat.ips_fragtimeout); if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && @@ -981,8 +1222,9 @@ ours: * If we match the fragment queue we were going to * discard, drop this packet too. */ - ipstat.ips_fragdropped++; + OSAddAtomic(1, &ipstat.ips_fragdropped); ip_freef(fp); + lck_mtx_unlock(ip_mutex); goto bad; } @@ -998,6 +1240,9 @@ ours: if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && +#if CONFIG_MACF_NET + mac_ipq_label_compare(m, fp) && +#endif ip->ip_p == fp->ipq_p) goto found; @@ -1011,7 +1256,7 @@ ours: * drop the oldest fragment before proceeding further */ fp = TAILQ_LAST(&ipq_list, ipq_list); - ipstat.ips_fragtimeout += fp->ipq_nfrags; + OSAddAtomic(fp->ipq_nfrags, &ipstat.ips_fragtimeout); ip_freef(fp); } @@ -1029,7 +1274,8 @@ found: * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { - ipstat.ips_toosmall++; /* XXX */ + OSAddAtomic(1, &ipstat.ips_toosmall); + lck_mtx_unlock(ip_mutex); goto bad; } m->m_flags |= M_FRAG; @@ -1044,11 +1290,11 @@ found: * ip_reass() will return a different mbuf, and update * the divert info in div_info and args.divert_rule. */ - ipstat.ips_fragments++; + OSAddAtomic(1, &ipstat.ips_fragments); m->m_pkthdr.header = ip; #if IPDIVERT - m = ip_reass(m, - fp, &ipq[sum], &div_info, &args.divert_rule); + m = ip_reass(m, fp, &ipq[sum], + (u_int16_t *)&div_info, &args.divert_rule); #else m = ip_reass(m, fp, &ipq[sum]); #endif @@ -1056,23 +1302,33 @@ found: lck_mtx_unlock(ip_mutex); return; } - ipstat.ips_reassembled++; + OSAddAtomic(1, &ipstat.ips_reassembled); ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = IP_VHL_HL(ip->ip_vhl) << 2; + #if IPDIVERT /* Restore original checksum before diverting packet */ if (div_info != 0) { ip->ip_len += hlen; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; ip->ip_sum = in_cksum(m, hlen); + +#if BYTE_ORDER != BIG_ENDIAN NTOHS(ip->ip_off); NTOHS(ip->ip_len); +#endif + ip->ip_len -= hlen; } #endif + lck_mtx_unlock(ip_mutex); } else ip->ip_len -= hlen; @@ -1092,19 +1348,19 @@ found: /* Restore packet header fields to original values */ ip->ip_len += hlen; + +#if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); HTONS(ip->ip_off); - +#endif /* Deliver packet to divert input routine */ - ipstat.ips_delivered++; - lck_mtx_unlock(ip_mutex); + OSAddAtomic(1, &ipstat.ips_delivered); divert_packet(m, 1, div_info & 0xffff, args.divert_rule); /* If 'tee', continue with original packet */ if (clone == NULL) { return; } - lck_mtx_lock(ip_mutex); m = clone; ip = mtod(m, struct ip *); } @@ -1117,28 +1373,27 @@ found: * code - like udp/tcp/raw ip. */ if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) != 0) { - lck_mtx_lock(sadb_mutex); if (ipsec4_in_reject(m, NULL)) { - ipsecstat.in_polvio++; - lck_mtx_unlock(sadb_mutex); - goto bad; + IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); + goto bad; } - lck_mtx_unlock(sadb_mutex); } #endif /* * Switch out to protocol's input routine. */ - ipstat.ips_delivered++; + OSAddAtomic(1, &ipstat.ips_delivered); { +#if IPFIREWALL if (args.next_hop && ip->ip_p == IPPROTO_TCP) { /* TCP needs IPFORWARD info if available */ struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof(struct sockaddr_in), M_NOWAIT); + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag), + M_NOWAIT, m); if (fwd_tag == NULL) { goto bad; } @@ -1151,7 +1406,6 @@ found: KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - lck_mtx_unlock(ip_mutex); /* TCP deals with its own locking */ ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); @@ -1159,15 +1413,16 @@ found: KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - lck_mtx_unlock(ip_mutex); ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); } +#else + ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); +#endif return; } bad: KERNEL_DEBUG(DBG_LAYER_END, 0,0,0,0,0); - lck_mtx_unlock(ip_mutex); m_freem(m); } @@ -1182,28 +1437,25 @@ bad: static struct mbuf * #if IPDIVERT -ip_reass(m, fp, where, divinfo, divcookie) -#else -ip_reass(m, fp, where) -#endif - register struct mbuf *m; - register struct ipq *fp; - struct ipq *where; -#if IPDIVERT +ip_reass(struct mbuf *m, struct ipq *fp, struct ipq *where, #ifdef IPDIVERT_44 - u_int32_t *divinfo; -#else - u_int16_t *divinfo; -#endif - u_int16_t *divcookie; -#endif + u_int32_t *divinfo, +#else /* IPDIVERT_44 */ + u_int16_t *divinfo, +#endif /* IPDIVERT_44 */ + u_int16_t *divcookie) +#else /* IPDIVERT */ +ip_reass(struct mbuf *m, struct ipq *fp, struct ipq *where) +#endif /* IPDIVERT */ { struct ip *ip = mtod(m, struct ip *); - register struct mbuf *p = 0, *q, *nq; + struct mbuf *p = 0, *q, *nq; struct mbuf *t; int hlen = IP_VHL_HL(ip->ip_vhl) << 2; int i, next; + u_int8_t ecn, ecn0; + lck_mtx_assert(ip_mutex, LCK_MTX_ASSERT_OWNED); /* * Presence of header sizes in mbufs * would confuse code below. @@ -1220,6 +1472,14 @@ ip_reass(m, fp, where) if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL) goto dropfrag; fp = mtod(t, struct ipq *); +#if CONFIG_MACF_NET + if (mac_ipq_label_init(fp, M_NOWAIT) != 0) { + m_free(t); + fp = NULL; + goto dropfrag; + } + mac_ipq_label_associate(m, fp); +#endif insque((void*)fp, (void*)where); nipq++; fp->ipq_nfrags = 1; @@ -1242,10 +1502,29 @@ ip_reass(m, fp, where) goto inserted; } else { fp->ipq_nfrags++; +#if CONFIG_MACF_NET + mac_ipq_label_update(m, fp); +#endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) + /* + * Handle ECN by comparing this segment with the first one; + * if CE is set, do not lose CE. + * drop if CE and not-ECT are mixed for the same packet. + */ + ecn = ip->ip_tos & IPTOS_ECN_MASK; + ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; + if (ecn == IPTOS_ECN_CE) { + if (ecn0 == IPTOS_ECN_NOTECT) + goto dropfrag; + if (ecn0 != IPTOS_ECN_CE) + GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; + } + if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) + goto dropfrag; + /* * Find a segment which begins after this one does. */ @@ -1296,7 +1575,7 @@ ip_reass(m, fp, where) } nq = q->m_nextpkt; m->m_nextpkt = nq; - ipstat.ips_fragdropped++; + OSAddAtomic(1, &ipstat.ips_fragdropped); fp->ipq_nfrags--; m_freem(q); } @@ -1335,7 +1614,7 @@ inserted: for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (GETIP(q)->ip_off != next) { if (fp->ipq_nfrags > maxfragsperpacket) { - ipstat.ips_fragdropped += fp->ipq_nfrags; + OSAddAtomic(fp->ipq_nfrags, &ipstat.ips_fragdropped); ip_freef(fp); } return (0); @@ -1345,7 +1624,7 @@ inserted: /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_FRAG) { if (fp->ipq_nfrags > maxfragsperpacket) { - ipstat.ips_fragdropped += fp->ipq_nfrags; + OSAddAtomic(fp->ipq_nfrags, &ipstat.ips_fragdropped); ip_freef(fp); } return (0); @@ -1357,8 +1636,8 @@ inserted: q = fp->ipq_frags; ip = GETIP(q); if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) { - ipstat.ips_toolong++; - ipstat.ips_fragdropped += fp->ipq_nfrags; + OSAddAtomic(1, &ipstat.ips_toolong); + OSAddAtomic(fp->ipq_nfrags, &ipstat.ips_fragdropped); ip_freef(fp); return (0); } @@ -1396,6 +1675,10 @@ inserted: *divcookie = fp->ipq_div_cookie; #endif +#if CONFIG_MACF_NET + mac_mbuf_label_associate_ipq(fp, m); + mac_ipq_label_destroy(fp); +#endif /* * Create header for new ip packet by * modifying header of first packet; @@ -1414,7 +1697,7 @@ inserted: m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ - register int plen = 0; + int plen = 0; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; @@ -1426,7 +1709,7 @@ dropfrag: *divinfo = 0; *divcookie = 0; #endif - ipstat.ips_fragdropped++; + OSAddAtomic(1, &ipstat.ips_fragdropped); if (fp != 0) fp->ipq_nfrags--; m_freem(m); @@ -1440,9 +1723,9 @@ dropfrag: * associated datagrams. */ static void -ip_freef(fp) - struct ipq *fp; +ip_freef(struct ipq *fp) { + lck_mtx_assert(ip_mutex, LCK_MTX_ASSERT_OWNED); currentfrags -= fp->ipq_nfrags; m_freem_list(fp->ipq_frags); remque((void*)fp); @@ -1457,9 +1740,9 @@ ip_freef(fp) * queue, discard it. */ void -ip_slowtimo() +ip_slowtimo(void) { - register struct ipq *fp; + struct ipq *fp; int i; lck_mtx_lock(ip_mutex); for (i = 0; i < IPREASS_NHASH; i++) { @@ -1470,7 +1753,7 @@ ip_slowtimo() --fp->ipq_ttl; fp = fp->next; if (fp->prev->ipq_ttl == 0) { - ipstat.ips_fragtimeout += fp->prev->ipq_nfrags; + OSAddAtomic(fp->ipq_nfrags, &ipstat.ips_fragtimeout); ip_freef(fp->prev); } } @@ -1484,13 +1767,11 @@ ip_slowtimo() for (i = 0; i < IPREASS_NHASH; i++) { while (nipq > maxnipq && (ipq[i].next != &ipq[i])) { - ipstat.ips_fragdropped += - ipq[i].next->ipq_nfrags; + OSAddAtomic(ipq[i].next->ipq_nfrags, &ipstat.ips_fragdropped); ip_freef(ipq[i].next); } } } - ipflow_slowtimo(); lck_mtx_unlock(ip_mutex); } @@ -1498,14 +1779,14 @@ ip_slowtimo() * Drain off all datagram fragments. */ void -ip_drain() +ip_drain(void) { int i; lck_mtx_lock(ip_mutex); for (i = 0; i < IPREASS_NHASH; i++) { while (ipq[i].next != &ipq[i]) { - ipstat.ips_fragdropped += ipq[i].next->ipq_nfrags; + OSAddAtomic(ipq[i].next->ipq_nfrags, &ipstat.ips_fragdropped); ip_freef(ipq[i].next); } } @@ -1526,15 +1807,17 @@ ip_drain() * 0 if the packet should be processed further. */ static int -ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop, struct route *ipforward_rt) +ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) { - register struct ip *ip = mtod(m, struct ip *); - register u_char *cp; - register struct ip_timestamp *ipt; - register struct in_ifaddr *ia; + struct ip *ip = mtod(m, struct ip *); + u_char *cp; + struct ip_timestamp *ipt; + struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; n_time ntime; + struct sockaddr_in ipaddr = { + sizeof (ipaddr), AF_INET , 0 , { 0 }, { 0, } }; dst = ip->ip_dst; cp = (u_char *)(ip + 1); @@ -1598,7 +1881,7 @@ ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop, struct rout break; } else { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; } off--; /* 0 origin */ @@ -1631,7 +1914,7 @@ nosourcerouting: /* * Not acting as a router, so silently drop. */ - ipstat.ips_cantforward++; + OSAddAtomic(1, &ipstat.ips_cantforward); m_freem(m); return (1); } @@ -1650,7 +1933,7 @@ nosourcerouting: ia = (INA)ifa_ifwithnet((SA)&ipaddr); } } else { - ia = ip_rtaddr(ipaddr.sin_addr, ipforward_rt); + ia = ip_rtaddr(ipaddr.sin_addr); } if (ia == 0) { type = ICMP_UNREACH; @@ -1658,9 +1941,11 @@ nosourcerouting: goto bad; } ip->ip_dst = ipaddr.sin_addr; + IFA_LOCK(&ia->ia_ifa); (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* @@ -1691,15 +1976,17 @@ nosourcerouting: * use the incoming interface (should be same). */ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0) { - if ((ia = ip_rtaddr(ipaddr.sin_addr, ipforward_rt)) == 0) { + if ((ia = ip_rtaddr(ipaddr.sin_addr)) == 0) { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } } + IFA_LOCK(&ia->ia_ifa); (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; @@ -1742,10 +2029,12 @@ nosourcerouting: m->m_pkthdr.rcvif); if (ia == 0) continue; + IFA_LOCK(&ia->ia_ifa); (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); + IFA_UNLOCK(&ia->ia_ifa); ipt->ipt_ptr += sizeof(struct in_addr); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; break; @@ -1760,7 +2049,7 @@ nosourcerouting: sizeof(struct in_addr)); if ((ia = (struct in_ifaddr*)ifa_ifwithaddr((SA)&ipaddr)) == 0) continue; - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; ipt->ipt_ptr += sizeof(struct in_addr); break; @@ -1778,16 +2067,14 @@ nosourcerouting: } } if (forward && ipforwarding) { - ip_forward(m, 1, next_hop, ipforward_rt); + ip_forward(m, 1, next_hop); return (1); } return (0); bad: ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2; /* XXX icmp_error adds in hdr length */ - lck_mtx_unlock(ip_mutex); icmp_error(m, type, code, 0, 0); - lck_mtx_lock(ip_mutex); - ipstat.ips_badoptions++; + OSAddAtomic(1, &ipstat.ips_badoptions); return (1); } @@ -1796,36 +2083,29 @@ bad: * return internet address info of interface to be used to get there. */ struct in_ifaddr * -ip_rtaddr(dst, rt) - struct in_addr dst; - struct route *rt; +ip_rtaddr(struct in_addr dst) { - register struct sockaddr_in *sin; - - sin = (struct sockaddr_in *)&rt->ro_dst; - - lck_mtx_lock(rt_mtx); - if (rt->ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr || - rt->ro_rt->generation_id != route_generation) { - if (rt->ro_rt) { - rtfree_locked(rt->ro_rt); - rt->ro_rt = 0; - } - sin->sin_family = AF_INET; - sin->sin_len = sizeof(*sin); - sin->sin_addr = dst; - - rtalloc_ign_locked(rt, RTF_PRCLONING); - } - if (rt->ro_rt == 0) { - lck_mtx_unlock(rt_mtx); - return ((struct in_ifaddr *)0); - } - - if (rt->ro_rt->rt_ifa) - ifaref(rt->ro_rt->rt_ifa); - lck_mtx_unlock(rt_mtx); - return ((struct in_ifaddr *) rt->ro_rt->rt_ifa); + struct sockaddr_in *sin; + struct ifaddr *rt_ifa; + struct route ro; + + bzero(&ro, sizeof (ro)); + sin = (struct sockaddr_in *)&ro.ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof (*sin); + sin->sin_addr = dst; + + rtalloc_ign(&ro, RTF_PRCLONING); + if (ro.ro_rt == NULL) + return (NULL); + + RT_LOCK(ro.ro_rt); + if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) + IFA_ADDREF(rt_ifa); + RT_UNLOCK(ro.ro_rt); + rtfree(ro.ro_rt); + + return ((struct in_ifaddr *)rt_ifa); } /* @@ -1833,9 +2113,7 @@ ip_rtaddr(dst, rt) * to be picked up later by ip_srcroute if the receiver is interested. */ void -save_rte(option, dst) - u_char *option; - struct in_addr dst; +save_rte(u_char *option, struct in_addr dst) { unsigned olen; @@ -1857,10 +2135,10 @@ save_rte(option, dst) * The first hop is placed before the options, will be removed later. */ struct mbuf * -ip_srcroute() +ip_srcroute(void) { - register struct in_addr *p, *q; - register struct mbuf *m; + struct in_addr *p, *q; + struct mbuf *m; if (ip_nhops == 0) return ((struct mbuf *)0); @@ -1885,7 +2163,7 @@ ip_srcroute() *(mtod(m, struct in_addr *)) = *p--; #if DIAGNOSTIC if (ipprintfs) - printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr)); + printf(" hops %lx", (u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr)); #endif /* @@ -1905,7 +2183,7 @@ ip_srcroute() while (p >= ip_srcrt.route) { #if DIAGNOSTIC if (ipprintfs) - printf(" %lx", (u_long)ntohl(q->s_addr)); + printf(" %lx", (u_int32_t)ntohl(q->s_addr)); #endif *q++ = *p--; } @@ -1915,7 +2193,7 @@ ip_srcroute() *q = ip_srcrt.dst; #if DIAGNOSTIC if (ipprintfs) - printf(" %lx\n", (u_long)ntohl(q->s_addr)); + printf(" %lx\n", (u_int32_t)ntohl(q->s_addr)); #endif return (m); } @@ -1928,13 +2206,11 @@ ip_srcroute() * XXX should be deleted; last arg currently ignored. */ void -ip_stripoptions(m, mopt) - register struct mbuf *m; - struct mbuf *mopt; +ip_stripoptions(struct mbuf *m, __unused struct mbuf *mopt) { - register int i; + int i; struct ip *ip = mtod(m, struct ip *); - register caddr_t opts; + caddr_t opts; int olen; olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); @@ -1950,12 +2226,82 @@ ip_stripoptions(m, mopt) u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, - EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, + ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, 0, 0, ENOPROTOOPT, ECONNREFUSED }; +static int +sysctl_ipforwarding SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int i, was_ipforwarding = ipforwarding; + + i = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if (i != 0 || req->newptr == USER_ADDR_NULL) + return (i); + + if (was_ipforwarding && !ipforwarding) { + /* clean up IPv4 forwarding cached routes */ + ifnet_head_lock_shared(); + for (i = 0; i <= if_index; i++) { + struct ifnet *ifp = ifindex2ifnet[i]; + if (ifp != NULL) { + lck_mtx_lock(&ifp->if_cached_route_lock); + if (ifp->if_fwd_route.ro_rt != NULL) + rtfree(ifp->if_fwd_route.ro_rt); + bzero(&ifp->if_fwd_route, + sizeof (ifp->if_fwd_route)); + lck_mtx_unlock(&ifp->if_cached_route_lock); + } + } + ifnet_head_done(); + } + + return (0); +} + +/* + * Similar to inp_route_{copyout,copyin} routines except that these copy + * out the cached IPv4 forwarding route from struct ifnet instead of the + * inpcb. See comments for those routines for explanations. + */ +static void +ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst) +{ + struct route *src = &ifp->if_fwd_route; + + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + /* Minor sanity check */ + if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) + panic("%s: wrong or corrupted route: %p", __func__, src); + + route_copyout(dst, src, sizeof(*dst)); + + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + +static void +ip_fwd_route_copyin(struct ifnet *ifp, struct route *src) +{ + struct route *dst = &ifp->if_fwd_route; + + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + /* Minor sanity check */ + if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) + panic("%s: wrong or corrupted route: %p", __func__, src); + + if (ifp->if_fwd_cacheok) + route_copyin(src, dst, sizeof(*src)); + + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful @@ -1971,37 +2317,46 @@ u_char inetctlerrmap[PRC_NCMDS] = { * via a source route. */ static void -ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route *ipforward_rt) +ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) { - register struct ip *ip = mtod(m, struct ip *); - register struct sockaddr_in *sin; - register struct rtentry *rt; +#if !IPFIREWALL +#pragma unused(next_hop) +#endif + struct ip *ip = mtod(m, struct ip *); + struct sockaddr_in *sin; + struct rtentry *rt; + struct route fwd_rt; int error, type = 0, code = 0; struct mbuf *mcopy; n_long dest; struct in_addr pkt_dst; - struct ifnet *destifp; -#if IPSEC - struct ifnet dummyifp; -#endif + u_int32_t nextmtu = 0; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct ifnet *ifp = m->m_pkthdr.rcvif; +#if PF + struct pf_mtag *pf_mtag; +#endif /* PF */ dest = 0; +#if IPFIREWALL /* * Cache the destination address of the packet; this may be * changed by use of 'ipfw fwd'. */ pkt_dst = next_hop ? next_hop->sin_addr : ip->ip_dst; +#else + pkt_dst = ip->ip_dst; +#endif #if DIAGNOSTIC if (ipprintfs) printf("forward: src %lx dst %lx ttl %x\n", - (u_long)ip->ip_src.s_addr, (u_long)pkt_dst.s_addr, + (u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr, ip->ip_ttl); #endif - if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(pkt_dst) == 0) { - ipstat.ips_cantforward++; + OSAddAtomic(1, &ipstat.ips_cantforward); m_freem(m); return; } @@ -2017,25 +2372,33 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route } #endif - sin = (struct sockaddr_in *)&ipforward_rt->ro_dst; - if ((rt = ipforward_rt->ro_rt) == 0 || - pkt_dst.s_addr != sin->sin_addr.s_addr || - ipforward_rt->ro_rt->generation_id != route_generation) { - if (ipforward_rt->ro_rt) { - rtfree(ipforward_rt->ro_rt); - ipforward_rt->ro_rt = 0; +#if PF + pf_mtag = pf_find_mtag(m); + if (pf_mtag != NULL && pf_mtag->rtableid != IFSCOPE_NONE) + ipoa.ipoa_boundif = pf_mtag->rtableid; +#endif /* PF */ + + ip_fwd_route_copyout(ifp, &fwd_rt); + + sin = (struct sockaddr_in *)&fwd_rt.ro_dst; + if (fwd_rt.ro_rt == NULL || + fwd_rt.ro_rt->generation_id != route_generation || + pkt_dst.s_addr != sin->sin_addr.s_addr) { + if (fwd_rt.ro_rt != NULL) { + rtfree(fwd_rt.ro_rt); + fwd_rt.ro_rt = NULL; } sin->sin_family = AF_INET; - sin->sin_len = sizeof(*sin); + sin->sin_len = sizeof (*sin); sin->sin_addr = pkt_dst; - rtalloc_ign(ipforward_rt, RTF_PRCLONING); - if (ipforward_rt->ro_rt == 0) { + rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif); + if (fwd_rt.ro_rt == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); - return; + goto done; } - rt = ipforward_rt->ro_rt; } + rt = fwd_rt.ro_rt; /* * Save the IP header and at most 8 bytes of the payload, @@ -2070,69 +2433,82 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ -#define satosin(sa) ((struct sockaddr_in *)(sa)) + RT_LOCK_SPIN(rt); if (rt->rt_ifp == m->m_pkthdr.rcvif && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0 && - ipsendredirects && !srcrt) { -#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) - u_long src = ntohl(ip->ip_src.s_addr); - - if (RTA(rt) && - (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { - if (rt->rt_flags & RTF_GATEWAY) - dest = satosin(rt->rt_gateway)->sin_addr.s_addr; - else - dest = pkt_dst.s_addr; - /* Router requirements says to only send host redirects */ - type = ICMP_REDIRECT; - code = ICMP_REDIRECT_HOST; + ipsendredirects && !srcrt && rt->rt_ifa != NULL) { + struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa; + u_int32_t src = ntohl(ip->ip_src.s_addr); + + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_LOCK_SPIN(&ia->ia_ifa); + if ((src & ia->ia_subnetmask) == ia->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = pkt_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; #if DIAGNOSTIC - if (ipprintfs) - printf("redirect (%d) to %lx\n", code, (u_long)dest); + if (ipprintfs) + printf("redirect (%d) to %lx\n", code, (u_int32_t)dest); #endif } + IFA_UNLOCK(&ia->ia_ifa); } + RT_UNLOCK(rt); - { +#if IPFIREWALL if (next_hop) { /* Pass IPFORWARD info if available */ struct m_tag *tag; struct ip_fwd_tag *ipfwd_tag; - - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof(struct sockaddr_in), M_NOWAIT); + + tag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, + sizeof (*ipfwd_tag), M_NOWAIT, m); if (tag == NULL) { error = ENOBUFS; m_freem(m); - return; + goto done; } - + ipfwd_tag = (struct ip_fwd_tag *)(tag+1); ipfwd_tag->next_hop = next_hop; m_tag_prepend(m, tag); } - error = ip_output_list(m, 0, (struct mbuf *)0, ipforward_rt, - IP_FORWARDING, 0); - } - if (error) - ipstat.ips_cantforward++; - else { - ipstat.ips_forward++; +#endif + error = ip_output_list(m, 0, NULL, &fwd_rt, + IP_FORWARDING | IP_OUTARGS, 0, &ipoa); + + /* Refresh rt since the route could have changed while in IP */ + rt = fwd_rt.ro_rt; + + if (error) { + OSAddAtomic(1, &ipstat.ips_cantforward); + } else { + OSAddAtomic(1, &ipstat.ips_forward); if (type) - ipstat.ips_redirectsent++; + OSAddAtomic(1, &ipstat.ips_redirectsent); else { if (mcopy) { - ipflow_create(ipforward_rt, mcopy); + /* + * If we didn't have to go thru ipflow and + * the packet was successfully consumed by + * ip_output, the mcopy is rather a waste; + * this could be further optimized. + */ m_freem(mcopy); } - return; + goto done; } } if (mcopy == NULL) - return; - destifp = NULL; + goto done; switch (error) { @@ -2153,8 +2529,12 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifndef IPSEC - if (ipforward_rt->ro_rt) - destifp = ipforward_rt->ro_rt->rt_ifp; + if (rt != NULL) { + RT_LOCK_SPIN(rt); + if (rt->rt_ifp != NULL) + nextmtu = rt->rt_ifp->if_mtu; + RT_UNLOCK(rt); + } #else /* * If the packet is routed over IPsec tunnel, tell the @@ -2162,59 +2542,86 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz * XXX quickhack!!! */ - if (ipforward_rt->ro_rt) { + if (rt != NULL) { struct secpolicy *sp = NULL; int ipsecerror; int ipsechdr; struct route *ro; + RT_LOCK_SPIN(rt); + if (rt->rt_ifp != NULL) + nextmtu = rt->rt_ifp->if_mtu; + RT_UNLOCK(rt); + if (ipsec_bypass) { - destifp = ipforward_rt->ro_rt->rt_ifp; - ipstat.ips_cantfrag++; + OSAddAtomic(1, &ipstat.ips_cantfrag); break; } - lck_mtx_lock(sadb_mutex); sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); - if (sp == NULL) - destifp = ipforward_rt->ro_rt->rt_ifp; - else { + if (sp != NULL) { /* count IPsec header size */ - ipsechdr = ipsec4_hdrsiz(mcopy, - IPSEC_DIR_OUTBOUND, - NULL); + ipsechdr = ipsec_hdrsiz(sp); /* * find the correct route for outer IPv4 * header, compute tunnel MTU. - * - * XXX BUG ALERT - * The "dummyifp" code relies upon the fact - * that icmp_error() touches only ifp->if_mtu. */ - /*XXX*/ - destifp = NULL; - if (sp->req != NULL - && sp->req->sav != NULL - && sp->req->sav->sah != NULL) { - ro = &sp->req->sav->sah->sa_route; - if (ro->ro_rt && ro->ro_rt->rt_ifp) { - dummyifp.if_mtu = - ro->ro_rt->rt_ifp->if_mtu; - dummyifp.if_mtu -= ipsechdr; - destifp = &dummyifp; + nextmtu = 0; + + if (sp->req != NULL) { + if (sp->req->saidx.mode == IPSEC_MODE_TUNNEL) { + struct secasindex saidx; + struct ip *ipm; + struct secasvar *sav; + + ipm = mtod(mcopy, struct ip *); + bcopy(&sp->req->saidx, &saidx, sizeof(saidx)); + saidx.mode = sp->req->saidx.mode; + saidx.reqid = sp->req->saidx.reqid; + sin = (struct sockaddr_in *)&saidx.src; + if (sin->sin_len == 0) { + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_port = IPSEC_PORT_ANY; + bcopy(&ipm->ip_src, &sin->sin_addr, + sizeof(sin->sin_addr)); + } + sin = (struct sockaddr_in *)&saidx.dst; + if (sin->sin_len == 0) { + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_port = IPSEC_PORT_ANY; + bcopy(&ipm->ip_dst, &sin->sin_addr, + sizeof(sin->sin_addr)); + } + sav = key_allocsa_policy(&saidx); + if (sav != NULL) { + lck_mtx_lock(sadb_mutex); + if (sav->sah != NULL) { + ro = &sav->sah->sa_route; + if (ro->ro_rt != NULL) { + RT_LOCK(ro->ro_rt); + if (ro->ro_rt->rt_ifp != NULL) { + nextmtu = ro->ro_rt->rt_ifp->if_mtu; + nextmtu -= ipsechdr; + } + RT_UNLOCK(ro->ro_rt); + } + } + key_freesav(sav, KEY_SADB_LOCKED); + lck_mtx_unlock(sadb_mutex); + } } } - - key_freesp(sp); + key_freesp(sp, KEY_SADB_UNLOCKED); } - lck_mtx_unlock(sadb_mutex); } #endif /*IPSEC*/ - ipstat.ips_cantfrag++; + OSAddAtomic(1, &ipstat.ips_cantfrag); break; case ENOBUFS: @@ -2224,32 +2631,49 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route case EACCES: /* ipfw denied packet */ m_freem(mcopy); - return; + goto done; } - icmp_error(mcopy, type, code, dest, destifp); + + icmp_error(mcopy, type, code, dest, nextmtu); +done: + ip_fwd_route_copyin(ifp, &fwd_rt); } -void +int ip_savecontrol( - register struct inpcb *inp, - register struct mbuf **mp, - register struct ip *ip, - register struct mbuf *m) + struct inpcb *inp, + struct mbuf **mp, + struct ip *ip, + struct mbuf *m) { + *mp = NULL; if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; microtime(&tv); - *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), - SCM_TIMESTAMP, SOL_SOCKET); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET, mp); + if (*mp == NULL) { + goto no_mbufs; + } } + if ((inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + uint64_t time; + + time = mach_absolute_time(); + mp = sbcreatecontrol_mbuf((caddr_t) &time, sizeof(time), + SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp); + + if (*mp == NULL) { + goto no_mbufs; + } + } if (inp->inp_flags & INP_RECVDSTADDR) { - *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, - sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } #ifdef notyet /* XXX @@ -2258,17 +2682,19 @@ ip_savecontrol( */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { - *mp = sbcreatecontrol((caddr_t) opts_deleted_above, - sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { - *mp = sbcreatecontrol((caddr_t) ip_srcroute(), - sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) ip_srcroute(), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } #endif if (inp->inp_flags & INP_RECVIF) { @@ -2281,24 +2707,27 @@ ip_savecontrol( struct sockaddr_dl *sdl2 = &sdlbuf.sdl; ifnet_head_lock_shared(); - if (((ifp = m->m_pkthdr.rcvif)) - && ( ifp->if_index && (ifp->if_index <= if_index))) { + if ((ifp = m->m_pkthdr.rcvif) != NULL && + ifp->if_index && (ifp->if_index <= if_index)) { struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1]; - + if (!ifa || !ifa->ifa_addr) goto makedummy; - + + IFA_LOCK_SPIN(ifa); sdp = (struct sockaddr_dl *)ifa->ifa_addr; /* * Change our mind and don't try copy. */ - if ((sdp->sdl_family != AF_LINK) - || (sdp->sdl_len > sizeof(sdlbuf))) { + if ((sdp->sdl_family != AF_LINK) || + (sdp->sdl_len > sizeof(sdlbuf))) { + IFA_UNLOCK(ifa); goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); + IFA_UNLOCK(ifa); } else { -makedummy: +makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; @@ -2306,15 +2735,46 @@ makedummy: sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } ifnet_head_done(); - *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, - IP_RECVIF, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } if (inp->inp_flags & INP_RECVTTL) { - *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP); - if (*mp) mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl, sizeof(ip->ip_ttl), + IP_RECVTTL, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } + if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { + int tc = m->m_pkthdr.prio; + + mp = sbcreatecontrol_mbuf((caddr_t) &tc, sizeof(tc), + SO_TRAFFIC_CLASS, SOL_SOCKET, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } + if (inp->inp_flags & INP_PKTINFO) { + struct in_pktinfo pi; + + bzero(&pi, sizeof(struct in_pktinfo)); + bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof(struct in_addr)); + pi.ipi_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; + + mp = sbcreatecontrol_mbuf((caddr_t)&pi, sizeof(struct in_pktinfo), + IP_RECVPKTINFO, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } + return 0; + +no_mbufs: + ipstat.ips_pktdropcntrl++; + return ENOBUFS; } int