X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/39236c6e673c41db228275375ab7fdb0f837b292..4d15aeb193b2c68f1d38666c317f8d3734f5f083:/bsd/netinet/ip_input.c diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 99d474d3c..3b32f0dde 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,6 +102,8 @@ #include #include #include +#include +#include #if PF #include #endif /* PF */ @@ -150,12 +152,6 @@ decl_lck_mtx_data(, sadb_stat_mutex_data); lck_mtx_t *sadb_stat_mutex = &sadb_stat_mutex_data; #endif /* IPSEC */ -#if MROUTING -int rsvp_on = 0; -static int ip_rsvp_on; -struct socket *ip_rsvpd; -#endif /* MROUTING */ - MBUFQ_HEAD(fq_head); static int frag_timeout_run; /* frag timer is scheduled to run */ @@ -165,6 +161,8 @@ static void frag_sched_timeout(void); static struct ipq *ipq_alloc(int); static void ipq_free(struct ipq *); static void ipq_updateparams(void); +static void ip_input_second_pass(struct mbuf *, struct ifnet *, + u_int32_t, int, int, struct ip_fw_in_args *, int); decl_lck_mtx_data(static, ipqlock); static lck_attr_t *ipqlock_attr; @@ -190,6 +188,12 @@ static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS; static int sysctl_maxnipq SYSCTL_HANDLER_ARGS; static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS; +#if (DEBUG || DEVELOPMENT) +static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS; +static int sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS; +static int sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS; +#endif /* (DEBUG || DEVELOPMENT) */ + int ipforwarding = 0; SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ipforwarding, 0, @@ -231,10 +235,6 @@ SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket, sysctl_maxfragsperpacket, "I", "Maximum number of IPv4 fragments allowed per packet"); -int ip_doscopedroute = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, - &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); - static uint32_t ip_adj_clear_hwcksum = 0; SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, 0, @@ -257,6 +257,33 @@ static int ip_checkinterface = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); +static int ip_chaining = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chaining, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_chaining, 1, "Do receive side ip address based chaining"); + +static int ip_chainsz = 6; +SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_chainsz, 1, "IP receive side max chaining"); + +#if (DEBUG || DEVELOPMENT) +static int ip_input_measure = 0; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_input_measure, 0, sysctl_reset_ip_input_stats, "I", "Do time measurement"); + +static uint64_t ip_input_measure_bins = 0; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_bins, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_input_measure_bins, 0, + sysctl_ip_input_measure_bins, "I", + "bins for chaining performance data histogram"); + +static net_perf_t net_perf; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_data, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_ip_input_getperf, "S,net_perf", + "IP 
input performance data (struct net_perf, net/net_perf.h)"); +#endif /* (DEBUG || DEVELOPMENT) */ + #if DIAGNOSTIC static int ipprintfs = 0; #endif @@ -279,7 +306,8 @@ static u_int32_t inaddr_hashp; /* next largest prime */ static int ip_getstat SYSCTL_HANDLER_ARGS; struct ipstat ipstat; -SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, +SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, ip_getstat, "S,ipstat", "IP statistics (struct ipstat, netinet/ip_var.h)"); @@ -397,6 +425,25 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED, */ static gre_input_func_t gre_input_func; +static void +ip_init_delayed(void) +{ + struct ifreq ifr; + int error; + struct sockaddr_in *sin; + + bzero(&ifr, sizeof(ifr)); + strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name)); + sin = (struct sockaddr_in *)(void *)&ifr.ifr_addr; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + error = in_control(NULL, SIOCSIFADDR, (caddr_t)&ifr, lo_ifp, kernproc); + if (error) + printf("%s: failed to initialise lo0's address, error=%d\n", + __func__, error); +} + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -425,9 +472,6 @@ ip_init(struct protosw *pp, struct domain *dp) return; ip_initialized = 1; - PE_parse_boot_argn("net.inet.ip.scopedroute", - &ip_doscopedroute, sizeof (ip_doscopedroute)); - in_ifaddr_init(); in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); @@ -498,6 +542,7 @@ ip_init(struct protosw *pp, struct domain *dp) #endif arp_init(); + net_init_add(ip_init_delayed); } /* @@ -511,136 +556,1220 @@ in_ifaddrhashtbl_init(void) if (in_ifaddrhashtbl != NULL) return; - PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash, - sizeof (inaddr_nhash)); - if (inaddr_nhash == 0) - inaddr_nhash = INADDR_NHASH; + PE_parse_boot_argn("inaddr_nhash", &inaddr_nhash, + sizeof (inaddr_nhash)); + if (inaddr_nhash == 0) + inaddr_nhash = INADDR_NHASH; + + MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *, + inaddr_nhash * sizeof (*in_ifaddrhashtbl), + M_IFADDR, M_WAITOK | M_ZERO); + if (in_ifaddrhashtbl == NULL) + panic("in_ifaddrhashtbl_init allocation failed"); + + /* + * Generate the next largest prime greater than inaddr_nhash. + */ + k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2; + for (;;) { + p = 1; + for (i = 3; i * i <= k; i += 2) { + if (k % i == 0) + p = 0; + } + if (p == 1) + break; + k += 2; + } + inaddr_hashp = k; +} + +u_int32_t +inaddr_hashval(u_int32_t key) +{ + /* + * The hash index is the computed prime times the key modulo + * the hash size, as documented in "Introduction to Algorithms" + * (Cormen, Leiserson, Rivest). 
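 *
 * Reduced to a stand-alone sketch (the DEMO_* names are
 * hypothetical; 4099 is the next prime above a table size of
 * 4096, mirroring the prime search in in_ifaddrhashtbl_init()
 * above):
 *
 *	#include <stdint.h>
 *
 *	#define DEMO_NHASH	4096	/* assumed table size */
 *	#define DEMO_HASHP	4099	/* next prime > DEMO_NHASH */
 *
 *	static uint32_t
 *	demo_hashval(uint32_t key)
 *	{
 *		/* prime multiplier, then reduce modulo table size */
 *		return ((key * DEMO_HASHP) % DEMO_NHASH);
 *	}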
+ */ + if (inaddr_nhash > 1) + return ((key * inaddr_hashp) % inaddr_nhash); + else + return (0); +} + +void +ip_proto_dispatch_in_wrapper(struct mbuf *m, int hlen, u_int8_t proto) +{ + ip_proto_dispatch_in(m, hlen, proto, 0); +} + +__private_extern__ void +ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto, + ipfilter_t inject_ipfref) +{ + struct ipfilter *filter; + int seen = (inject_ipfref == NULL); + int changed_header = 0; + struct ip *ip; + void (*pr_input)(struct mbuf *, int len); + + if (!TAILQ_EMPTY(&ipv4_filters)) { + ipf_ref(); + TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { + if (seen == 0) { + if ((struct ipfilter *)inject_ipfref == filter) + seen = 1; + } else if (filter->ipf_filter.ipf_input) { + errno_t result; + + if (changed_header == 0) { + /* + * Perform IP header alignment fixup, + * if needed, before passing packet + * into filter(s). + */ + IP_HDR_ALIGNMENT_FIXUP(m, + m->m_pkthdr.rcvif, ipf_unref()); + + /* ipf_unref() already called */ + if (m == NULL) + return; + + changed_header = 1; + ip = mtod(m, struct ip *); + ip->ip_len = htons(ip->ip_len + hlen); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + ip->ip_sum = ip_cksum_hdr_in(m, hlen); + } + result = filter->ipf_filter.ipf_input( + filter->ipf_filter.cookie, (mbuf_t *)&m, + hlen, proto); + if (result == EJUSTRETURN) { + ipf_unref(); + return; + } + if (result != 0) { + ipf_unref(); + m_freem(m); + return; + } + } + } + ipf_unref(); + } + + /* Perform IP header alignment fixup (post-filters), if needed */ + IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return); + + /* + * If there isn't a specific lock for the protocol + * we're about to call, use the generic lock for AF_INET. + * otherwise let the protocol deal with its own locking + */ + ip = mtod(m, struct ip *); + + if (changed_header) { + ip->ip_len = ntohs(ip->ip_len) - hlen; + ip->ip_off = ntohs(ip->ip_off); + } + + if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) { + m_freem(m); + } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) { + lck_mtx_lock(inet_domain_mutex); + pr_input(m, hlen); + lck_mtx_unlock(inet_domain_mutex); + } else { + pr_input(m, hlen); + } +} + +struct pktchain_elm { + struct mbuf *pkte_head; + struct mbuf *pkte_tail; + struct in_addr pkte_saddr; + struct in_addr pkte_daddr; + uint16_t pkte_npkts; + uint16_t pkte_proto; + uint32_t pkte_nbytes; +}; + +typedef struct pktchain_elm pktchain_elm_t; + +/* Store upto PKTTBL_SZ unique flows on the stack */ +#define PKTTBL_SZ 7 + +static struct mbuf * +ip_chain_insert(struct mbuf *packet, pktchain_elm_t *tbl) +{ + struct ip* ip; + int pkttbl_idx = 0; + + ip = mtod(packet, struct ip*); + + /* reusing the hash function from inaddr_hashval */ + pkttbl_idx = inaddr_hashval(ntohs(ip->ip_src.s_addr)) % PKTTBL_SZ; + if (tbl[pkttbl_idx].pkte_head == NULL) { + tbl[pkttbl_idx].pkte_head = packet; + tbl[pkttbl_idx].pkte_saddr.s_addr = ip->ip_src.s_addr; + tbl[pkttbl_idx].pkte_daddr.s_addr = ip->ip_dst.s_addr; + tbl[pkttbl_idx].pkte_proto = ip->ip_p; + } else { + if ((ip->ip_dst.s_addr == tbl[pkttbl_idx].pkte_daddr.s_addr) && + (ip->ip_src.s_addr == tbl[pkttbl_idx].pkte_saddr.s_addr) && + (ip->ip_p == tbl[pkttbl_idx].pkte_proto)) { + } else { + return (packet); + } + } + if (tbl[pkttbl_idx].pkte_tail != NULL) + mbuf_setnextpkt(tbl[pkttbl_idx].pkte_tail, packet); + + tbl[pkttbl_idx].pkte_tail = packet; + tbl[pkttbl_idx].pkte_npkts += 1; + tbl[pkttbl_idx].pkte_nbytes += packet->m_pkthdr.len; + return (NULL); +} + +/* args is a dummy variable here for backward compatibility 
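 * and is passed through unchanged to ip_input_second_pass() for
 * each non-empty bucket drained below.
 *
 * For reference, the bucket policy of ip_chain_insert() above,
 * reduced to a stand-alone sketch (demo_flow and demo_insert are
 * hypothetical names used only for illustration):
 *
 *	#include <stdint.h>
 *
 *	struct demo_flow {
 *		uint32_t src, dst;	/* flow key */
 *		uint8_t	proto;
 *		int	npkts;		/* bucket empty iff 0 */
 *	};
 *
 *	/* 0 if chained into the bucket, -1 on a flow collision */
 *	static int
 *	demo_insert(struct demo_flow *tbl, unsigned nslots,
 *	    uint32_t src, uint32_t dst, uint8_t proto)
 *	{
 *		struct demo_flow *f = &tbl[(src * 4099U) % nslots];
 *
 *		if (f->npkts == 0) {
 *			/* claim the empty bucket for this flow */
 *			f->src = src; f->dst = dst; f->proto = proto;
 *		} else if (f->src != src || f->dst != dst ||
 *		    f->proto != proto) {
 *			return (-1);	/* different flow, don't chain */
 *		}
 *		f->npkts++;
 *		return (0);
 *	}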
*/ +static void +ip_input_second_pass_loop_tbl(pktchain_elm_t *tbl, struct ip_fw_in_args *args) +{ + int i = 0; + + for (i = 0; i < PKTTBL_SZ; i++) { + if (tbl[i].pkte_head != NULL) { + struct mbuf *m = tbl[i].pkte_head; + ip_input_second_pass(m, m->m_pkthdr.rcvif, 0, + tbl[i].pkte_npkts, tbl[i].pkte_nbytes, args, 0); + + if (tbl[i].pkte_npkts > 2) + ipstat.ips_rxc_chainsz_gt2++; + if (tbl[i].pkte_npkts > 4) + ipstat.ips_rxc_chainsz_gt4++; +#if (DEBUG || DEVELOPMENT) + if (ip_input_measure) + net_perf_histogram(&net_perf, tbl[i].pkte_npkts); +#endif /* (DEBUG || DEVELOPMENT) */ + tbl[i].pkte_head = tbl[i].pkte_tail = NULL; + tbl[i].pkte_npkts = 0; + tbl[i].pkte_nbytes = 0; + /* no need to initialize address and protocol in tbl */ + } + } +} + +static void +ip_input_cpout_args(struct ip_fw_in_args *args, struct ip_fw_args *args1, + boolean_t *done_init) +{ + if (*done_init == FALSE) { + bzero(args1, sizeof(struct ip_fw_args)); + *done_init = TRUE; + } + args1->fwa_next_hop = args->fwai_next_hop; + args1->fwa_ipfw_rule = args->fwai_ipfw_rule; + args1->fwa_pf_rule = args->fwai_pf_rule; + args1->fwa_divert_rule = args->fwai_divert_rule; +} + +static void +ip_input_cpin_args(struct ip_fw_args *args1, struct ip_fw_in_args *args) +{ + args->fwai_next_hop = args1->fwa_next_hop; + args->fwai_ipfw_rule = args1->fwa_ipfw_rule; + args->fwai_pf_rule = args1->fwa_pf_rule; + args->fwai_divert_rule = args1->fwa_divert_rule; +} + +typedef enum { + IPINPUT_DOCHAIN = 0, + IPINPUT_DONTCHAIN, + IPINPUT_FREED, + IPINPUT_DONE +} ipinput_chain_ret_t; + +static void +ip_input_update_nstat(struct ifnet *ifp, struct in_addr src_ip, + u_int32_t packets, u_int32_t bytes) +{ + if (nstat_collect) { + struct rtentry *rt = ifnet_cached_rtlookup_inet(ifp, + src_ip); + if (rt != NULL) { + nstat_route_rx(rt, packets, bytes, 0); + rtfree(rt); + } + } +} + +static void +ip_input_dispatch_chain(struct mbuf *m) +{ + struct mbuf *tmp_mbuf = m; + struct mbuf *nxt_mbuf = NULL; + struct ip *ip = NULL; + unsigned int hlen; + + ip = mtod(tmp_mbuf, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + while(tmp_mbuf) { + nxt_mbuf = mbuf_nextpkt(tmp_mbuf); + mbuf_setnextpkt(tmp_mbuf, NULL); + + if ((sw_lro) && (ip->ip_p == IPPROTO_TCP)) + tmp_mbuf = tcp_lro(tmp_mbuf, hlen); + if (tmp_mbuf) + ip_proto_dispatch_in(tmp_mbuf, hlen, ip->ip_p, 0); + tmp_mbuf = nxt_mbuf; + if (tmp_mbuf) { + ip = mtod(tmp_mbuf, struct ip *); + /* first mbuf of chain already has adjusted ip_len */ + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + ip->ip_len -= hlen; + } + } +} + +static void +ip_input_setdst_chain(struct mbuf *m, uint32_t ifindex, struct in_ifaddr *ia) +{ + struct mbuf *tmp_mbuf = m; + + while (tmp_mbuf) { + ip_setdstifaddr_info(tmp_mbuf, ifindex, ia); + tmp_mbuf = mbuf_nextpkt(tmp_mbuf); + } +} + +/* + * First pass does all essential packet validation and places on a per flow + * queue for doing operations that have same outcome for all packets of a flow. 
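 * The second pass then does the per-flow work (address lookup,
 * forwarding decision and final delivery) once per chain rather
 * than once per packet.
 *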
+ * div_info is packet divert/tee info + */ +static ipinput_chain_ret_t +ip_input_first_pass(struct mbuf *m, u_int32_t *div_info, + struct ip_fw_in_args *args, int *ours, struct mbuf **modm) +{ + struct ip *ip; + struct ifnet *inifp; + unsigned int hlen; + int retval = IPINPUT_DOCHAIN; + int len = 0; + struct in_addr src_ip; +#if IPFIREWALL + int i; +#endif +#if IPFIREWALL || DUMMYNET + struct m_tag *copy; + struct m_tag *p; + boolean_t delete = FALSE; + struct ip_fw_args args1; + boolean_t init = FALSE; +#endif + ipfilter_t inject_filter_ref = NULL; + +#if !IPFIREWALL +#pragma unused (args) +#endif + +#if !IPDIVERT +#pragma unused (div_info) +#pragma unused (ours) +#endif + +#if !IPFIREWALL_FORWARD +#pragma unused (ours) +#endif + + /* Check if the mbuf is still valid after interface filter processing */ + MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); + inifp = mbuf_pkthdr_rcvif(m); + VERIFY(inifp != NULL); + + /* Perform IP header alignment fixup, if needed */ + IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad); + + m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED; + +#if IPFIREWALL || DUMMYNET + + /* + * Don't bother searching for tag(s) if there's none. + */ + if (SLIST_EMPTY(&m->m_pkthdr.tags)) + goto ipfw_tags_done; + + /* Grab info from mtags prepended to the chain */ + p = m_tag_first(m); + while (p) { + if (p->m_tag_id == KERNEL_MODULE_TAG_ID) { +#if DUMMYNET + if (p->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET) { + struct dn_pkt_tag *dn_tag; + + dn_tag = (struct dn_pkt_tag *)(p+1); + args->fwai_ipfw_rule = dn_tag->dn_ipfw_rule; + args->fwai_pf_rule = dn_tag->dn_pf_rule; + delete = TRUE; + } +#endif + +#if IPDIVERT + if (p->m_tag_type == KERNEL_TAG_TYPE_DIVERT) { + struct divert_tag *div_tag; + + div_tag = (struct divert_tag *)(p+1); + args->fwai_divert_rule = div_tag->cookie; + delete = TRUE; + } +#endif + + if (p->m_tag_type == KERNEL_TAG_TYPE_IPFORWARD) { + struct ip_fwd_tag *ipfwd_tag; + + ipfwd_tag = (struct ip_fwd_tag *)(p+1); + args->fwai_next_hop = ipfwd_tag->next_hop; + delete = TRUE; + } + + if (delete) { + copy = p; + p = m_tag_next(m, p); + m_tag_delete(m, copy); + } else { + p = m_tag_next(m, p); + } + } else { + p = m_tag_next(m, p); + } + } + +#if DIAGNOSTIC + if (m == NULL || !(m->m_flags & M_PKTHDR)) + panic("ip_input no HDR"); +#endif + +#if DUMMYNET + if (args->fwai_ipfw_rule || args->fwai_pf_rule) { + /* dummynet already filtered us */ + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + inject_filter_ref = ipf_get_inject_filter(m); +#if IPFIREWALL + if (args->fwai_ipfw_rule) + goto iphack; +#endif /* IPFIREWALL */ + if (args->fwai_pf_rule) + goto check_with_pf; + } +#endif /* DUMMYNET */ +ipfw_tags_done: +#endif /* IPFIREWALL || DUMMYNET */ + + /* + * No need to process packet twice if we've already seen it. 
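 * (A packet reinjected by an ip filter carries its inject_ipfref;
 * ip_proto_dispatch_in() skips every filter up to and including
 * that one, so the packet resumes where it left off instead of
 * being refiltered from the top.)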
+ */ + if (!SLIST_EMPTY(&m->m_pkthdr.tags)) + inject_filter_ref = ipf_get_inject_filter(m); + if (inject_filter_ref != NULL) { + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, inifp, + struct ip *, ip, struct ip6_hdr *, NULL); + + ip->ip_len = ntohs(ip->ip_len) - hlen; + ip->ip_off = ntohs(ip->ip_off); + ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref); + return (IPINPUT_DONE); + } + + if (m->m_pkthdr.len < sizeof (struct ip)) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_tooshort); + m_freem(m); + return (IPINPUT_FREED); + } + + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == NULL) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_toosmall); + return (IPINPUT_FREED); + } + + ip = mtod(m, struct ip *); + *modm = m; + + KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, + ip->ip_p, ip->ip_off, ip->ip_len); + + if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_badvers); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + return (IPINPUT_FREED); + } + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + if (hlen < sizeof (struct ip)) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_badhlen); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + return (IPINPUT_FREED); + } + + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == NULL) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_badhlen); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + return (IPINPUT_FREED); + } + ip = mtod(m, struct ip *); + *modm = m; + } + + /* 127/8 must not appear on wire - RFC1122 */ + if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + /* + * Allow for the following exceptions: + * + * 1. If the packet was sent to loopback (i.e. rcvif + * would have been set earlier at output time.) + * + * 2. If the packet was sent out on loopback from a local + * source address which belongs to a non-loopback + * interface (i.e. rcvif may not necessarily be a + * loopback interface, hence the test for PKTF_LOOP.) + * Unlike IPv6, there is no interface scope ID, and + * therefore we don't care so much about PKTF_IFINFO. + */ + if (!(inifp->if_flags & IFF_LOOPBACK) && + !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_badaddr); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + return (IPINPUT_FREED); + } + } + + /* IPv4 Link-Local Addresses as defined in RFC3927 */ + if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || + IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) { + ip_linklocal_stat.iplls_in_total++; + if (ip->ip_ttl != MAXTTL) { + OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl); + /* Silently drop link local traffic with bad TTL */ + if (!ip_linklocal_in_allowbadttl) { + OSAddAtomic(1, &ipstat.ips_total); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + return (IPINPUT_FREED); + } + } + } + + if (ip_cksum(m, hlen)) { + OSAddAtomic(1, &ipstat.ips_total); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + return (IPINPUT_FREED); + } + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, inifp, + struct ip *, ip, struct ip6_hdr *, NULL); + + /* + * Convert fields to host representation. 
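 * (On a little-endian host NTOHS() swaps in place: wire bytes
 * 0x05 0x40 load as the host value 0x4005, and
 * ntohs(0x4005) == 0x0540 == 1344, the real datagram length.)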
+ */ +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_len); +#endif + + if (ip->ip_len < hlen) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_badlen); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + return (IPINPUT_FREED); + } + +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_off); +#endif + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < ip->ip_len) { + OSAddAtomic(1, &ipstat.ips_total); + OSAddAtomic(1, &ipstat.ips_tooshort); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + return (IPINPUT_FREED); + } + + if (m->m_pkthdr.len > ip->ip_len) { + /* + * Invalidate hardware checksum info if ip_adj_clear_hwcksum + * is set; useful to handle buggy drivers. Note that this + * should not be enabled by default, as we may get here due + * to link-layer padding. + */ + if (ip_adj_clear_hwcksum && + (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && + !(inifp->if_flags & IFF_LOOPBACK) && + !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) { + m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID; + m->m_pkthdr.csum_data = 0; + ipstat.ips_adj_hwcsum_clr++; + } + + ipstat.ips_adj++; + if (m->m_len == m->m_pkthdr.len) { + m->m_len = ip->ip_len; + m->m_pkthdr.len = ip->ip_len; + } else + m_adj(m, ip->ip_len - m->m_pkthdr.len); + } + + /* for consistency */ + m->m_pkthdr.pkt_proto = ip->ip_p; + + /* for netstat route statistics */ + src_ip = ip->ip_src; + len = m->m_pkthdr.len; + +#if DUMMYNET +check_with_pf: +#endif +#if PF + /* Invoke inbound packet filter */ + if (PF_IS_ENABLED) { + int error; + ip_input_cpout_args(args, &args1, &init); + +#if DUMMYNET + error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args1); +#else + error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL); +#endif /* DUMMYNET */ + if (error != 0 || m == NULL) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", + __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + ip_input_update_nstat(inifp, src_ip, 1, len); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + OSAddAtomic(1, &ipstat.ips_total); + return (IPINPUT_FREED); + } + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + *modm = m; + ip_input_cpin_args(&args1, args); + } +#endif /* PF */ + +#if IPSEC + if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) { + retval = IPINPUT_DONTCHAIN; /* XXX scope for chaining here? */ + goto pass; + } +#endif + +#if IPFIREWALL +#if DUMMYNET +iphack: +#endif /* DUMMYNET */ + /* + * Check if we want to allow this packet to be processed. + * Consider it to be bad if not. 
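 *
 * The verdict i from ip_fw_chk_ptr() below decodes as follows:
 * IP_FW_PORT_DENY_FLAG (or a freed mbuf) means drop;
 * IP_FW_PORT_DYNT_FLAG hands the packet to a dummynet pipe;
 * any other non-zero value diverts or tees it; and zero with no
 * next hop is the common pass case.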
+ */ + if (fw_enable && IPFW_LOADED) { +#if IPFIREWALL_FORWARD + /* + * If we've been forwarded from the output side, then + * skip the firewall a second time + */ + if (args->fwai_next_hop) { + *ours = 1; + return (IPINPUT_DONTCHAIN); + } +#endif /* IPFIREWALL_FORWARD */ + ip_input_cpout_args(args, &args1, &init); + args1.fwa_m = m; + + i = ip_fw_chk_ptr(&args1); + m = args1.fwa_m; + + if ((i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */ + if (m) + m_freem(m); + ip_input_update_nstat(inifp, src_ip, 1, len); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + OSAddAtomic(1, &ipstat.ips_total); + return (IPINPUT_FREED); + } + ip = mtod(m, struct ip *); /* just in case m changed */ + *modm = m; + ip_input_cpin_args(&args1, args); + + if (i == 0 && args->fwai_next_hop == NULL) { /* common case */ + goto pass; + } +#if DUMMYNET + if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) { + /* Send packet to the appropriate pipe */ + ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args1, + DN_CLIENT_IPFW); + ip_input_update_nstat(inifp, src_ip, 1, len); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + OSAddAtomic(1, &ipstat.ips_total); + return (IPINPUT_FREED); + } +#endif /* DUMMYNET */ +#if IPDIVERT + if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { + /* Divert or tee packet */ + *div_info = i; + *ours = 1; + return (IPINPUT_DONTCHAIN); + } +#endif +#if IPFIREWALL_FORWARD + if (i == 0 && args->fwai_next_hop != NULL) { + retval = IPINPUT_DONTCHAIN; + goto pass; + } +#endif + /* + * if we get here, the packet must be dropped + */ + ip_input_update_nstat(inifp, src_ip, 1, len); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); + OSAddAtomic(1, &ipstat.ips_total); + return (IPINPUT_FREED); + } +#endif /* IPFIREWALL */ +#if IPSEC | IPFIREWALL +pass: +#endif + /* + * Process options and, if not destined for us, + * ship it on. ip_dooptions returns 1 when an + * error was detected (causing an icmp message + * to be sent and the original packet to be freed). + */ + ip_nhops = 0; /* for source routed packets */ +#if IPFIREWALL + if (hlen > sizeof (struct ip) && + ip_dooptions(m, 0, args->fwai_next_hop)) { +#else /* !IPFIREWALL */ + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, NULL)) { +#endif /* !IPFIREWALL */ + ip_input_update_nstat(inifp, src_ip, 1, len); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + OSAddAtomic(1, &ipstat.ips_total); + return (IPINPUT_FREED); + } + + /* + * Don't chain fragmented packets as the process of determining + * if it is our fragment or someone else's plus the complexity of + * divert and fw args makes it harder to do chaining. 
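 *
 * (ip_off is already in host order here, so masking off IP_DF and
 * IP_RF leaves IP_MF plus the fragment offset; the test below is
 * therefore non-zero for any fragment, first or otherwise.)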
+ */ + if (ip->ip_off & ~(IP_DF | IP_RF)) + return (IPINPUT_DONTCHAIN); + + /* Allow DHCP/BootP responses through */ + if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) && + hlen == sizeof (struct ip) && ip->ip_p == IPPROTO_UDP) { + struct udpiphdr *ui; + + if (m->m_len < sizeof (struct udpiphdr) && + (m = m_pullup(m, sizeof (struct udpiphdr))) == NULL) { + OSAddAtomic(1, &udpstat.udps_hdrops); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + OSAddAtomic(1, &ipstat.ips_total); + return (IPINPUT_FREED); + } + *modm = m; + ui = mtod(m, struct udpiphdr *); + if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) { + ip_setdstifaddr_info(m, inifp->if_index, NULL); + return (IPINPUT_DONTCHAIN); + } + } + + /* Avoid chaining raw sockets as ipsec checks occur later for them */ + if (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) + return (IPINPUT_DONTCHAIN); + + return (retval); +#if !defined(__i386__) && !defined(__x86_64__) +bad: + m_freem(m); + return (IPINPUT_FREED); +#endif +} + +static void +ip_input_second_pass(struct mbuf *m, struct ifnet *inifp, u_int32_t div_info, + int npkts_in_chain, int bytes_in_chain, struct ip_fw_in_args *args, int ours) +{ + unsigned int checkif; + struct mbuf *tmp_mbuf = NULL; + struct in_ifaddr *ia = NULL; + struct in_addr pkt_dst; + unsigned int hlen; + +#if !IPFIREWALL +#pragma unused (args) +#endif + +#if !IPDIVERT +#pragma unused (div_info) +#endif + + struct ip *ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + + OSAddAtomic(npkts_in_chain, &ipstat.ips_total); + + /* + * Naively assume we can attribute inbound data to the route we would + * use to send to this destination. Asymmetric routing breaks this + * assumption, but it still allows us to account for traffic from + * a remote node in the routing table. + * this has a very significant performance impact so we bypass + * if nstat_collect is disabled. We may also bypass if the + * protocol is tcp in the future because tcp will have a route that + * we can use to attribute the data to. That does mean we would not + * account for forwarded tcp traffic. + */ + ip_input_update_nstat(inifp, ip->ip_src, npkts_in_chain, + bytes_in_chain); + + if (ours) + goto ours; + + /* + * Check our list of addresses, to see if the packet is for us. + * If we don't have any addresses, assume any unicast packet + * we receive might be for us (and let the upper layers deal + * with it). + */ + tmp_mbuf = m; + if (TAILQ_EMPTY(&in_ifaddrhead)) { + while (tmp_mbuf) { + if (!(tmp_mbuf->m_flags & (M_MCAST|M_BCAST))) { + ip_setdstifaddr_info(tmp_mbuf, inifp->if_index, + NULL); + } + tmp_mbuf = mbuf_nextpkt(tmp_mbuf); + } + goto ours; + } + /* + * Cache the destination address of the packet; this may be + * changed by use of 'ipfw fwd'. + */ +#if IPFIREWALL + pkt_dst = args->fwai_next_hop == NULL ? + ip->ip_dst : args->fwai_next_hop->sin_addr; +#else /* !IPFIREWALL */ + pkt_dst = ip->ip_dst; +#endif /* !IPFIREWALL */ + + /* + * Enable a consistency check between the destination address + * and the arrival interface for a unicast packet (the RFC 1122 + * strong ES model) if IP forwarding is disabled and the packet + * is not locally generated and the packet is not subject to + * 'ipfw fwd'. + * + * XXX - Checking also should be disabled if the destination + * address is ipnat'ed to a different interface. + * + * XXX - Checking is incompatible with IP aliases added + * to the loopback interface instead of the interface where + * the packets are received. 
+ */ + checkif = ip_checkinterface && (ipforwarding == 0) && + !(inifp->if_flags & IFF_LOOPBACK) && + !(m->m_pkthdr.pkt_flags & PKTF_LOOP) +#if IPFIREWALL + && (args->fwai_next_hop == NULL); +#else /* !IPFIREWALL */ + ; +#endif /* !IPFIREWALL */ + + /* + * Check for exact addresses in the hash bucket. + */ + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) { + /* + * If the address matches, verify that the packet + * arrived via the correct interface if checking is + * enabled. + */ + if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && + (!checkif || ia->ia_ifp == inifp)) { + ip_input_setdst_chain(m, 0, ia); + lck_rw_done(in_ifaddr_rwlock); + goto ours; + } + } + lck_rw_done(in_ifaddr_rwlock); + + /* + * Check for broadcast addresses. + * + * Only accept broadcast packets that arrive via the matching + * interface. Reception of forwarded directed broadcasts would be + * handled via ip_forward() and ether_frameout() with the loopback + * into the stack for SIMPLEX interfaces handled by ether_frameout(). + */ + if (inifp->if_flags & IFF_BROADCAST) { + struct ifaddr *ifa; + + ifnet_lock_shared(inifp); + TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) { + continue; + } + ia = ifatoia(ifa); + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + pkt_dst.s_addr || ia->ia_netbroadcast.s_addr == + pkt_dst.s_addr) { + ip_input_setdst_chain(m, 0, ia); + ifnet_lock_done(inifp); + goto ours; + } + } + ifnet_lock_done(inifp); + } + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + /* + * See if we belong to the destination multicast group on the + * arrival interface. + */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm); + in_multihead_lock_done(); + if (inm == NULL) { + OSAddAtomic(npkts_in_chain, &ipstat.ips_notmember); + m_freem_list(m); + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + return; + } + ip_input_setdst_chain(m, inifp->if_index, NULL); + INM_REMREF(inm); + goto ours; + } + + if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST || + ip->ip_dst.s_addr == INADDR_ANY) { + ip_input_setdst_chain(m, inifp->if_index, NULL); + goto ours; + } + + if (ip->ip_p == IPPROTO_UDP) { + struct udpiphdr *ui; + ui = mtod(m, struct udpiphdr *); + if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) { + goto ours; + } + } + + tmp_mbuf = m; + struct mbuf *nxt_mbuf = NULL; + while (tmp_mbuf) { + nxt_mbuf = mbuf_nextpkt(tmp_mbuf); + /* + * Not for us; forward if possible and desirable. + */ + mbuf_setnextpkt(tmp_mbuf, NULL); + if (ipforwarding == 0) { + OSAddAtomic(1, &ipstat.ips_cantforward); + m_freem(tmp_mbuf); + } else { +#if IPFIREWALL + ip_forward(tmp_mbuf, 0, args->fwai_next_hop); +#else + ip_forward(tmp_mbuf, 0, NULL); +#endif + } + tmp_mbuf = nxt_mbuf; + } + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + return; +ours: + /* + * If offset or IP_MF are set, must reassemble. + */ + if (ip->ip_off & ~(IP_DF | IP_RF)) { + VERIFY(npkts_in_chain == 1); + /* + * ip_reass() will return a different mbuf, and update + * the divert info in div_info and args->fwai_divert_rule. 
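 * ip_reass() returns NULL until the final fragment arrives and
 * the datagram is complete, which is why the caller simply
 * returns when it gets NULL back.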
+ */ +#if IPDIVERT + m = ip_reass(m, (u_int16_t *)&div_info, &args->fwai_divert_rule); +#else + m = ip_reass(m); +#endif + if (m == NULL) + return; + ip = mtod(m, struct ip *); + /* Get the header length of the reassembled packet */ + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#if IPDIVERT + /* Restore original checksum before diverting packet */ + if (div_info != 0) { + VERIFY(npkts_in_chain == 1); +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + ip->ip_sum = 0; + ip->ip_sum = ip_cksum_hdr_in(m, hlen); +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_off); + NTOHS(ip->ip_len); +#endif + } +#endif + } + + /* + * Further protocols expect the packet length to be w/o the + * IP header. + */ + ip->ip_len -= hlen; + +#if IPDIVERT + /* + * Divert or tee packet to the divert protocol if required. + * + * If div_info is zero then cookie should be too, so we shouldn't + * need to clear them here. Assume divert_packet() does so also. + */ + if (div_info != 0) { + struct mbuf *clone = NULL; + VERIFY(npkts_in_chain == 1); + + /* Clone packet if we're doing a 'tee' */ + if (div_info & IP_FW_PORT_TEE_FLAG) + clone = m_dup(m, M_DONTWAIT); + + /* Restore packet header fields to original values */ + ip->ip_len += hlen; + +#if BYTE_ORDER != BIG_ENDIAN + HTONS(ip->ip_len); + HTONS(ip->ip_off); +#endif + /* Deliver packet to divert input routine */ + OSAddAtomic(1, &ipstat.ips_delivered); + divert_packet(m, 1, div_info & 0xffff, args->fwai_divert_rule); - MALLOC(in_ifaddrhashtbl, struct in_ifaddrhashhead *, - inaddr_nhash * sizeof (*in_ifaddrhashtbl), - M_IFADDR, M_WAITOK | M_ZERO); - if (in_ifaddrhashtbl == NULL) - panic("in_ifaddrhashtbl_init allocation failed"); + /* If 'tee', continue with original packet */ + if (clone == NULL) { + return; + } + m = clone; + ip = mtod(m, struct ip *); + } +#endif +#if IPSEC /* - * Generate the next largest prime greater than inaddr_nhash. + * enforce IPsec policy checking if we are seeing last header. + * note that we do not visit this with protocols with pcb layer + * code - like udp/tcp/raw ip. */ - k = (inaddr_nhash % 2 == 0) ? inaddr_nhash + 1 : inaddr_nhash + 2; - for (;;) { - p = 1; - for (i = 3; i * i <= k; i += 2) { - if (k % i == 0) - p = 0; + if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) { + VERIFY(npkts_in_chain == 1); + if (ipsec4_in_reject(m, NULL)) { + IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); + goto bad; } - if (p == 1) - break; - k += 2; } - inaddr_hashp = k; -} +#endif /* IPSEC */ -u_int32_t -inaddr_hashval(u_int32_t key) -{ /* - * The hash index is the computed prime times the key modulo - * the hash size, as documented in "Introduction to Algorithms" - * (Cormen, Leiserson, Rivest). + * Switch out to protocol's input routine. 
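 *
 * ip_proto_dispatch_in() wraps pr_input in inet_domain_mutex for
 * protocols that lack PR_PROTOLOCK; protocols that set it (TCP,
 * as the comment below notes) are called unlocked and do their
 * own locking.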
*/ - if (inaddr_nhash > 1) - return ((key * inaddr_hashp) % inaddr_nhash); - else - return (0); -} + OSAddAtomic(npkts_in_chain, &ipstat.ips_delivered); -void -ip_proto_dispatch_in_wrapper(struct mbuf *m, int hlen, u_int8_t proto) -{ - ip_proto_dispatch_in(m, hlen, proto, 0); -} +#if IPFIREWALL + if (args->fwai_next_hop && ip->ip_p == IPPROTO_TCP) { + /* TCP needs IPFORWARD info if available */ + struct m_tag *fwd_tag; + struct ip_fwd_tag *ipfwd_tag; -__private_extern__ void -ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto, - ipfilter_t inject_ipfref) -{ - struct ipfilter *filter; - int seen = (inject_ipfref == NULL); - int changed_header = 0; - struct ip *ip; - void (*pr_input)(struct mbuf *, int len); + VERIFY(npkts_in_chain == 1); + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag), + M_NOWAIT, m); + if (fwd_tag == NULL) + goto bad; - if (!TAILQ_EMPTY(&ipv4_filters)) { - ipf_ref(); - TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { - if (seen == 0) { - if ((struct ipfilter *)inject_ipfref == filter) - seen = 1; - } else if (filter->ipf_filter.ipf_input) { - errno_t result; + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); + ipfwd_tag->next_hop = args->fwai_next_hop; - if (changed_header == 0) { - /* - * Perform IP header alignment fixup, - * if needed, before passing packet - * into filter(s). - */ - IP_HDR_ALIGNMENT_FIXUP(m, - m->m_pkthdr.rcvif, ipf_unref()); + m_tag_prepend(m, fwd_tag); - /* ipf_unref() already called */ - if (m == NULL) - return; + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); - changed_header = 1; - ip = mtod(m, struct ip *); - ip->ip_len = htons(ip->ip_len + hlen); - ip->ip_off = htons(ip->ip_off); - ip->ip_sum = 0; - ip->ip_sum = ip_cksum_hdr_in(m, hlen); - } - result = filter->ipf_filter.ipf_input( - filter->ipf_filter.cookie, (mbuf_t *)&m, - hlen, proto); - if (result == EJUSTRETURN) { - ipf_unref(); - return; - } - if (result != 0) { - ipf_unref(); - m_freem(m); - return; - } + /* TCP deals with its own locking */ + ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); + } else { + KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, + ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); + + ip_input_dispatch_chain(m); + + } +#else /* !IPFIREWALL */ + ip_input_dispatch_chain(m); + +#endif /* !IPFIREWALL */ + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + return; +bad: + KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); + m_freem(m); +} + +void +ip_input_process_list(struct mbuf *packet_list) +{ + pktchain_elm_t pktchain_tbl[PKTTBL_SZ]; + + struct mbuf *packet = NULL; + struct mbuf *modm = NULL; /* modified mbuf */ + int retval = 0; + u_int32_t div_info = 0; + int ours = 0; +#if (DEBUG || DEVELOPMENT) + struct timeval start_tv; +#endif /* (DEBUG || DEVELOPMENT) */ + int num_pkts = 0; + int chain = 0; + struct ip_fw_in_args args; + + if (ip_chaining == 0) { + struct mbuf *m = packet_list; +#if (DEBUG || DEVELOPMENT) + if (ip_input_measure) + net_perf_start_time(&net_perf, &start_tv); +#endif /* (DEBUG || DEVELOPMENT) */ + + while (m) { + packet_list = mbuf_nextpkt(m); + mbuf_setnextpkt(m, NULL); + ip_input(m); + m = packet_list; + num_pkts++; + } +#if (DEBUG || DEVELOPMENT) + if (ip_input_measure) + net_perf_measure_time(&net_perf, &start_tv, num_pkts); +#endif /* (DEBUG || DEVELOPMENT) */ + return; + } +#if (DEBUG || DEVELOPMENT) + if (ip_input_measure) + net_perf_start_time(&net_perf, &start_tv); +#endif /* (DEBUG || DEVELOPMENT) */ + + bzero(&pktchain_tbl, 
sizeof(pktchain_tbl)); +restart_list_process: + chain = 0; + for (packet = packet_list; packet; packet = packet_list) { + packet_list = mbuf_nextpkt(packet); + mbuf_setnextpkt(packet, NULL); + + num_pkts++; + modm = NULL; + div_info = 0; + bzero(&args, sizeof (args)); + + retval = ip_input_first_pass(packet, &div_info, &args, + &ours, &modm); + + if (retval == IPINPUT_DOCHAIN) { + if (modm) + packet = modm; + packet = ip_chain_insert(packet, &pktchain_tbl[0]); + if (packet == NULL) { + ipstat.ips_rxc_chained++; + chain++; + if (chain > ip_chainsz) + break; + } else { + ipstat.ips_rxc_collisions++; + break; } + } else if (retval == IPINPUT_DONTCHAIN) { + /* in order to preserve order, exit from chaining */ + if (modm) + packet = modm; + ipstat.ips_rxc_notchain++; + break; + } else { + /* packet was freed or delivered, do nothing. */ } - ipf_unref(); } - /* Perform IP header alignment fixup (post-filters), if needed */ - IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return); - - /* - * If there isn't a specific lock for the protocol - * we're about to call, use the generic lock for AF_INET. - * otherwise let the protocol deal with its own locking - */ - ip = mtod(m, struct ip *); + /* do second pass here for pktchain_tbl */ + if (chain) + ip_input_second_pass_loop_tbl(&pktchain_tbl[0], &args); - if (changed_header) { - ip->ip_len = ntohs(ip->ip_len) - hlen; - ip->ip_off = ntohs(ip->ip_off); + if (packet) { + /* + * equivalent update in chaining case if performed in + * ip_input_second_pass_loop_tbl(). + */ +#if (DEBUG || DEVELOPMENT) + if (ip_input_measure) + net_perf_histogram(&net_perf, 1); +#endif /* (DEBUG || DEVELOPMENT) */ + ip_input_second_pass(packet, packet->m_pkthdr.rcvif, div_info, + 1, packet->m_pkthdr.len, &args, ours); } - if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) { - m_freem(m); - } else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) { - lck_mtx_lock(inet_domain_mutex); - pr_input(m, hlen); - lck_mtx_unlock(inet_domain_mutex); - } else { - pr_input(m, hlen); - } -} + if (packet_list) + goto restart_list_process; +#if (DEBUG || DEVELOPMENT) + if (ip_input_measure) + net_perf_measure_time(&net_perf, &start_tv, num_pkts); +#endif /* (DEBUG || DEVELOPMENT) */ +} /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. @@ -669,6 +1798,8 @@ ip_input(struct mbuf *m) inifp = m->m_pkthdr.rcvif; VERIFY(inifp != NULL); + ipstat.ips_rxc_notlist++; + /* Perform IP header alignment fixup, if needed */ IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad); @@ -838,7 +1969,7 @@ ipfw_tags_done: /* * Naively assume we can attribute inbound data to the route we would - * use to send to this destination. Asymetric routing breaks this + * use to send to this destination. Asymmetric routing breaks this * assumption, but it still allows us to account for traffic from * a remote node in the routing table. * this has a very significant performance impact so we bypass @@ -1019,20 +2150,6 @@ pass: return; } -#if MROUTING - /* - * greedy RSVP, snatches any PATH packet of the RSVP protocol and no - * matter if it is destined to another node, or whether it is - * a multicast one, RSVP wants it! and prevents it from being forwarded - * anywhere else. Also checks if the rsvp daemon is running before - * grabbing the packet. - */ - if (rsvp_on && ip->ip_p == IPPROTO_RSVP) { - ip_setdstifaddr_info(m, inifp->if_index, NULL); - goto ours; - } -#endif /* MROUTING */ - /* * Check our list of addresses, to see if the packet is for us. 
* If we don't have any addresses, assume any unicast packet @@ -1127,34 +2244,6 @@ pass: if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; -#if MROUTING - if (ip_mrouter) { - /* - * If we are acting as a multicast router, all - * incoming multicast packets are passed to the - * kernel-level multicast forwarding function. - * The packet is returned (relatively) intact; if - * ip_mforward() returns a non-zero value, the packet - * must be discarded, else it may be accepted below. - */ - if (ip_mforward && ip_mforward(ip, inifp, m, 0) != 0) { - OSAddAtomic(1, &ipstat.ips_cantforward); - m_freem(m); - return; - } - - /* - * The process-level routing daemon needs to receive - * all multicast IGMP packets, whether or not this - * host belongs to their destination groups. - */ - if (ip->ip_p == IPPROTO_IGMP) { - ip_setdstifaddr_info(m, inifp->if_index, NULL); - goto ours; - } - OSAddAtomic(1, &ipstat.ips_forward); - } -#endif /* MROUTING */ /* * See if we belong to the destination multicast group on the * arrival interface. @@ -2088,7 +3177,7 @@ ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop) struct in_ifaddr *ia; int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; - n_time ntime; + u_int32_t ntime; struct sockaddr_in ipaddr = { sizeof (ipaddr), AF_INET, 0, { 0 }, { 0, } }; @@ -2352,8 +3441,6 @@ nosourcerouting: } return (0); bad: - /* XXX icmp_error adds in hdr length */ - ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2; icmp_error(m, type, code, 0, 0); OSAddAtomic(1, &ipstat.ips_badoptions); return (1); @@ -2680,7 +3767,8 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) n_long dest; struct in_addr pkt_dst; u_int32_t nextmtu = 0, len; - struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0 }; + struct ip_out_args ipoa = { IFSCOPE_NONE, { 0 }, 0, 0, + SO_TC_UNSPEC, _NET_SERVICE_TYPE_UNSPEC }; struct ifnet *rcvifp = m->m_pkthdr.rcvif; #if IPSEC struct secpolicy *sp = NULL; @@ -3143,6 +4231,13 @@ makedummy: goto no_mbufs; } } + if (inp->inp_flags & INP_RECVTOS) { + mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_tos, + sizeof(u_char), IP_RECVTOS, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } return (0); no_mbufs: @@ -3150,45 +4245,6 @@ no_mbufs: return (ENOBUFS); } -#if MROUTING -int -ip_rsvp_init(struct socket *so) -{ - if (so->so_type != SOCK_RAW || SOCK_PROTO(so) != IPPROTO_RSVP) - return (EOPNOTSUPP); - - if (ip_rsvpd != NULL) - return (EADDRINUSE); - - ip_rsvpd = so; - /* - * This may seem silly, but we need to be sure we don't over-increment - * the RSVP counter, in case something slips up. - */ - if (!ip_rsvp_on) { - ip_rsvp_on = 1; - rsvp_on++; - } - - return (0); -} - -int -ip_rsvp_done(void) -{ - ip_rsvpd = NULL; - /* - * This may seem silly, but we need to be sure we don't over-decrement - * the RSVP counter, in case something slips up. 
- */
-	if (ip_rsvp_on) {
-		ip_rsvp_on = 0;
-		rsvp_on--;
-	}
-	return (0);
-}
-#endif /* MROUTING */
-
 static inline u_short
 ip_cksum(struct mbuf *m, int hlen)
 {
@@ -3337,3 +4393,59 @@ ip_gre_register_input(gre_input_func_t fn)
 
 	return (0);
 }
+
+#if (DEBUG || DEVELOPMENT)
+static int
+sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+	int error, i;
+
+	i = ip_input_measure;
+	error = sysctl_handle_int(oidp, &i, 0, req);
+	if (error || req->newptr == USER_ADDR_NULL)
+		goto done;
+	/* impose bounds */
+	if (i < 0 || i > 1) {
+		error = EINVAL;
+		goto done;
+	}
+	if (ip_input_measure != i && i == 1) {
+		net_perf_initialize(&net_perf, ip_input_measure_bins);
+	}
+	ip_input_measure = i;
+done:
+	return (error);
+}
+
+static int
+sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+	int error;
+	uint64_t i;
+
+	i = ip_input_measure_bins;
+	error = sysctl_handle_quad(oidp, &i, 0, req);
+	if (error || req->newptr == USER_ADDR_NULL)
+		goto done;
+	/* validate data */
+	if (!net_perf_validate_bins(i)) {
+		error = EINVAL;
+		goto done;
+	}
+	ip_input_measure_bins = i;
+done:
+	return (error);
+}
+
+static int
+sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+	if (req->oldptr == USER_ADDR_NULL)
+		req->oldlen = (size_t)sizeof (net_perf);
+
+	return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen)));
+}
+#endif /* (DEBUG || DEVELOPMENT) */
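
For reference, the DEBUG/DEVELOPMENT measurement sysctls added above can be
driven from user space with sysctlbyname(3). A minimal sketch (it assumes a
DEBUG or DEVELOPMENT kernel, and deliberately fetches the record as raw bytes
rather than interpreting struct net_perf, whose layout lives in
net/net_perf.h):

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/types.h>
	#include <sys/sysctl.h>

	int
	main(void)
	{
		size_t len = 0;
		void *buf;
		int on = 1;

		/* enable measurement; the OID exists on DEBUG/DEVELOPMENT kernels only */
		if (sysctlbyname("net.inet.ip.input_perf", NULL, NULL,
		    &on, sizeof (on)) == -1)
			perror("net.inet.ip.input_perf");

		/* ... let some inbound IP traffic flow ... */

		/* probe the record size, then fetch the raw bytes */
		if (sysctlbyname("net.inet.ip.input_perf_data", NULL, &len,
		    NULL, 0) == -1 || (buf = malloc(len)) == NULL)
			return (1);
		if (sysctlbyname("net.inet.ip.input_perf_data", buf, &len,
		    NULL, 0) == -1) {
			free(buf);
			return (1);
		}
		printf("fetched %zu bytes of input_perf_data\n", len);
		free(buf);
		return (0);
	}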