]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/netinet/ip_output.c
xnu-4570.31.3.tar.gz
[apple/xnu.git] / bsd / netinet / ip_output.c
index f59d299a91141ee7a9093b58eada14ed7a40f239..f5b51ac5251a76661d22467733c9cd2a2218578e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -95,6 +95,7 @@
 #include <net/ntstat.h>
 #include <net/net_osdep.h>
 #include <net/dlil.h>
+#include <net/net_perf.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/kpi_ipfilter_var.h>
+#include <netinet/in_tclass.h>
 
 #if CONFIG_MACF_NET
 #include <security/mac_framework.h>
 
 u_short ip_id;
 
+static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;
+static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;
+static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;
 static void ip_out_cksum_stats(int, u_int32_t);
 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
 static int ip_optcopy(struct ip *, struct ip *);
@@ -184,6 +189,28 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
        CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
        "log source interface selection debug info");
 
+static int ip_output_measure = 0;      /* non-zero: collect net_perf timing */
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
+       CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+       &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
+       "Do time measurement");
+/* Bins value handed to net_perf_initialize(); 64-bit, so the format is "Q". */
+static uint64_t ip_output_measure_bins = 0;
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
+       CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
+       sysctl_ip_output_measure_bins, "Q",
+       "bins for chaining performance data histogram");
+/* Measurement state; exported read-only through sysctl_ip_output_getperf. */
+static net_perf_t net_perf;
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
+       CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       0, 0, sysctl_ip_output_getperf, "S,net_perf",
+       "IP output performance data (struct net_perf, net/net_perf.h)");
+/* Non-zero: locally built packets that are IP_OFF_IS_ATOMIC get ip_id 0. */
+__private_extern__ int rfc6864 = 1;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
+       &rfc6864, 0, "updated ip id field behavior");
+
 #define        IMO_TRACE_HIST_SIZE     32      /* size of trace history */
 
 /* For gdb */
@@ -259,8 +286,10 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
        ipfilter_t inject_filter_ref = NULL;
        struct mbuf *packetlist;
        uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
+       uint32_t packets_processed = 0;
        unsigned int ifscope = IFSCOPE_NONE;
        struct flowadv *adv = NULL;
+       struct timeval start_tv;
 #if IPSEC
        struct socket *so = NULL;
        struct secpolicy *sp = NULL;
@@ -321,11 +350,18 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
                uint32_t raw;
        } ipobf = { .raw = 0 };
 
+/*
+ * Here we check for restrictions when sending frames.
+ * N.B.: IPv4 over internal co-processor interfaces is not allowed.
+ */
 #define        IP_CHECK_RESTRICTIONS(_ifp, _ipobf)                             \
        (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) ||                \
         ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) ||          \
+          (IFNET_IS_INTCOPROC(_ifp)) ||                                        \
         (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
 
+       if (ip_output_measure)
+               net_perf_start_time(&net_perf, &start_tv);
        KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
        VERIFY(m0->m_flags & M_PKTHDR);
@@ -408,10 +444,10 @@ ipfw_tags_done:
                }
        }
 #endif /* IPSEC */
-       
+
        VERIFY(ro != NULL);
 
-       if (ip_doscopedroute && (flags & IP_OUTARGS)) {
+       if (flags & IP_OUTARGS) {
                /*
                 * In the forwarding case, only the ifscope value is used,
                 * as source interface selection doesn't take place.
@@ -458,7 +494,7 @@ ipfw_tags_done:
                adv->code = FADV_SUCCESS;
                ipoa->ipoa_retflags = 0;
        }
-       
+
 #if IPSEC
        if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
                so = ipsec_getsocket(m);
@@ -495,6 +531,7 @@ ipfw_tags_done:
 #endif /* DUMMYNET */
 
 loopit:
+       packets_processed++;
        ipobf.isbroadcast = FALSE;
        ipobf.didfilter = FALSE;
 #if IPFIREWALL_FORWARD
@@ -553,7 +590,12 @@ loopit:
        if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) {
                ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
                ip->ip_off &= IP_DF;
-               ip->ip_id = ip_randomid();
+               if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
+                       // Per RFC6864, value of ip_id is undefined for atomic ip packets
+                       ip->ip_id = 0;
+               } else {
+                       ip->ip_id = ip_randomid();
+               }
                OSAddAtomic(1, &ipstat.ips_localout);
        } else {
                hlen = IP_VHL_HL(ip->ip_vhl) << 2;
@@ -630,6 +672,7 @@ loopit:
                        if (ia == NULL) {
                                OSAddAtomic(1, &ipstat.ips_noroute);
                                error = ENETUNREACH;
+                               /* XXX IPv6 APN fallback notification?? */
                                goto bad;
                        }
                }
@@ -677,11 +720,11 @@ loopit:
 
                        /*
                         * If the source address belongs to a restricted
-                        * interface and the caller forbids our using 
+                        * interface and the caller forbids our using
                         * interfaces of such type, pretend that there is no
                         * route.
                         */
-                       if (ia0 != NULL && 
+                       if (ia0 != NULL &&
                            IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
                                IFA_REMREF(ia0);
                                ia0 = NULL;
@@ -774,7 +817,7 @@ loopit:
                                rtalloc_scoped_ign(ro, ign, ifscope);
 
                        /*
-                        * If the route points to a cellular/expensive interface 
+                        * If the route points to a cellular/expensive interface
                         * and the caller forbids our using interfaces of such type,
                         * pretend that there is no route.
                         */
@@ -867,7 +910,7 @@ loopit:
        if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
                struct ifnet *srcifp = NULL;
                struct in_multi *inm;
-               u_int32_t vif;
+               u_int32_t vif = 0;
                u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
                u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
 
@@ -1172,36 +1215,62 @@ sendit:
                necp_mark_packet_from_ip(m, necp_matched_policy_id);
                switch (necp_result) {
                        case NECP_KERNEL_POLICY_RESULT_PASS:
+                               /* Check if the interface is allowed */
+                               if (!necp_packet_is_allowed_over_interface(m, ifp)) {
+                                       error = EHOSTUNREACH;
+                                       OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+                                       goto bad;
+                               }
                                goto skip_ipsec;
                        case NECP_KERNEL_POLICY_RESULT_DROP:
                        case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
                                /* Flow divert packets should be blocked at the IP layer */
                                error = EHOSTUNREACH;
+                               OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
                                goto bad;
                        case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
                                /* Verify that the packet is being routed to the tunnel */
                                struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter);
                                if (policy_ifp == ifp) {
+                                       /* Check if the interface is allowed */
+                                       if (!necp_packet_is_allowed_over_interface(m, ifp)) {
+                                               error = EHOSTUNREACH;
+                                               OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+                                               goto bad;
+                                       }
                                        goto skip_ipsec;
                                } else {
                                        if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) {
+                                               /* Check if the interface is allowed */
+                                               if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) {
+                                                       error = EHOSTUNREACH;
+                                                       OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+                                                       goto bad;
+                                               }
+
                                                /* Set ifp to the tunnel interface, since it is compatible with the packet */
                                                ifp = policy_ifp;
                                                ro = &necp_route;
                                                goto skip_ipsec;
                                        } else {
                                                error = ENETUNREACH;
+                                               OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
                                                goto bad;
                                        }
                                }
-                               break;
                        }
                        default:
                                break;
                }
        }
+       /* Catch-all to check if the interface is allowed */
+       if (!necp_packet_is_allowed_over_interface(m, ifp)) {
+               error = EHOSTUNREACH;
+               OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+               goto bad;
+       }
 #endif /* NECP */
-       
+
 #if IPSEC
        if (ipsec_bypass != 0 || (flags & IP_NOIPSEC))
                goto skip_ipsec;
@@ -1272,7 +1341,7 @@ sendit:
        if (flags & IP_ROUTETOIF) {
                bzero(&ipsec_state.ro, sizeof (ipsec_state.ro));
        } else {
-               route_copyout(&ipsec_state.ro, ro, sizeof (ipsec_state.ro));
+               route_copyout((struct route *)&ipsec_state.ro, ro, sizeof (struct route));
        }
        ipsec_state.dst = SA(dst);
 
@@ -1295,6 +1364,11 @@ sendit:
            struct ip *, ip, struct ip6_hdr *, NULL);
 
        error = ipsec4_output(&ipsec_state, sp, flags);
+       if (ipsec_state.tunneled == 6) {
+               m0 = m = NULL;
+               error = 0;
+               goto bad;
+       }
 
        m0 = m = ipsec_state.m;
 
@@ -1315,10 +1389,10 @@ sendit:
                 */
                if (ipsec_state.tunneled) {
                        flags &= ~IP_ROUTETOIF;
-                       ro = &ipsec_state.ro;
+                       ro = (struct route *)&ipsec_state.ro;
                }
        } else {
-               ro = &ipsec_state.ro;
+               ro = (struct route *)&ipsec_state.ro;
        }
        dst = SIN(ipsec_state.dst);
        if (error) {
@@ -1648,7 +1722,7 @@ skip_ipsec:
                        ROUTE_RELEASE(ro_fwd);
                        bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst));
 
-                       rtalloc_ign(ro_fwd, RTF_PRCLONING);
+                       rtalloc_ign(ro_fwd, RTF_PRCLONING, false);
 
                        if (ro_fwd->ro_rt == NULL) {
                                OSAddAtomic(1, &ipstat.ips_noroute);
@@ -1720,6 +1794,31 @@ pass:
                goto bad;
        }
 
+       if (ipoa != NULL) {
+               u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
+
+               error = set_packet_qos(m, ifp,
+                   ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
+                   ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
+               if (error == 0) {
+                       ip->ip_tos &= IPTOS_ECN_MASK;
+                       ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT;
+               } else {
+                       printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
+                       error = 0;
+               }
+       }
+
+       /*
+        * Some Wi-Fi AP implementations do not correctly handle multicast IP
+        * packets with DSCP bits set -- see radr://9331522 -- so as a
+        * workaround we clear the DSCP bits and set the service class to BE
+        */
+       if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && IFNET_IS_WIFI_INFRA(ifp)) {
+               ip->ip_tos &= IPTOS_ECN_MASK;
+               mbuf_set_service_class(m, MBUF_SC_BE);
+       }
+
        ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
            ip->ip_len, &sw_csum);
 
@@ -1896,6 +1995,10 @@ done:
 #endif /* IPFIREWALL_FORWARD */
 
        KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
+       if (ip_output_measure) {
+               net_perf_measure_time(&net_perf, &start_tv, packets_processed);
+               net_perf_histogram(&net_perf, packets_processed);
+       }
        return (error);
 bad:
        if (pktcnt > 0)
@@ -2168,7 +2271,8 @@ in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
                ip_out_cksum_stats(ip->ip_p, len);
 
                /* RFC1122 4.1.3.4 */
-               if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP))
+               if (csum == 0 &&
+                   (m->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_ZERO_INVERT)))
                        csum = 0xffff;
 
                /* Insert the checksum in the ULP csum field */
@@ -2180,8 +2284,8 @@ in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
                } else {
                        bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum));
                }
-               m->m_pkthdr.csum_flags &=
-                   ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | CSUM_PARTIAL);
+               m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
+                   CSUM_PARTIAL | CSUM_ZERO_INVERT);
        }
 
        if (sw_csum & CSUM_DELAY_IP) {
@@ -2361,8 +2465,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
                        m->m_len = sopt->sopt_valsize;
                        error = sooptcopyin(sopt, mtod(m, char *),
                            m->m_len, m->m_len);
-                       if (error)
+                       if (error) {
+                               m_freem(m);
                                break;
+                       }
 
                        return (ip_pcbopts(sopt->sopt_name,
                            &inp->inp_options, m));
@@ -2376,6 +2482,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
                case IP_RECVIF:
                case IP_RECVTTL:
                case IP_RECVPKTINFO:
+               case IP_RECVTOS:
                        error = sooptcopyin(sopt, &optval, sizeof (optval),
                            sizeof (optval));
                        if (error)
@@ -2418,73 +2525,13 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
                        case IP_RECVPKTINFO:
                                OPTSET(INP_PKTINFO);
                                break;
-                       }
-                       break;
-#undef OPTSET
 
-#if CONFIG_FORCE_OUT_IFP
-               /*
-                * Apple private interface, similar to IP_BOUND_IF, except
-                * that the parameter is a NULL-terminated string containing
-                * the name of the network interface; an emptry string means
-                * unbind.  Applications are encouraged to use IP_BOUND_IF
-                * instead, as that is the current "official" API.
-                */
-               case IP_FORCE_OUT_IFP: {
-                       char ifname[IFNAMSIZ];
-                       unsigned int ifscope;
-
-                       /* This option is settable only for IPv4 */
-                       if (!(inp->inp_vflag & INP_IPV4)) {
-                               error = EINVAL;
-                               break;
-                       }
-
-                       /* Verify interface name parameter is sane */
-                       if (sopt->sopt_valsize > sizeof (ifname)) {
-                               error = EINVAL;
+                       case IP_RECVTOS:
+                               OPTSET(INP_RECVTOS);
                                break;
+ #undef OPTSET
                        }
-
-                       /* Copy the interface name */
-                       if (sopt->sopt_valsize != 0) {
-                               error = sooptcopyin(sopt, ifname,
-                                   sizeof (ifname), sopt->sopt_valsize);
-                               if (error)
-                                       break;
-                       }
-
-                       if (sopt->sopt_valsize == 0 || ifname[0] == '\0') {
-                               /* Unbind this socket from any interface */
-                               ifscope = IFSCOPE_NONE;
-                       } else {
-                               ifnet_t ifp;
-
-                               /* Verify name is NULL terminated */
-                               if (ifname[sopt->sopt_valsize - 1] != '\0') {
-                                       error = EINVAL;
-                                       break;
-                               }
-
-                               /* Bail out if given bogus interface name */
-                               if (ifnet_find_by_name(ifname, &ifp) != 0) {
-                                       error = ENXIO;
-                                       break;
-                               }
-
-                               /* Bind this socket to this interface */
-                               ifscope = ifp->if_index;
-
-                               /*
-                                * Won't actually free; since we don't release
-                                * this later, we should do it now.
-                                */
-                               ifnet_release(ifp);
-                       }
-                       error = inp_bindif(inp, ifscope, NULL);
-               }
-               break;
-#endif /* CONFIG_FORCE_OUT_IFP */
+                       break;
                /*
                 * Multicast socket options are processed by the in_mcast
                 * module.
@@ -2545,7 +2592,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
                        int priv;
                        struct mbuf *m;
                        int optname;
-                       
+
                        if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
                                break;
                        if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
@@ -2673,8 +2720,8 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
                case IP_RECVTTL:
                case IP_PORTRANGE:
                case IP_RECVPKTINFO:
+               case IP_RECVTOS:
                        switch (sopt->sopt_name) {
-
                        case IP_TOS:
                                optval = inp->inp_ip_tos;
                                break;
@@ -2717,6 +2764,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
                        case IP_RECVPKTINFO:
                                optval = OPTBIT(INP_PKTINFO);
                                break;
+
+                       case IP_RECVTOS:
+                               optval = OPTBIT(INP_RECVTOS);
+                               break;
                        }
                        error = sooptcopyout(sopt, &optval, sizeof (optval));
                        break;
@@ -2739,11 +2790,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
 
 #if TRAFFIC_MGT
                case IP_TRAFFIC_MGT_BACKGROUND: {
-                       unsigned background = (so->so_traffic_mgt_flags &
-                           TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
+                       unsigned background = (so->so_flags1 &
+                           SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
                        return (sooptcopyout(sopt, &background,
                            sizeof (background)));
-                       break;
                }
 #endif /* TRAFFIC_MGT */
 
@@ -3057,7 +3107,7 @@ ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
         * interface itself is lo0, this will be overridden by if_loop.
         */
        if (hwcksum_rx) {
-               copym->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
+               copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL|CSUM_ZERO_INVERT);
                copym->m_pkthdr.csum_flags |=
                    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
                copym->m_pkthdr.csum_data = 0xffff;
@@ -3411,10 +3461,13 @@ ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
                /*
                 * Partial checksum offload, if non-IP fragment, and TCP only
                 * (no UDP support, as the hardware may not be able to convert
-                * +0 to -0 (0xffff) per RFC1122 4.1.3.4.)
+                * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
+                * supports "invert zero" capability.)
                 */
                if (hwcksum_tx && !tso &&
-                   (m->m_pkthdr.csum_flags & CSUM_TCP) &&
+                   ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
+                   ((hwcap & CSUM_ZERO_INVERT) &&
+                   (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
                    ip_len <= ifp->if_mtu) {
                        uint16_t start = sizeof (struct ip);
                        uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
@@ -3466,3 +3519,57 @@ ip_gre_output(struct mbuf *m)
 
        return (error);
 }
+
+/*
+ * net.inet.ip.output_perf handler: read or set ip_output_measure (0 or 1).
+ */
+static int
+sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int requested = ip_output_measure;
+       int error = sysctl_handle_int(oidp, &requested, 0, req);
+
+       /* Stop on handler error or when no new value was supplied. */
+       if (error != 0 || req->newptr == USER_ADDR_NULL)
+               return (error);
+       /* Only 0 and 1 are accepted. */
+       if (requested != 0 && requested != 1)
+               return (EINVAL);
+       /* Switching to 1: reinitialize net_perf with the current bins. */
+       if (requested == 1 && ip_output_measure != 1)
+               net_perf_initialize(&net_perf, ip_output_measure_bins);
+       ip_output_measure = requested;
+       return (0);
+}
+
+/*
+ * net.inet.ip.output_perf_bins handler: read ip_output_measure_bins, or
+ * replace it when net_perf_validate_bins() accepts the new value.
+ */
+static int
+sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       uint64_t bins = ip_output_measure_bins;
+       int error = sysctl_handle_quad(oidp, &bins, 0, req);
+
+       /* Stop on handler error or when no new value was supplied. */
+       if (error != 0 || req->newptr == USER_ADDR_NULL)
+               return (error);
+       /* Reject values that net_perf_validate_bins() refuses. */
+       if (!net_perf_validate_bins(bins))
+               return (EINVAL);
+       ip_output_measure_bins = bins;
+       return (0);
+}
+
+/* net.inet.ip.output_perf_data handler: copies net_perf out to the caller. */
+static int
+sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       if (req->oldptr == USER_ADDR_NULL)      /* no buffer: size from net_perf */
+               req->oldlen = (size_t)sizeof (net_perf);
+       return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen)));
+}