X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/fe8ab488e9161c46dd9885d58fc52996dc0249ff..5c9f46613a83ebfc29a5b1f099448259e96a98f0:/bsd/netinet/ip_output.c diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index f59d299a9..f5b51ac52 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,7 @@ #include #include #include +#include #include #include @@ -103,6 +104,7 @@ #include #include #include +#include #if CONFIG_MACF_NET #include @@ -152,6 +154,9 @@ u_short ip_id; +static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS; +static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS; +static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS; static void ip_out_cksum_stats(int, u_int32_t); static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); static int ip_optcopy(struct ip *, struct ip *); @@ -184,6 +189,28 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0, "log source interface selection debug info"); +static int ip_output_measure = 0; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I", + "Do time measurement"); + +static uint64_t ip_output_measure_bins = 0; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0, + sysctl_ip_output_measure_bins, "I", + "bins for chaining performance data histogram"); + +static net_perf_t net_perf; +SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_ip_output_getperf, "S,net_perf", + "IP output performance data (struct net_perf, net/net_perf.h)"); + +__private_extern__ int 
rfc6864 = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED, + &rfc6864, 0, "updated ip id field behavior"); + #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ /* For gdb */ @@ -259,8 +286,10 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, ipfilter_t inject_filter_ref = NULL; struct mbuf *packetlist; uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0; + uint32_t packets_processed = 0; unsigned int ifscope = IFSCOPE_NONE; struct flowadv *adv = NULL; + struct timeval start_tv; #if IPSEC struct socket *so = NULL; struct secpolicy *sp = NULL; @@ -321,11 +350,18 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, uint32_t raw; } ipobf = { .raw = 0 }; +/* + * Here we check for restrictions when sending frames. + * N.B.: IPv4 over internal co-processor interfaces is not allowed. + */ #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \ (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \ ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \ + (IFNET_IS_INTCOPROC(_ifp)) || \ (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp))) + if (ip_output_measure) + net_perf_start_time(&net_perf, &start_tv); KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); VERIFY(m0->m_flags & M_PKTHDR); @@ -408,10 +444,10 @@ ipfw_tags_done: } } #endif /* IPSEC */ - + VERIFY(ro != NULL); - if (ip_doscopedroute && (flags & IP_OUTARGS)) { + if (flags & IP_OUTARGS) { /* * In the forwarding case, only the ifscope value is used, * as source interface selection doesn't take place. 
@@ -458,7 +494,7 @@ ipfw_tags_done: adv->code = FADV_SUCCESS; ipoa->ipoa_retflags = 0; } - + #if IPSEC if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { so = ipsec_getsocket(m); @@ -495,6 +531,7 @@ ipfw_tags_done: #endif /* DUMMYNET */ loopit: + packets_processed++; ipobf.isbroadcast = FALSE; ipobf.didfilter = FALSE; #if IPFIREWALL_FORWARD @@ -553,7 +590,12 @@ loopit: if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) { ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); ip->ip_off &= IP_DF; - ip->ip_id = ip_randomid(); + if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) { + // Per RFC6864, value of ip_id is undefined for atomic ip packets + ip->ip_id = 0; + } else { + ip->ip_id = ip_randomid(); + } OSAddAtomic(1, &ipstat.ips_localout); } else { hlen = IP_VHL_HL(ip->ip_vhl) << 2; @@ -630,6 +672,7 @@ loopit: if (ia == NULL) { OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; + /* XXX IPv6 APN fallback notification?? */ goto bad; } } @@ -677,11 +720,11 @@ loopit: /* * If the source address belongs to a restricted - * interface and the caller forbids our using + * interface and the caller forbids our using * interfaces of such type, pretend that there is no * route. */ - if (ia0 != NULL && + if (ia0 != NULL && IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) { IFA_REMREF(ia0); ia0 = NULL; @@ -774,7 +817,7 @@ loopit: rtalloc_scoped_ign(ro, ign, ifscope); /* - * If the route points to a cellular/expensive interface + * If the route points to a cellular/expensive interface * and the caller forbids our using interfaces of such type, * pretend that there is no route. 
*/ @@ -867,7 +910,7 @@ loopit: if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct ifnet *srcifp = NULL; struct in_multi *inm; - u_int32_t vif; + u_int32_t vif = 0; u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; @@ -1172,36 +1215,62 @@ sendit: necp_mark_packet_from_ip(m, necp_matched_policy_id); switch (necp_result) { case NECP_KERNEL_POLICY_RESULT_PASS: + /* Check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, ifp)) { + error = EHOSTUNREACH; + OSAddAtomic(1, &ipstat.ips_necp_policy_drop); + goto bad; + } goto skip_ipsec; case NECP_KERNEL_POLICY_RESULT_DROP: case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT: /* Flow divert packets should be blocked at the IP layer */ error = EHOSTUNREACH; + OSAddAtomic(1, &ipstat.ips_necp_policy_drop); goto bad; case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: { /* Verify that the packet is being routed to the tunnel */ struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter); if (policy_ifp == ifp) { + /* Check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, ifp)) { + error = EHOSTUNREACH; + OSAddAtomic(1, &ipstat.ips_necp_policy_drop); + goto bad; + } goto skip_ipsec; } else { if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) { + /* Check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) { + error = EHOSTUNREACH; + OSAddAtomic(1, &ipstat.ips_necp_policy_drop); + goto bad; + } + /* Set ifp to the tunnel interface, since it is compatible with the packet */ ifp = policy_ifp; ro = &necp_route; goto skip_ipsec; } else { error = ENETUNREACH; + OSAddAtomic(1, &ipstat.ips_necp_policy_drop); goto bad; } } - break; } default: break; } } + /* Catch-all to check if the interface is allowed */ + if (!necp_packet_is_allowed_over_interface(m, ifp)) { + error = EHOSTUNREACH; + OSAddAtomic(1, &ipstat.ips_necp_policy_drop); + goto bad; + } #endif /* NECP */ - + #if 
IPSEC if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) goto skip_ipsec; @@ -1272,7 +1341,7 @@ sendit: if (flags & IP_ROUTETOIF) { bzero(&ipsec_state.ro, sizeof (ipsec_state.ro)); } else { - route_copyout(&ipsec_state.ro, ro, sizeof (ipsec_state.ro)); + route_copyout((struct route *)&ipsec_state.ro, ro, sizeof (struct route)); } ipsec_state.dst = SA(dst); @@ -1295,6 +1364,11 @@ sendit: struct ip *, ip, struct ip6_hdr *, NULL); error = ipsec4_output(&ipsec_state, sp, flags); + if (ipsec_state.tunneled == 6) { + m0 = m = NULL; + error = 0; + goto bad; + } m0 = m = ipsec_state.m; @@ -1315,10 +1389,10 @@ sendit: */ if (ipsec_state.tunneled) { flags &= ~IP_ROUTETOIF; - ro = &ipsec_state.ro; + ro = (struct route *)&ipsec_state.ro; } } else { - ro = &ipsec_state.ro; + ro = (struct route *)&ipsec_state.ro; } dst = SIN(ipsec_state.dst); if (error) { @@ -1648,7 +1722,7 @@ skip_ipsec: ROUTE_RELEASE(ro_fwd); bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst)); - rtalloc_ign(ro_fwd, RTF_PRCLONING); + rtalloc_ign(ro_fwd, RTF_PRCLONING, false); if (ro_fwd->ro_rt == NULL) { OSAddAtomic(1, &ipstat.ips_noroute); @@ -1720,6 +1794,31 @@ pass: goto bad; } + if (ipoa != NULL) { + u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT; + + error = set_packet_qos(m, ifp, + ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? 
TRUE : FALSE, + ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp); + if (error == 0) { + ip->ip_tos &= IPTOS_ECN_MASK; + ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT; + } else { + printf("%s if_dscp_for_mbuf() error %d\n", __func__, error); + error = 0; + } + } + + /* + * Some Wi-Fi AP implementations do not correctly handle multicast IP + * packets with DSCP bits set -- see radr://9331522 -- so as a + * workaround we clear the DSCP bits and set the service class to BE + */ + if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && IFNET_IS_WIFI_INFRA(ifp)) { + ip->ip_tos &= IPTOS_ECN_MASK; + mbuf_set_service_class(m, MBUF_SC_BE); + } + ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2), ip->ip_len, &sw_csum); @@ -1896,6 +1995,10 @@ done: #endif /* IPFIREWALL_FORWARD */ KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0); + if (ip_output_measure) { + net_perf_measure_time(&net_perf, &start_tv, packets_processed); + net_perf_histogram(&net_perf, packets_processed); + } return (error); bad: if (pktcnt > 0) @@ -2168,7 +2271,8 @@ in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags) ip_out_cksum_stats(ip->ip_p, len); /* RFC1122 4.1.3.4 */ - if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP)) + if (csum == 0 && + (m->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_ZERO_INVERT))) csum = 0xffff; /* Insert the checksum in the ULP csum field */ @@ -2180,8 +2284,8 @@ in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags) } else { bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); } - m->m_pkthdr.csum_flags &= - ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | CSUM_PARTIAL); + m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | + CSUM_PARTIAL | CSUM_ZERO_INVERT); } if (sw_csum & CSUM_DELAY_IP) { @@ -2361,8 +2465,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); - if (error) + if (error) { + m_freem(m); break; + } return (ip_pcbopts(sopt->sopt_name, 
&inp->inp_options, m)); @@ -2376,6 +2482,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVIF: case IP_RECVTTL: case IP_RECVPKTINFO: + case IP_RECVTOS: error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval)); if (error) @@ -2418,73 +2525,13 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVPKTINFO: OPTSET(INP_PKTINFO); break; - } - break; -#undef OPTSET -#if CONFIG_FORCE_OUT_IFP - /* - * Apple private interface, similar to IP_BOUND_IF, except - * that the parameter is a NULL-terminated string containing - * the name of the network interface; an emptry string means - * unbind. Applications are encouraged to use IP_BOUND_IF - * instead, as that is the current "official" API. - */ - case IP_FORCE_OUT_IFP: { - char ifname[IFNAMSIZ]; - unsigned int ifscope; - - /* This option is settable only for IPv4 */ - if (!(inp->inp_vflag & INP_IPV4)) { - error = EINVAL; - break; - } - - /* Verify interface name parameter is sane */ - if (sopt->sopt_valsize > sizeof (ifname)) { - error = EINVAL; + case IP_RECVTOS: + OPTSET(INP_RECVTOS); break; + #undef OPTSET } - - /* Copy the interface name */ - if (sopt->sopt_valsize != 0) { - error = sooptcopyin(sopt, ifname, - sizeof (ifname), sopt->sopt_valsize); - if (error) - break; - } - - if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { - /* Unbind this socket from any interface */ - ifscope = IFSCOPE_NONE; - } else { - ifnet_t ifp; - - /* Verify name is NULL terminated */ - if (ifname[sopt->sopt_valsize - 1] != '\0') { - error = EINVAL; - break; - } - - /* Bail out if given bogus interface name */ - if (ifnet_find_by_name(ifname, &ifp) != 0) { - error = ENXIO; - break; - } - - /* Bind this socket to this interface */ - ifscope = ifp->if_index; - - /* - * Won't actually free; since we don't release - * this later, we should do it now. 
- */ - ifnet_release(ifp); - } - error = inp_bindif(inp, ifscope, NULL); - } - break; -#endif /* CONFIG_FORCE_OUT_IFP */ + break; /* * Multicast socket options are processed by the in_mcast * module. @@ -2545,7 +2592,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) int priv; struct mbuf *m; int optname; - + if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -2673,8 +2720,8 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTTL: case IP_PORTRANGE: case IP_RECVPKTINFO: + case IP_RECVTOS: switch (sopt->sopt_name) { - case IP_TOS: optval = inp->inp_ip_tos; break; @@ -2717,6 +2764,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVPKTINFO: optval = OPTBIT(INP_PKTINFO); break; + + case IP_RECVTOS: + optval = OPTBIT(INP_RECVTOS); + break; } error = sooptcopyout(sopt, &optval, sizeof (optval)); break; @@ -2739,11 +2790,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) #if TRAFFIC_MGT case IP_TRAFFIC_MGT_BACKGROUND: { - unsigned background = (so->so_traffic_mgt_flags & - TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0; + unsigned background = (so->so_flags1 & + SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0; return (sooptcopyout(sopt, &background, sizeof (background))); - break; } #endif /* TRAFFIC_MGT */ @@ -3057,7 +3107,7 @@ ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m, * interface itself is lo0, this will be overridden by if_loop. */ if (hwcksum_rx) { - copym->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; + copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL|CSUM_ZERO_INVERT); copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; @@ -3411,10 +3461,13 @@ ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len, /* * Partial checksum offload, if non-IP fragment, and TCP only * (no UDP support, as the hardware may not be able to convert - * +0 to -0 (0xffff) per RFC1122 4.1.3.4.) 
+	 * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
+	 * supports "invert zero" capability.)
 	 */
 	if (hwcksum_tx && !tso &&
-	    (m->m_pkthdr.csum_flags & CSUM_TCP) &&
+	    ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
+	    ((hwcap & CSUM_ZERO_INVERT) &&
+	    (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
 	    ip_len <= ifp->if_mtu) {
 		uint16_t start = sizeof (struct ip);
 		uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
@@ -3466,3 +3519,76 @@ ip_gre_output(struct mbuf *m)
 
 	return (error);
 }
+
+/*
+ * Handler for net.inet.ip.output_perf, the on/off switch for ip_output
+ * time measurement.  Only 0 and 1 are accepted (EINVAL otherwise).  On a
+ * 0 -> 1 transition net_perf_initialize() is called with the currently
+ * configured bins before the switch is flipped.
+ */
+static int
+sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+	int error, i;
+
+	i = ip_output_measure;
+	error = sysctl_handle_int(oidp, &i, 0, req);
+	/* A failed copy or a pure read (no new value) leaves the state alone */
+	if (error || req->newptr == USER_ADDR_NULL)
+		goto done;
+	/* impose bounds */
+	if (i < 0 || i > 1) {
+		error = EINVAL;
+		goto done;
+	}
+	if (ip_output_measure != i && i == 1) {
+		net_perf_initialize(&net_perf, ip_output_measure_bins);
+	}
+	ip_output_measure = i;
+done:
+	return (error);
+}
+
+/*
+ * Handler for net.inet.ip.output_perf_bins.  A new value is stored in
+ * ip_output_measure_bins only if net_perf_validate_bins() accepts it;
+ * otherwise EINVAL is returned and the previous value is kept.
+ */
+static int
+sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+	int error;
+	uint64_t i;
+
+	i = ip_output_measure_bins;
+	error = sysctl_handle_quad(oidp, &i, 0, req);
+	if (error || req->newptr == USER_ADDR_NULL)
+		goto done;
+	/* validate data */
+	if (!net_perf_validate_bins(i)) {
+		error = EINVAL;
+		goto done;
+	}
+	ip_output_measure_bins = i;
+done:
+	return (error);
+}
+
+/*
+ * Handler for net.inet.ip.output_perf_data: copies the net_perf record
+ * out to the caller, limited to MIN(sizeof (net_perf), req->oldlen).
+ * When no output buffer is supplied, req->oldlen is first set to
+ * sizeof (net_perf) so that the length matches the object being
+ * copied out.
+ */
+static int
+sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+	if (req->oldptr == USER_ADDR_NULL)
+		req->oldlen = (size_t)sizeof (net_perf);
+
+	return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen)));
+}