/*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <net/ntstat.h>
#include <net/net_osdep.h>
#include <net/dlil.h>
+#include <net/net_perf.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/kpi_ipfilter_var.h>
+#include <netinet/in_tclass.h>
#if CONFIG_MACF_NET
#include <security/mac_framework.h>
u_short ip_id;
+static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;
+static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;
+static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;
static void ip_out_cksum_stats(int, u_int32_t);
static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static int ip_optcopy(struct ip *, struct ip *);
CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
"log source interface selection debug info");
+static int ip_output_measure = 0; /* when nonzero, the output path brackets its work with net_perf_start_time()/net_perf_measure_time() */
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+ &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
+ "Do time measurement");
+
+static uint64_t ip_output_measure_bins = 0; /* handed to net_perf_initialize() when output_perf is switched to 1 */
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
+ CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
+ sysctl_ip_output_measure_bins, "I", /* NOTE(review): "I" is paired with CTLTYPE_QUAD here; confirm a quad format was not intended */
+ "bins for chaining performance data histogram");
+
+static net_perf_t net_perf; /* timing data gathered while ip_output_measure is set; copied out by sysctl_ip_output_getperf */
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
+ CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
+ 0, 0, sysctl_ip_output_getperf, "S,net_perf",
+ "IP output performance data (struct net_perf, net/net_perf.h)");
+
+__private_extern__ int rfc6864 = 1; /* when nonzero, packets that are neither forwarded nor raw output and satisfy IP_OFF_IS_ATOMIC() get ip_id 0 instead of ip_randomid() */
+SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
+ &rfc6864, 0, "updated ip id field behavior");
+
#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
/* For gdb */
ipfilter_t inject_filter_ref = NULL;
struct mbuf *packetlist;
uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
+ uint32_t packets_processed = 0;
unsigned int ifscope = IFSCOPE_NONE;
struct flowadv *adv = NULL;
+ struct timeval start_tv;
#if IPSEC
struct socket *so = NULL;
struct secpolicy *sp = NULL;
uint32_t raw;
} ipobf = { .raw = 0 };
+/*
+ * Here we check for restrictions when sending frames.
+ * N.B.: IPv4 over internal co-processor interfaces is not allowed.
+ */
#define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \
(((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \
((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \
+ (IFNET_IS_INTCOPROC(_ifp)) || \
(!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
+ if (ip_output_measure)
+ net_perf_start_time(&net_perf, &start_tv);
KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
VERIFY(m0->m_flags & M_PKTHDR);
}
}
#endif /* IPSEC */
-
+
VERIFY(ro != NULL);
- if (ip_doscopedroute && (flags & IP_OUTARGS)) {
+ if (flags & IP_OUTARGS) {
/*
* In the forwarding case, only the ifscope value is used,
* as source interface selection doesn't take place.
adv->code = FADV_SUCCESS;
ipoa->ipoa_retflags = 0;
}
-
+
#if IPSEC
if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
so = ipsec_getsocket(m);
#endif /* DUMMYNET */
loopit:
+ packets_processed++;
ipobf.isbroadcast = FALSE;
ipobf.didfilter = FALSE;
#if IPFIREWALL_FORWARD
if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) {
ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
ip->ip_off &= IP_DF;
- ip->ip_id = ip_randomid();
+ if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
+ // Per RFC6864, value of ip_id is undefined for atomic ip packets
+ ip->ip_id = 0;
+ } else {
+ ip->ip_id = ip_randomid();
+ }
OSAddAtomic(1, &ipstat.ips_localout);
} else {
hlen = IP_VHL_HL(ip->ip_vhl) << 2;
if (ia == NULL) {
OSAddAtomic(1, &ipstat.ips_noroute);
error = ENETUNREACH;
+ /* XXX IPv6 APN fallback notification?? */
goto bad;
}
}
/*
* If the source address belongs to a restricted
- * interface and the caller forbids our using
+ * interface and the caller forbids our using
* interfaces of such type, pretend that there is no
* route.
*/
- if (ia0 != NULL &&
+ if (ia0 != NULL &&
IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
IFA_REMREF(ia0);
ia0 = NULL;
rtalloc_scoped_ign(ro, ign, ifscope);
/*
- * If the route points to a cellular/expensive interface
+ * If the route points to a cellular/expensive interface
* and the caller forbids our using interfaces of such type,
* pretend that there is no route.
*/
if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
struct ifnet *srcifp = NULL;
struct in_multi *inm;
- u_int32_t vif;
+ u_int32_t vif = 0;
u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
necp_mark_packet_from_ip(m, necp_matched_policy_id);
switch (necp_result) {
case NECP_KERNEL_POLICY_RESULT_PASS:
+ /* Check if the interface is allowed */
+ if (!necp_packet_is_allowed_over_interface(m, ifp)) {
+ error = EHOSTUNREACH;
+ OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+ goto bad;
+ }
goto skip_ipsec;
case NECP_KERNEL_POLICY_RESULT_DROP:
case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
/* Flow divert packets should be blocked at the IP layer */
error = EHOSTUNREACH;
+ OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
goto bad;
case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
/* Verify that the packet is being routed to the tunnel */
struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter);
if (policy_ifp == ifp) {
+ /* Check if the interface is allowed */
+ if (!necp_packet_is_allowed_over_interface(m, ifp)) {
+ error = EHOSTUNREACH;
+ OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+ goto bad;
+ }
goto skip_ipsec;
} else {
if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) {
+ /* Check if the interface is allowed */
+ if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) {
+ error = EHOSTUNREACH;
+ OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+ goto bad;
+ }
+
/* Set ifp to the tunnel interface, since it is compatible with the packet */
ifp = policy_ifp;
ro = &necp_route;
goto skip_ipsec;
} else {
error = ENETUNREACH;
+ OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
goto bad;
}
}
- break;
}
default:
break;
}
}
+ /* Catch-all to check if the interface is allowed */
+ if (!necp_packet_is_allowed_over_interface(m, ifp)) {
+ error = EHOSTUNREACH;
+ OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
+ goto bad;
+ }
#endif /* NECP */
-
+
#if IPSEC
if (ipsec_bypass != 0 || (flags & IP_NOIPSEC))
goto skip_ipsec;
if (flags & IP_ROUTETOIF) {
bzero(&ipsec_state.ro, sizeof (ipsec_state.ro));
} else {
- route_copyout(&ipsec_state.ro, ro, sizeof (ipsec_state.ro));
+ route_copyout((struct route *)&ipsec_state.ro, ro, sizeof (struct route));
}
ipsec_state.dst = SA(dst);
struct ip *, ip, struct ip6_hdr *, NULL);
error = ipsec4_output(&ipsec_state, sp, flags);
+ if (ipsec_state.tunneled == 6) {
+ m0 = m = NULL;
+ error = 0;
+ goto bad;
+ }
m0 = m = ipsec_state.m;
*/
if (ipsec_state.tunneled) {
flags &= ~IP_ROUTETOIF;
- ro = &ipsec_state.ro;
+ ro = (struct route *)&ipsec_state.ro;
}
} else {
- ro = &ipsec_state.ro;
+ ro = (struct route *)&ipsec_state.ro;
}
dst = SIN(ipsec_state.dst);
if (error) {
ROUTE_RELEASE(ro_fwd);
bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst));
- rtalloc_ign(ro_fwd, RTF_PRCLONING);
+ rtalloc_ign(ro_fwd, RTF_PRCLONING, false);
if (ro_fwd->ro_rt == NULL) {
OSAddAtomic(1, &ipstat.ips_noroute);
goto bad;
}
+ if (ipoa != NULL) {
+ u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
+
+ error = set_packet_qos(m, ifp,
+ ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
+ ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
+ if (error == 0) {
+ ip->ip_tos &= IPTOS_ECN_MASK;
+ ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT;
+ } else {
+ printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
+ error = 0;
+ }
+ }
+
+ /*
+ * Some Wi-Fi AP implementations do not correctly handle multicast IP
+ * packets with DSCP bits set -- see radr://9331522 -- so as a
+ * workaround we clear the DSCP bits and set the service class to BE
+ */
+ if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && IFNET_IS_WIFI_INFRA(ifp)) {
+ ip->ip_tos &= IPTOS_ECN_MASK;
+ mbuf_set_service_class(m, MBUF_SC_BE);
+ }
+
ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
ip->ip_len, &sw_csum);
#endif /* IPFIREWALL_FORWARD */
KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
+ if (ip_output_measure) {
+ net_perf_measure_time(&net_perf, &start_tv, packets_processed);
+ net_perf_histogram(&net_perf, packets_processed);
+ }
return (error);
bad:
if (pktcnt > 0)
ip_out_cksum_stats(ip->ip_p, len);
/* RFC1122 4.1.3.4 */
- if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP))
+ if (csum == 0 &&
+ (m->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_ZERO_INVERT)))
csum = 0xffff;
/* Insert the checksum in the ULP csum field */
} else {
bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum));
}
- m->m_pkthdr.csum_flags &=
- ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | CSUM_PARTIAL);
+ m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
+ CSUM_PARTIAL | CSUM_ZERO_INVERT);
}
if (sw_csum & CSUM_DELAY_IP) {
m->m_len = sopt->sopt_valsize;
error = sooptcopyin(sopt, mtod(m, char *),
m->m_len, m->m_len);
- if (error)
+ if (error) {
+ m_freem(m);
break;
+ }
return (ip_pcbopts(sopt->sopt_name,
&inp->inp_options, m));
case IP_RECVIF:
case IP_RECVTTL:
case IP_RECVPKTINFO:
+ case IP_RECVTOS:
error = sooptcopyin(sopt, &optval, sizeof (optval),
sizeof (optval));
if (error)
case IP_RECVPKTINFO:
OPTSET(INP_PKTINFO);
break;
- }
- break;
-#undef OPTSET
-#if CONFIG_FORCE_OUT_IFP
- /*
- * Apple private interface, similar to IP_BOUND_IF, except
- * that the parameter is a NULL-terminated string containing
- * the name of the network interface; an emptry string means
- * unbind. Applications are encouraged to use IP_BOUND_IF
- * instead, as that is the current "official" API.
- */
- case IP_FORCE_OUT_IFP: {
- char ifname[IFNAMSIZ];
- unsigned int ifscope;
-
- /* This option is settable only for IPv4 */
- if (!(inp->inp_vflag & INP_IPV4)) {
- error = EINVAL;
- break;
- }
-
- /* Verify interface name parameter is sane */
- if (sopt->sopt_valsize > sizeof (ifname)) {
- error = EINVAL;
+ case IP_RECVTOS:
+ OPTSET(INP_RECVTOS);
break;
+ #undef OPTSET
}
-
- /* Copy the interface name */
- if (sopt->sopt_valsize != 0) {
- error = sooptcopyin(sopt, ifname,
- sizeof (ifname), sopt->sopt_valsize);
- if (error)
- break;
- }
-
- if (sopt->sopt_valsize == 0 || ifname[0] == '\0') {
- /* Unbind this socket from any interface */
- ifscope = IFSCOPE_NONE;
- } else {
- ifnet_t ifp;
-
- /* Verify name is NULL terminated */
- if (ifname[sopt->sopt_valsize - 1] != '\0') {
- error = EINVAL;
- break;
- }
-
- /* Bail out if given bogus interface name */
- if (ifnet_find_by_name(ifname, &ifp) != 0) {
- error = ENXIO;
- break;
- }
-
- /* Bind this socket to this interface */
- ifscope = ifp->if_index;
-
- /*
- * Won't actually free; since we don't release
- * this later, we should do it now.
- */
- ifnet_release(ifp);
- }
- error = inp_bindif(inp, ifscope, NULL);
- }
- break;
-#endif /* CONFIG_FORCE_OUT_IFP */
+ break;
/*
* Multicast socket options are processed by the in_mcast
* module.
int priv;
struct mbuf *m;
int optname;
-
+
if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
break;
if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
case IP_RECVTTL:
case IP_PORTRANGE:
case IP_RECVPKTINFO:
+ case IP_RECVTOS:
switch (sopt->sopt_name) {
-
case IP_TOS:
optval = inp->inp_ip_tos;
break;
case IP_RECVPKTINFO:
optval = OPTBIT(INP_PKTINFO);
break;
+
+ case IP_RECVTOS:
+ optval = OPTBIT(INP_RECVTOS);
+ break;
}
error = sooptcopyout(sopt, &optval, sizeof (optval));
break;
#if TRAFFIC_MGT
case IP_TRAFFIC_MGT_BACKGROUND: {
- unsigned background = (so->so_traffic_mgt_flags &
- TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
+ unsigned background = (so->so_flags1 &
+ SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
return (sooptcopyout(sopt, &background,
sizeof (background)));
- break;
}
#endif /* TRAFFIC_MGT */
* interface itself is lo0, this will be overridden by if_loop.
*/
if (hwcksum_rx) {
- copym->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
+ copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL|CSUM_ZERO_INVERT);
copym->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
copym->m_pkthdr.csum_data = 0xffff;
/*
* Partial checksum offload, if non-IP fragment, and TCP only
* (no UDP support, as the hardware may not be able to convert
- * +0 to -0 (0xffff) per RFC1122 4.1.3.4.)
+ * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
+ * supports "invert zero" capability.)
*/
if (hwcksum_tx && !tso &&
- (m->m_pkthdr.csum_flags & CSUM_TCP) &&
+ ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
+ ((hwcap & CSUM_ZERO_INVERT) &&
+ (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
ip_len <= ifp->if_mtu) {
uint16_t start = sizeof (struct ip);
uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
return (error);
}
+
+/*
+ * Handler for the net.inet.ip.output_perf sysctl.
+ *
+ * Exposes ip_output_measure through a local copy so that the live flag is
+ * only modified once the requested value has been validated.  Accepted
+ * values are 0 and 1; anything else yields EINVAL.  Turning the flag on
+ * (a change to 1) re-initializes net_perf with the currently configured
+ * ip_output_measure_bins before the flag is stored.
+ *
+ * Returns 0 on success, or the error from sysctl_handle_int()/EINVAL.
+ */
+static int
+sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+	int requested = ip_output_measure;
+	int error = sysctl_handle_int(oidp, &requested, 0, req);
+
+	/* Transfer failed, or no new value was supplied: nothing to store. */
+	if (error != 0 || req->newptr == USER_ADDR_NULL)
+		return (error);
+
+	/* The knob is strictly boolean. */
+	if (requested != 0 && requested != 1)
+		return (EINVAL);
+
+	/* Start from fresh statistics whenever measurement is switched on. */
+	if (requested == 1 && requested != ip_output_measure)
+		net_perf_initialize(&net_perf, ip_output_measure_bins);
+
+	ip_output_measure = requested;
+	return (0);
+}
+
+/*
+ * Handler for the net.inet.ip.output_perf_bins sysctl.
+ *
+ * Exposes ip_output_measure_bins through a local copy.  A newly supplied
+ * value is stored only if net_perf_validate_bins() accepts it; otherwise
+ * EINVAL is returned and the current setting is left untouched.
+ *
+ * Returns 0 on success, or the error from sysctl_handle_quad()/EINVAL.
+ */
+static int
+sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+	uint64_t new_bins = ip_output_measure_bins;
+	int error = sysctl_handle_quad(oidp, &new_bins, 0, req);
+
+	/* Transfer failed, or no new value was supplied: nothing to store. */
+	if (error != 0 || req->newptr == USER_ADDR_NULL)
+		return (error);
+
+	/* Reject bin layouts the net_perf code does not accept. */
+	if (!net_perf_validate_bins(new_bins))
+		return (EINVAL);
+
+	ip_output_measure_bins = new_bins;
+	return (0);
+}
+
+/*
+ * Handler for the read-only net.inet.ip.output_perf_data sysctl.
+ *
+ * Copies the accumulated net_perf record out to the requester, never
+ * copying more than the smaller of sizeof (net_perf) and the caller's
+ * buffer length (req->oldlen).
+ *
+ * When the caller passes no output buffer (a size probe), oldlen is set
+ * to the size of the structure this handler actually exports.  It was
+ * previously set to sizeof (struct ipstat), an unrelated structure.
+ *
+ * Returns the result of SYSCTL_OUT().
+ */
+static int
+sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+	if (req->oldptr == USER_ADDR_NULL)
+		req->oldlen = sizeof (net_perf);
+
+	return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen)));
+}