+/*
+ * Convert between a regular and a packet header mbuf. Caller is responsible
+ * for setting or clearing M_PKTHDR; this routine does the rest of the work.
+ */
+int
+m_reinit(struct mbuf *m, int hdr)
+{
+ int ret = 0;
+
+ if (hdr) {
+ VERIFY(!(m->m_flags & M_PKTHDR));
+ if (!(m->m_flags & M_EXT) &&
+ (m->m_data != m->m_dat || m->m_len > 0)) {
+ /*
+ * If there's no external cluster attached and the
+ * mbuf appears to contain user data, we cannot
+ * safely convert this to a packet header mbuf,
+ * as the packet header structure might overlap
+ * with the data.
+ */
+ printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
+ "m_data %llx (expected %llx), "
+ "m_len %d (expected 0)\n",
+ __func__,
+ (uint64_t)VM_KERNEL_ADDRPERM(m),
+ (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
+ (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
+ ret = EBUSY;
+ } else {
+ VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
+ m->m_flags |= M_PKTHDR;
+ MBUF_INIT_PKTHDR(m);
+ }
+ } else {
+ /* Check for scratch area overflow */
+ m_redzone_verify(m);
+ /* Free the aux data and tags if there are any */
+ m_tag_delete_chain(m, NULL);
+ m->m_flags &= ~M_PKTHDR;
+ }
+
+ return (ret);
+}
+
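+/*
+ * Accessors for the external buffer's private property word. The
+ * setter atomically changes the property from the expected value 'o'
+ * to 'n'; the getter simply returns the current value.
+ */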
+int
+m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
+{
+ ASSERT(m->m_flags & M_EXT);
+ return (atomic_test_set_32(&MEXT_PRIV(m), o, n));
+}
+
+uint32_t
+m_ext_get_prop(struct mbuf *m)
+{
+ ASSERT(m->m_flags & M_EXT);
+ return (MEXT_PRIV(m));
+}
+
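+/*
+ * A paired mbuf is considered active while its prospective reference
+ * count is above the minimum, i.e. someone other than the pairing
+ * itself still holds the buffer; non-paired mbufs always report
+ * themselves as active.
+ */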
+int
+m_ext_paired_is_active(struct mbuf *m)
+{
+ return (MBUF_IS_PAIRED(m) ? (MEXT_PREF(m) > MEXT_MINREF(m)) : 1);
+}
+
+void
+m_ext_paired_activate(struct mbuf *m)
+{
+ struct ext_ref *rfa;
+ int hdr, type;
+ caddr_t extbuf;
+ m_ext_free_func_t extfree;
+ u_int extsize;
+
+ VERIFY(MBUF_IS_PAIRED(m));
+ VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
+ VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
+
+ hdr = (m->m_flags & M_PKTHDR);
+ type = m->m_type;
+ extbuf = m->m_ext.ext_buf;
+ extfree = m_get_ext_free(m);
+ extsize = m->m_ext.ext_size;
+ rfa = m_get_rfa(m);
+
+ VERIFY(extbuf != NULL && rfa != NULL);
+
+ /*
+ * Safe to reinitialize the packet header tags, since they were
+ * already taken care of at m_free() time, similar to what's
+ * done in m_clattach() for the cluster. Bump up MEXT_PREF to
+ * indicate activation.
+ */
+ MBUF_INIT(m, hdr, type);
+ MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
+ 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
+}
+
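+/*
+ * Zero out the module-private scratch area in the pkthdr. Callers
+ * must not do this while the area is guarded (PKTF_PRIV_GUARDED).
+ */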
+void
+m_scratch_init(struct mbuf *m)
+{
+ struct pkthdr *pkt = &m->m_pkthdr;
+
+ VERIFY(m->m_flags & M_PKTHDR);
+
+ /* See comments in <rdar://problem/14040693> */
+ if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
+ panic_plain("Invalid attempt to modify guarded module-private "
+ "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
+ /* NOTREACHED */
+ }
+
+ bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
+}
+
+/*
+ * This routine is reserved for mbuf_get_driver_scratch(); clients inside
+ * xnu that intend to use the module-private area should refer directly
+ * to the pkt_mpriv structure in the pkthdr. They are also expected to
+ * set PKTF_PRIV_GUARDED while they own the packet, and to clear it
+ * before handing the packet off to another module.
+ */
+u_int32_t
+m_scratch_get(struct mbuf *m, u_int8_t **p)
+{
+ struct pkthdr *pkt = &m->m_pkthdr;
+
+ VERIFY(m->m_flags & M_PKTHDR);
+
+ /* See comments in <rdar://problem/14040693> */
+ if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
+ panic_plain("Invalid attempt to access guarded module-private "
+ "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
+ /* NOTREACHED */
+ }
+
+ if (mcltrace) {
+ mcache_audit_t *mca;
+
+ lck_mtx_lock(mbuf_mlock);
+ mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
+ if (mca->mca_uflags & MB_SCVALID)
+ mcl_audit_scratch(mca);
+ lck_mtx_unlock(mbuf_mlock);
+ }
+
+ *p = (u_int8_t *)&pkt->pkt_mpriv;
+ return (sizeof (pkt->pkt_mpriv));
+}
+
+static void
+m_redzone_init(struct mbuf *m)
+{
+ VERIFY(m->m_flags & M_PKTHDR);
+ /*
+ * Each mbuf has a unique red zone pattern, which is an XOR
+ * of the red zone cookie and the address of the mbuf.
+ */
+ m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
+}
+
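+/*
+ * Recompute the expected pattern for this mbuf and panic on mismatch,
+ * which indicates that something wrote past the module-private
+ * scratch area.
+ */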
+static void
+m_redzone_verify(struct mbuf *m)
+{
+ u_int32_t mb_redzone;
+
+ VERIFY(m->m_flags & M_PKTHDR);
+
+ mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
+ if (m->m_pkthdr.redzone != mb_redzone) {
+ panic("mbuf %p redzone violation with value 0x%x "
+ "(instead of 0x%x, using cookie 0x%x)\n",
+ m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
+ /* NOTREACHED */
+ }
+}
+
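+/*
+ * Store the external buffer metadata in obscured form: the rfa pointer
+ * is XORed with the global mb_obscure_extref cookie, while ext_free and
+ * ext_arg are XORed with a per-buffer token (or with the global
+ * mb_obscure_extfree cookie when no rfa is supplied).
+ */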
+__private_extern__ inline void
+m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
+ caddr_t ext_arg)
+{
+ VERIFY(m->m_flags & M_EXT);
+ if (rfa != NULL) {
+ m->m_ext.ext_refflags =
+ (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
+ if (ext_free != NULL) {
+ rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
+ mb_obscure_extfree;
+ m->m_ext.ext_free = (m_ext_free_func_t)
+ (((uintptr_t)ext_free) ^ rfa->ext_token);
+ if (ext_arg != NULL) {
+ m->m_ext.ext_arg =
+ (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
+ } else {
+ m->m_ext.ext_arg = NULL;
+ }
+ } else {
+ rfa->ext_token = 0;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_arg = NULL;
+ }
+ } else {
+ /*
+ * If we are going to lose the cookie in ext_token by
+ * resetting the rfa, we should use the global cookie
+ * to obscure the ext_free and ext_arg pointers.
+ */
+ if (ext_free != NULL) {
+ m->m_ext.ext_free =
+ (m_ext_free_func_t)((uintptr_t)ext_free ^
+ mb_obscure_extfree);
+ if (ext_arg != NULL) {
+ m->m_ext.ext_arg =
+ (caddr_t)((uintptr_t)ext_arg ^
+ mb_obscure_extfree);
+ } else {
+ m->m_ext.ext_arg = NULL;
+ }
+ } else {
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_arg = NULL;
+ }
+ m->m_ext.ext_refflags = NULL;
+ }
+}
+
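+/*
+ * The accessors below reverse the obscuring applied by m_set_ext().
+ */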
+__private_extern__ inline struct ext_ref *
+m_get_rfa(struct mbuf *m)
+{
+ if (m->m_ext.ext_refflags == NULL)
+ return (NULL);
+ else
+ return ((struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref));
+}
+
+__private_extern__ inline m_ext_free_func_t
+m_get_ext_free(struct mbuf *m)
+{
+ struct ext_ref *rfa;
+ if (m->m_ext.ext_free == NULL)
+ return (NULL);
+
+ rfa = m_get_rfa(m);
+ if (rfa == NULL)
+ return ((m_ext_free_func_t)((uintptr_t)m->m_ext.ext_free ^ mb_obscure_extfree));
+ else
+ return ((m_ext_free_func_t)(((uintptr_t)m->m_ext.ext_free)
+ ^ rfa->ext_token));
+}
+
+__private_extern__ inline caddr_t
+m_get_ext_arg(struct mbuf *m)
+{
+ struct ext_ref *rfa;
+ if (m->m_ext.ext_arg == NULL)
+ return (NULL);
+
+ rfa = m_get_rfa(m);
+ if (rfa == NULL) {
+ return ((caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree));
+ } else {
+ return ((caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
+ rfa->ext_token));
+ }
+}
+
+/*
+ * Request a report of mbuf usage if the current total is a new peak that
+ * is at least 1/16 (~6%) of the class's max limit and has grown by at
+ * least 1/32 (~3%) of the previous peak since the last report.
+ *
+ * The fractions 1/16 and 1/32 are chosen so that the comparisons can be
+ * done with simple shift operations.
+ */
+static boolean_t
+mbuf_report_usage(mbuf_class_t cl)
+{
+ /* if a report is already in progress, nothing to do */
+ if (mb_peak_newreport)
+ return (TRUE);
+
+ if (m_total(cl) > m_peak(cl) &&
+ m_total(cl) >= (m_maxlimit(cl) >> 4) &&
+ (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
+ return (TRUE);
+ return (FALSE);
+}
+
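+/*
+ * Snapshot the per-class peak totals and the amount of memory released
+ * since the previous report while holding the mbuf lock, then publish
+ * the figures through nstat once the lock is dropped.
+ */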
+__private_extern__ void
+mbuf_report_peak_usage(void)
+{
+ int i = 0;
+ u_int64_t uptime;
+ struct nstat_sysinfo_data ns_data;
+ uint32_t memreleased = 0;
+ static uint32_t prevmemreleased;
+
+ uptime = net_uptime();
+ lck_mtx_lock(mbuf_mlock);
+
+ /* Generate an initial report after 1 week of uptime */
+ if (!mb_peak_firstreport &&
+ uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
+ mb_peak_newreport = TRUE;
+ mb_peak_firstreport = TRUE;
+ }
+
+ if (!mb_peak_newreport) {
+ lck_mtx_unlock(mbuf_mlock);
+ return;
+ }
+
+ /*
+ * A report is being generated before the 1-week mark, so
+ * there is no need to force the initial report later.
+ */
+ if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
+ mb_peak_firstreport = TRUE;
+
+ for (i = 0; i < NELEM(mbuf_table); i++) {
+ m_peak(m_class(i)) = m_total(m_class(i));
+ memreleased += m_release_cnt(i);
+ }
+ memreleased = memreleased - prevmemreleased;
+ prevmemreleased = memreleased;
+ mb_peak_newreport = FALSE;
+ lck_mtx_unlock(mbuf_mlock);
+
+ bzero(&ns_data, sizeof(ns_data));
+ ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
+ ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
+ ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
+ ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
+ ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
+ ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
+ ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
+ ns_data.u.mb_stats.draincnt = mbstat.m_drain;
+ ns_data.u.mb_stats.memreleased = memreleased;
+ ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
+
+ nstat_sysinfo_send_data(&ns_data);
+
+ /*
+ * Reset the floor whenever we report a new peak in order to
+ * track the trend (increased peak usage is not a leak if mbufs
+ * get released between reports and the floor stays low).
+ */
+ total_sbmb_cnt_floor = total_sbmb_cnt_peak;
+}
+
+/*
+ * Called by the VM when there's memory pressure.
+ */
+__private_extern__ void
+m_drain(void)
+{
+ mbuf_class_t mc;
+ mcl_slab_t *sp, *sp_tmp, *nsp;
+ unsigned int num, k, interval, released = 0;
+ unsigned long total_mem = 0, use_mem = 0;
+ boolean_t ret, purge_caches = FALSE;
+ ppnum_t offset;
+ mcache_obj_t *obj;
+ unsigned long per;
+ static uint64_t last_drain = 0;
+ static unsigned char scratch[32];
+ static ppnum_t scratch_pa = 0;
+
+ if (mb_drain_maxint == 0 || mb_waiters)
+ return;
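+ /*
+ * The physical page behind 'scratch' is what we later hand to the
+ * IOMapper in place of freed pages; with mclverify enabled, any
+ * data appearing here means a driver DMA'd to memory we already
+ * freed.
+ */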
+ if (scratch_pa == 0) {
+ bzero(scratch, sizeof(scratch));
+ scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
+ VERIFY(scratch_pa);
+ } else if (mclverify) {
+ /*
+ * Panic if a driver wrote to our scratch memory.
+ */
+ for (k = 0; k < sizeof(scratch); k++)
+ if (scratch[k])
+ panic("suspect DMA to freed address");
+ }
+ /*
+ * Don't free memory too often, as that could cause excessive
+ * waiting times for mbufs. Only purge the caches if the previous
+ * drain happened within five drain intervals.
+ */
+ lck_mtx_lock(mbuf_mlock);
+ if (last_drain == 0) {
+ last_drain = net_uptime();
+ lck_mtx_unlock(mbuf_mlock);
+ return;
+ }
+ interval = net_uptime() - last_drain;
+ if (interval <= mb_drain_maxint) {
+ lck_mtx_unlock(mbuf_mlock);
+ return;
+ }
+ if (interval <= mb_drain_maxint * 5)
+ purge_caches = TRUE;
+ last_drain = net_uptime();
+ /*
+ * Don't free any memory if we're using 60% or more.
+ */
+ for (mc = 0; mc < NELEM(mbuf_table); mc++) {
+ total_mem += m_total(mc) * m_maxsize(mc);
+ use_mem += m_active(mc) * m_maxsize(mc);
+ }
+ per = (use_mem * 100) / total_mem;
+ if (per >= 60) {
+ lck_mtx_unlock(mbuf_mlock);
+ return;
+ }
+ /*
+ * Purge all the caches. This effectively disables caching
+ * for a few seconds, but the mbuf worker thread will
+ * re-enable them.
+ */
+ if (purge_caches == TRUE)
+ for (mc = 0; mc < NELEM(mbuf_table); mc++) {
+ if (m_total(mc) < m_avgtotal(mc))
+ continue;
+ lck_mtx_unlock(mbuf_mlock);
+ ret = mcache_purge_cache(m_cache(mc), FALSE);
+ lck_mtx_lock(mbuf_mlock);
+ if (ret == TRUE)
+ m_purge_cnt(mc)++;
+ }
+ /*
+ * Move the objects from the composite class freelist to
+ * the rudimentary slabs list, but keep at least 10% of the average
+ * total in the freelist.
+ */
+ for (mc = 0; mc < NELEM(mbuf_table); mc++) {
+ while (m_cobjlist(mc) &&
+ m_total(mc) < m_avgtotal(mc) &&
+ m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
+ obj = m_cobjlist(mc);
+ m_cobjlist(mc) = obj->obj_next;
+ obj->obj_next = NULL;
+ num = cslab_free(mc, obj, 1);
+ VERIFY(num == 1);
+ m_free_cnt(mc)++;
+ m_infree(mc)--;
+ /* cslab_free() handles m_total */
+ }
+ }
+ /*
+ * Free the unused buffers on the slab list, but keep at least 10%
+ * of the class's average total (plus its minimum limit) on the
+ * freelist.
+ *
+ * We walk the list backwards in an attempt to reduce fragmentation.
+ */
+ for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
+ TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
+ /*
+ * Process only unused slabs occupying memory.
+ */
+ if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
+ sp->sl_base == NULL)
+ continue;
+ if (m_total(mc) < m_avgtotal(mc) ||
+ m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
+ break;
+ slab_remove(sp, mc);
+ switch (mc) {
+ case MC_MBUF:
+ m_infree(mc) -= NMBPG;
+ m_total(mc) -= NMBPG;
+ if (mclaudit != NULL)
+ mcl_audit_free(sp->sl_base, NMBPG);
+ break;
+ case MC_CL:
+ m_infree(mc) -= NCLPG;
+ m_total(mc) -= NCLPG;
+ if (mclaudit != NULL)
+ mcl_audit_free(sp->sl_base, NMBPG);
+ break;
+ case MC_BIGCL:
+ {
+ m_infree(mc) -= NBCLPG;
+ m_total(mc) -= NBCLPG;
+ if (mclaudit != NULL)
+ mcl_audit_free(sp->sl_base, NMBPG);
+ break;
+ }
+ case MC_16KCL:
+ m_infree(mc)--;
+ m_total(mc)--;
+ for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
+ nsp = nsp->sl_next;
+ VERIFY(nsp->sl_refcnt == 0 &&
+ nsp->sl_base != NULL &&
+ nsp->sl_len == 0);
+ slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
+ 0);
+ nsp->sl_flags = 0;
+ }
+ if (mclaudit != NULL) {
+ if (sp->sl_len == PAGE_SIZE) {
+ mcl_audit_free(sp->sl_base,
+ NMBPG);
+ } else {
+ mcl_audit_free(sp->sl_base, 1);
+ }
+ }
+ break;
+ default:
+ /*
+ * The composite classes have their own
+ * freelist (m_cobjlist), so we only
+ * process rudimentary classes here.
+ */
+ VERIFY(0);
+ }
+ m_release_cnt(mc) += m_size(mc);
+ released += m_size(mc);
+ VERIFY(sp->sl_base != NULL &&
+ sp->sl_len >= PAGE_SIZE);
+ offset = MTOPG(sp->sl_base);
+ /*
+ * Make sure the IOMapper points to a valid, but
+ * bogus, address. This should prevent further DMA
+ * accesses to freed memory.
+ */
+ IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
+ mcl_paddr[offset] = 0;
+ kmem_free(mb_map, (vm_offset_t)sp->sl_base,
+ sp->sl_len);
+ slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
+ sp->sl_flags = 0;
+ }
+ }
+ mbstat.m_drain++;
+ mbstat.m_bigclusters = m_total(MC_BIGCL);
+ mbstat.m_clusters = m_total(MC_CL);
+ mbstat.m_mbufs = m_total(MC_MBUF);
+ mbuf_stat_sync();
+ mbuf_mtypes_sync(TRUE);
+ lck_mtx_unlock(mbuf_mlock);
+}
+
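+/*
+ * Sysctl handler that forces an immediate drain: writing any nonzero
+ * value invokes m_drain().
+ */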
+static int
+m_drain_force_sysctl SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+ int val = 0, err;
+
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == USER_ADDR_NULL)
+ return (err);
+ if (val)
+ m_drain();
+
+ return (err);
+}
+
+#if DEBUG || DEVELOPMENT
+
+static int mbtest_val;
+static int mbtest_running;
+
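+/*
+ * Allocation stress test: each thread repeatedly allocates and frees
+ * large chains of 16KB (when jumbo clusters are configured), 4KB and
+ * 2KB cluster packets.
+ */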
+static void
+mbtest_thread(__unused void *arg)
+{
+ int i;
+
+ printf("%s thread starting\n", __func__);
+
+ for (i = 0; i < 1000; i++) {
+ unsigned int needed = 100000;
+ struct mbuf *m1, *m2, *m3;
+
+ if (njcl > 0) {
+ needed = 100000;
+ m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES);
+ m_freem_list(m3);
+ }
+
+ needed = 100000;
+ m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES);
+ m_freem_list(m2);
+
+ m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES);
+ m_freem_list(m1);
+ }
+
+ printf("%s thread ending\n", __func__);
+
+ OSDecrementAtomic(&mbtest_running);
+ wakeup_one((caddr_t)&mbtest_running);
+}
+
+static void
+sysctl_mbtest(void)
+{
+ /* We launch three threads - wait for all of them */
+ OSIncrementAtomic(&mbtest_running);
+ OSIncrementAtomic(&mbtest_running);
+ OSIncrementAtomic(&mbtest_running);
+
+ thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
+ thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
+ thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
+
+ while (mbtest_running) {
+ msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
+ }
+}
+
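+/*
+ * Sysctl handler for the allocation stress test; writing a value that
+ * differs from the current one kicks off sysctl_mbtest().
+ */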
+static int
+mbtest SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+ int error = 0, val, oldval = mbtest_val;
+
+ val = oldval;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (val != oldval)
+ sysctl_mbtest();
+
+ mbtest_val = val;
+
+ return (error);
+}
+#endif /* DEBUG || DEVELOPMENT */
+