X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/0c530ab8987f0ae6a1a3d9284f40182b88852816..2d21ac55c334faf3a56e5634905ed6987fc787d4:/bsd/kern/uipc_mbuf.c diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index aca7fcc51..38fe262e6 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1,23 +1,29 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2007 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* @@ -54,15 +60,11 @@ * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 */ -/* HISTORY - * - * 10/15/97 Annette DeSchon (deschon@apple.com) - * Fixed bug in which all cluster mbufs were broken up - * into regular mbufs: Some clusters are now reserved. - * When a cluster is needed, regular mbufs are no longer - * used. (Radar 1683621) - * 20-May-95 Mac Gillon (mgillon) at NeXT - * New version based on 4.4 +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. 
*/ #include @@ -74,737 +76,2945 @@ #include #include #include +#include -#include #include +#include +#include #include +#include + +#include +#include #include -extern vm_offset_t kmem_mb_alloc(vm_map_t , int ); -extern boolean_t PE_parse_boot_arg(const char *, void *); - -#define _MCLREF(p) (++mclrefcnt[mtocl(p)]) -#define _MCLUNREF(p) (--mclrefcnt[mtocl(p)] == 0) -#define _M_CLEAR_PKTHDR(mbuf_ptr) (mbuf_ptr)->m_pkthdr.rcvif = NULL; \ - (mbuf_ptr)->m_pkthdr.len = 0; \ - (mbuf_ptr)->m_pkthdr.header = NULL; \ - (mbuf_ptr)->m_pkthdr.csum_flags = 0; \ - (mbuf_ptr)->m_pkthdr.csum_data = 0; \ - (mbuf_ptr)->m_pkthdr.aux = (struct mbuf*)NULL; \ - (mbuf_ptr)->m_pkthdr.vlan_tag = 0; \ - (mbuf_ptr)->m_pkthdr.socket_id = 0; \ - SLIST_INIT(&(mbuf_ptr)->m_pkthdr.tags); +#include +#include -/* kernel translater */ -extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); +#if CONFIG_MACF_NET +#include +#endif /* MAC_NET */ + +#include -lck_mtx_t * mbuf_mlock; -lck_grp_t * mbuf_mlock_grp; -lck_grp_attr_t * mbuf_mlock_grp_attr; -lck_attr_t * mbuf_mlock_attr; -extern lck_mtx_t *domain_proto_mtx; +/* + * MBUF IMPLEMENTATION NOTES. + * + * There is a total of 5 per-CPU caches: + * + * MC_MBUF: + * This is a cache of rudimentary objects of MSIZE in size; each + * object represents an mbuf structure. This cache preserves only + * the m_type field of the mbuf during its transactions. + * + * MC_CL: + * This is a cache of rudimentary objects of MCLBYTES in size; each + * object represents a mcluster structure. This cache does not + * preserve the contents of the objects during its transactions. + * + * MC_BIGCL: + * This is a cache of rudimentary objects of NBPG in size; each + * object represents a mbigcluster structure. This cache does not + * preserve the contents of the objects during its transaction. + * + * MC_MBUF_CL: + * This is a cache of mbufs each having a cluster attached to it. + * It is backed by MC_MBUF and MC_CL rudimentary caches. 
Several + * fields of the mbuf related to the external cluster are preserved + * during transactions. + * + * MC_MBUF_BIGCL: + * This is a cache of mbufs each having a big cluster attached to it. + * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several + * fields of the mbuf related to the external cluster are preserved + * during transactions. + * + * OBJECT ALLOCATION: + * + * Allocation requests are handled first at the per-CPU (mcache) layer + * before falling back to the slab layer. Performance is optimal when + * the request is satisfied at the CPU layer because global data/lock + * never gets accessed. When the slab layer is entered for allocation, + * the slab freelist will be checked first for available objects before + * the VM backing store is invoked. Slab layer operations are serialized + * for all of the caches as the mbuf global lock is held most of the time. + * Allocation paths are different depending on the class of objects: + * + * a. Rudimentary object: + * + * { m_get_common(), m_clattach(), m_mclget(), + * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(), + * composite object allocation } + * | ^ + * | | + * | +-----------------------+ + * v | + * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit() + * | ^ + * v | + * [CPU cache] -------> (found?) -------+ + * | | + * v | + * mbuf_slab_alloc() | + * | | + * v | + * +---------> [freelist] -------> (found?) -------+ + * | | + * | v + * | m_clalloc() + * | | + * | v + * +---<<---- kmem_mb_alloc() + * + * b. Composite object: + * + * { m_getpackets_internal(), m_allocpacket_internal() } + * | ^ + * | | + * | +------ (done) ---------+ + * v | + * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit() + * | ^ + * v | + * [CPU cache] -------> (found?) -------+ + * | | + * v | + * mbuf_cslab_alloc() | + * | | + * v | + * [freelist] -------> (found?) 
-------+ + * | | + * v | + * (rudimentary object) | + * mcache_alloc/mcache_alloc_ext() ------>>-----+ + * + * Auditing notes: If auditing is enabled, buffers will be subjected to + * integrity checks by the audit routine. This is done by verifying their + * contents against DEADBEEF (free) pattern before returning them to caller. + * As part of this step, the routine will also record the transaction and + * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will + * also restore any constructed data structure fields if necessary. + * + * OBJECT DEALLOCATION: + * + * Freeing an object simply involves placing it into the CPU cache; this + * pollutes the cache to benefit subsequent allocations. The slab layer + * will only be entered if the object is to be purged out of the cache. + * During normal operations, this happens only when the CPU layer resizes + * its bucket while it's adjusting to the allocation load. Deallocation + * paths are different depending on the class of objects: + * + * a. Rudimentary object: + * + * { m_free(), m_freem_list(), composite object deallocation } + * | ^ + * | | + * | +------ (done) ---------+ + * v | + * mcache_free/mcache_free_ext() | + * | | + * v | + * mbuf_slab_audit() | + * | | + * v | + * [CPU cache] ---> (not purging?) -----+ + * | | + * v | + * mbuf_slab_free() | + * | | + * v | + * [freelist] ----------->>------------+ + * (objects never get purged to VM) + * + * b. Composite object: + * + * { m_free(), m_freem_list() } + * | ^ + * | | + * | +------ (done) ---------+ + * v | + * mcache_free/mcache_free_ext() | + * | | + * v | + * mbuf_cslab_audit() | + * | | + * v | + * [CPU cache] ---> (not purging?) -----+ + * | | + * v | + * mbuf_cslab_free() | + * | | + * v | + * [freelist] ---> (not purging?) 
-----+ + * | | + * v | + * (rudimentary object) | + * mcache_free/mcache_free_ext() ------->>------+ + * + * Auditing notes: If auditing is enabled, the audit routine will save + * any constructed data structure fields (if necessary) before filling the + * contents of the buffers with DEADBEEF (free) pattern and recording the + * transaction. Buffers that are freed (whether at CPU or slab layer) are + * expected to contain the free pattern. + * + * DEBUGGING: + * + * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this + * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally, + * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag, + * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note + * that debugging consumes more CPU and memory. + * + * Each object is associated with exactly one mcache_audit_t structure that + * contains the information related to its last buffer transaction. Given + * an address of an object, the audit structure can be retrieved by finding + * the position of the object relative to the base address of the cluster: + * + * +------------+ +=============+ + * | mbuf addr | | mclaudit[i] | + * +------------+ +=============+ + * | | cl_audit[0] | + * i = MTOCL(addr) +-------------+ + * | +-----> | cl_audit[1] | -----> mcache_audit_t + * b = CLTOM(i) | +-------------+ + * | | | ... | + * x = MCLIDX(b, addr) | +-------------+ + * | | | cl_audit[7] | + * +-----------------+ +-------------+ + * (e.g. x == 1) + * + * The mclaudit[] array is allocated at initialization time, but its contents + * get populated when the corresponding cluster is created. Because a cluster + * can be turned into NMBPCL number of mbufs, we preserve enough space for the + * mbufs so that there is a 1-to-1 mapping between them. A cluster that never + * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the + * remaining entries unused. 
For big clusters, only one entry is allocated + * and used for the entire cluster pair. + */ -struct mbuf *mfree; /* mbuf free list */ -struct mbuf *mfreelater; /* mbuf deallocation list */ +/* TODO: should be in header file */ +/* kernel translator */ +extern vm_offset_t kmem_mb_alloc(vm_map_t, int); +extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); extern vm_map_t mb_map; /* special map */ -int m_want; /* sleepers on mbufs */ -short *mclrefcnt; /* mapped cluster reference counts */ -int *mcl_paddr; + +/* Global lock */ +static lck_mtx_t *mbuf_mlock; +static lck_attr_t *mbuf_mlock_attr; +static lck_grp_t *mbuf_mlock_grp; +static lck_grp_attr_t *mbuf_mlock_grp_attr; + +/* Back-end (common) layer */ +static void *mbuf_worker_run; /* wait channel for worker thread */ +static int mbuf_worker_ready; /* worker thread is runnable */ +static int mbuf_expand_mcl; /* number of cluster creation requests */ +static int mbuf_expand_big; /* number of big cluster creation requests */ +static int mbuf_expand_16k; /* number of 16K cluster creation requests */ +static int ncpu; /* number of CPUs */ +static int *mcl_paddr; /* Array of cluster physical addresses */ static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */ -union mcluster *mclfree; /* mapped cluster free list */ -union mbigcluster *mbigfree; /* mapped cluster free list */ -int max_linkhdr; /* largest link-level header */ -int max_protohdr; /* largest protocol header */ -int max_hdr; /* largest link+protocol header */ -int max_datalen; /* MHLEN - max_hdr */ -struct mbstat mbstat; /* statistics */ -union mcluster *mbutl; /* first mapped cluster address */ -union mcluster *embutl; /* ending virtual address of mclusters */ - -static int nclpp; /* # clusters per physical page */ - -static int m_howmany(int, size_t ); -void m_reclaim(void); -static int m_clalloc(const int , const int, const size_t, int); -int do_reclaim = 0; +static mcache_t *ref_cache; /* Cache of cluster reference & flags */ 
+static mcache_t *mcl_audit_con_cache; /* Audit contents cache */ +static unsigned int mbuf_debug; /* patchable mbuf mcache flags */ +static unsigned int mb_normalized; /* number of packets "normalized" */ + +typedef enum { + MC_MBUF = 0, /* Regular mbuf */ + MC_CL, /* Cluster */ + MC_BIGCL, /* Large (4K) cluster */ + MC_16KCL, /* Jumbo (16K) cluster */ + MC_MBUF_CL, /* mbuf + cluster */ + MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */ + MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */ +} mbuf_class_t; + +#define MBUF_CLASS_MIN MC_MBUF +#define MBUF_CLASS_MAX MC_MBUF_16KCL +#define MBUF_CLASS_LAST MC_16KCL +#define MBUF_CLASS_VALID(c) \ + ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX) +#define MBUF_CLASS_COMPOSITE(c) \ + ((int)(c) > MBUF_CLASS_LAST) -#define MF_NOWAIT 0x1 -#define MF_BIG 0x2 - -/* The number of cluster mbufs that are allocated, to start. */ -#define MINCL max(16, 2) - -static int mbuf_expand_thread_wakeup = 0; -static int mbuf_expand_mcl = 0; -static int mbuf_expand_big = 0; -static int mbuf_expand_thread_initialized = 0; - -static void mbuf_expand_thread_init(void); -static void mbuf_expand_thread(void); -static int m_expand(int ); -static caddr_t m_bigalloc(int ); -static void m_bigfree(caddr_t , u_int , caddr_t ); -__private_extern__ struct mbuf * m_mbigget(struct mbuf *, int ); -void mbinit(void); -static void m_range_check(void *addr); - - -#if 0 -static int mfree_munge = 0; -#if 0 -#define _MFREE_MUNGE(m) { \ - if (mfree_munge) \ - { int i; \ - vm_offset_t *element = (vm_offset_t *)(m); \ - for (i = 0; \ - i < sizeof(struct mbuf)/sizeof(vm_offset_t); \ - i++) \ - (element)[i] = 0xdeadbeef; \ - } \ -} -#else -void -munge_mbuf(struct mbuf *m) -{ - int i; - vm_offset_t *element = (vm_offset_t *)(m); - for (i = 0; - i < sizeof(struct mbuf)/sizeof(vm_offset_t); - i++) - (element)[i] = 0xdeadbeef; -} -#define _MFREE_MUNGE(m) { \ - if (mfree_munge) \ - munge_mbuf(m); \ -} -#endif -#else -#define _MFREE_MUNGE(m) -#endif +/* + * mbuf 
specific mcache allocation request flags. + */ +#define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */ -#define _MINTGET(m, type) { \ - MBUF_LOCK(); \ - if (((m) = mfree) != 0) { \ - MCHECK(m); \ - ++mclrefcnt[mtocl(m)]; \ - mbstat.m_mtypes[MT_FREE]--; \ - mbstat.m_mtypes[(type)]++; \ - mfree = (m)->m_next; \ - } \ - MBUF_UNLOCK(); \ -} - +/* + * Per-cluster slab structure. + * + * A slab is a cluster control structure that contains one or more object + * chunks; the available chunks are chained in the slab's freelist (sl_head). + * Each time a chunk is taken out of the slab, the slab's reference count + * gets incremented. When all chunks have been taken out, the empty slab + * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is + * returned to a slab causes the slab's reference count to be decremented; + * it also causes the slab to be reinserted back to class's slab list, if + * it's not already done. + * + * Compartmentalizing of the object chunks into slabs allows us to easily + * merge one or more slabs together when the adjacent slabs are idle, as + * well as to convert or move a slab from one class to another; e.g. the + * mbuf cluster slab can be converted to a regular cluster slab when all + * mbufs in the slab have been freed. + * + * A slab may also span across multiple clusters for chunks larger than + * a cluster's size. In this case, only the slab of the first cluster is + * used. The rest of the slabs are marked with SLF_PARTIAL to indicate + * that they are part of the larger slab. 
+ */ +typedef struct mcl_slab { + struct mcl_slab *sl_next; /* neighboring slab */ + u_int8_t sl_class; /* controlling mbuf class */ + int8_t sl_refcnt; /* outstanding allocations */ + int8_t sl_chunks; /* chunks (bufs) in this slab */ + u_int16_t sl_flags; /* slab flags (see below) */ + u_int16_t sl_len; /* slab length */ + void *sl_base; /* base of allocated memory */ + void *sl_head; /* first free buffer */ + TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */ +} mcl_slab_t; + +#define SLF_MAPPED 0x0001 /* backed by a mapped page */ +#define SLF_PARTIAL 0x0002 /* part of another slab */ +#define SLF_DETACHED 0x0004 /* not in slab freelist */ -static void -m_range_check(void *addr) -{ - if (addr && (addr < (void *)mbutl || addr >= (void *)embutl)) - panic("mbuf address out of range 0x%x", addr); -} +/* + * The array of slabs are broken into groups of arrays per 1MB of kernel + * memory to reduce the footprint. Each group is allocated on demand + * whenever a new piece of memory mapped in from the VM crosses the 1MB + * boundary. + */ +#define MBSHIFT 20 /* 1MB */ +#define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */ -__private_extern__ void -mbinit(void) -{ - int m; - int initmcl = 32; - int mcl_pages; +typedef struct mcl_slabg { + mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */ +} mcl_slabg_t; - if (nclpp) - return; - nclpp = round_page_32(MCLBYTES) / MCLBYTES; /* see mbufgc() */ - if (nclpp < 1) nclpp = 1; - mbuf_mlock_grp_attr = lck_grp_attr_alloc_init(); +/* + * Per-cluster audit structure. 
+ */ +typedef struct { + mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */ +} mcl_audit_t; - mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr); - mbuf_mlock_attr = lck_attr_alloc_init(); +#if CONFIG_MBUF_NOEXPAND +static unsigned int maxmbufcl; +#endif /* CONFIG_MBUF_NOEXPAND */ - mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr); +/* + * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr + * and m_ext structures. If auditing is enabled, we allocate a shadow + * mbuf structure of this size inside each audit structure, and the + * contents of the real mbuf gets copied into it when the mbuf is freed. + * This allows us to pattern-fill the mbuf for integrity check, and to + * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case). + * Note that we don't save the contents of clusters when they are freed; + * we simply pattern-fill them. + */ +#if defined(__LP64__) +#define AUDIT_CONTENTS_SIZE 160 +#else +#define AUDIT_CONTENTS_SIZE 80 +#endif /* __LP64__ */ - mbstat.m_msize = MSIZE; - mbstat.m_mclbytes = MCLBYTES; - mbstat.m_minclsize = MINCLSIZE; - mbstat.m_mlen = MLEN; - mbstat.m_mhlen = MHLEN; - mbstat.m_bigmclbytes = NBPG; +/* + * mbuf specific mcache audit flags + */ +#define MB_INUSE 0x01 /* object has not been returned to slab */ +#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */ +#define MB_SCVALID 0x04 /* object has valid saved contents */ - if (nmbclusters == 0) - nmbclusters = NMBCLUSTERS; - MALLOC(mclrefcnt, short *, nmbclusters * sizeof (short), - M_TEMP, M_WAITOK); - if (mclrefcnt == 0) - panic("mbinit"); - for (m = 0; m < nmbclusters; m++) - mclrefcnt[m] = -1; +/* + * Each of the following two arrays hold up to nmbclusters elements. 
+ */ +static mcl_audit_t *mclaudit; /* array of cluster audit information */ +static mcl_slabg_t **slabstbl; /* cluster slabs table */ +static unsigned int maxslabgrp; /* max # of entries in slabs table */ +static unsigned int slabgrp; /* # of entries in slabs table */ + +/* Globals */ +int nclusters; /* # of clusters for non-jumbo (legacy) sizes */ +int njcl; /* # of clusters for jumbo sizes */ +int njclbytes; /* size of a jumbo cluster */ +union mcluster *mbutl; /* first mapped cluster address */ +union mcluster *embutl; /* ending virtual address of mclusters */ +int max_linkhdr; /* largest link-level header */ +int max_protohdr; /* largest protocol header */ +int max_hdr; /* largest link+protocol header */ +int max_datalen; /* MHLEN - max_hdr */ + +/* TODO: should be in header file */ +int do_reclaim = 0; - /* Calculate the number of pages assigned to the cluster pool */ - mcl_pages = nmbclusters/(NBPG/CLBYTES); - MALLOC(mcl_paddr, int *, mcl_pages * sizeof(int), M_TEMP, M_WAITOK); - if (mcl_paddr == 0) - panic("mbinit1"); - /* Register with the I/O Bus mapper */ - mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); - bzero((char *)mcl_paddr, mcl_pages * sizeof(int)); +/* The minimum number of objects that are allocated, to start. 
*/ +#define MINCL 32 +#define MINBIGCL (MINCL >> 1) +#define MIN16KCL (MINCL >> 2) + +/* Low watermarks (only map in pages once free counts go below) */ +#define MCL_LOWAT MINCL +#define MBIGCL_LOWAT MINBIGCL +#define M16KCL_LOWAT MIN16KCL + +typedef struct { + mbuf_class_t mtbl_class; /* class type */ + mcache_t *mtbl_cache; /* mcache for this buffer class */ + TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */ + mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */ + mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */ + u_int32_t mtbl_maxsize; /* maximum buffer size */ + int mtbl_minlimit; /* minimum allowed */ + int mtbl_maxlimit; /* maximum allowed */ + u_int32_t mtbl_wantpurge; /* purge during next reclaim */ +} mbuf_table_t; + +#define m_class(c) mbuf_table[c].mtbl_class +#define m_cache(c) mbuf_table[c].mtbl_cache +#define m_slablist(c) mbuf_table[c].mtbl_slablist +#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist +#define m_maxsize(c) mbuf_table[c].mtbl_maxsize +#define m_minlimit(c) mbuf_table[c].mtbl_minlimit +#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit +#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge +#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname +#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size +#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total +#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active +#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree +#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt +#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt +#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt +#define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified +#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt +#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt +#define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal + +static mbuf_table_t mbuf_table[] = { + /* + * The caches for mbufs, regular clusters and big 
clusters. + */ + { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)), + NULL, NULL, 0, 0, 0, 0 }, + { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)), + NULL, NULL, 0, 0, 0, 0 }, + { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)), + NULL, NULL, 0, 0, 0, 0 }, + { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)), + NULL, NULL, 0, 0, 0, 0 }, + /* + * The following are special caches; they serve as intermediate + * caches backed by the above rudimentary caches. Each object + * in the cache is an mbuf with a cluster attached to it. Unlike + * the above caches, these intermediate caches do not directly + * deal with the slab structures; instead, the constructed + * cached elements are simply stored in the freelists. + */ + { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 }, + { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 }, + { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 }, +}; + +#define NELEM(a) (sizeof (a) / sizeof ((a)[0])) + +static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */ +static int mb_waiters; /* number of sleepers */ + +/* The following are used to serialize m_clalloc() */ +static boolean_t mb_clalloc_busy; +static void *mb_clalloc_waitchan = &mb_clalloc_busy; +static int mb_clalloc_waiters; + +static int mbstat_sysctl SYSCTL_HANDLER_ARGS; +static int mb_stat_sysctl SYSCTL_HANDLER_ARGS; +static void mbuf_table_init(void); +static inline void m_incref(struct mbuf *); +static inline u_int32_t m_decref(struct mbuf *); +static int m_clalloc(const u_int32_t, const int, const u_int32_t); +static void mbuf_worker_thread_init(void); +static mcache_obj_t *slab_alloc(mbuf_class_t, int); +static void slab_free(mbuf_class_t, mcache_obj_t *); +static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***, + unsigned int, int); +static void mbuf_slab_free(void *, mcache_obj_t *, int); +static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t); +static void 
mbuf_slab_notify(void *, u_int32_t); +static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***, + unsigned int); +static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int); +static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***, + unsigned int, int); +static void mbuf_cslab_free(void *, mcache_obj_t *, int); +static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t); +static int freelist_populate(mbuf_class_t, unsigned int, int); +static boolean_t mbuf_cached_above(mbuf_class_t, int); +static boolean_t mbuf_steal(mbuf_class_t, unsigned int); +static void m_reclaim(mbuf_class_t, unsigned int, boolean_t); +static int m_howmany(int, size_t); +static void mbuf_worker_thread(void); +static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int); + +static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **, + size_t, unsigned int); +static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *); +static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t); +static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t, + boolean_t); +static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t); +static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *); +static void mcl_audit_mcheck_panic(struct mbuf *); +static void mcl_audit_verify_nextptr(void *, mcache_audit_t *); + +static mcl_slab_t *slab_get(void *); +static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t, + void *, void *, unsigned int, int, int); +static void slab_insert(mcl_slab_t *, mbuf_class_t); +static void slab_remove(mcl_slab_t *, mbuf_class_t); +static boolean_t slab_inrange(mcl_slab_t *, void *); +static void slab_nextptr_panic(mcl_slab_t *, void *); +static void slab_detach(mcl_slab_t *); +static boolean_t slab_is_detached(mcl_slab_t *); + +/* + * This flag is set for all mbufs that come out of and into the composite + * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. 
mbufs that + * are marked with such a flag have clusters attached to them, and will be + * treated differently when they are freed; instead of being placed back + * into the mbuf and cluster freelists, the composite mbuf + cluster objects + * are placed back into the appropriate composite cache's freelist, and the + * actual freeing is deferred until the composite objects are purged. At + * such a time, this flag will be cleared from the mbufs and the objects + * will be freed into their own separate freelists. + */ +#define EXTF_COMPOSITE 0x1 - embutl = (union mcluster *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); +#define MEXT_RFA(m) ((m)->m_ext.ext_refflags) +#define MEXT_REF(m) (MEXT_RFA(m)->refcnt) +#define MEXT_FLAGS(m) (MEXT_RFA(m)->flags) +#define MBUF_IS_COMPOSITE(m) \ + (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE)) - PE_parse_boot_arg("initmcl", &initmcl); - - if (m_clalloc(max(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES, 0) == 0) - goto bad; - MBUF_UNLOCK(); +/* + * Macros used to verify the integrity of the mbuf. + */ +#define _MCHECK(m) { \ + if ((m)->m_type != MT_FREE) { \ + if (mclaudit == NULL) \ + panic("MCHECK: m_type=%d m=%p", \ + (u_int16_t)(m)->m_type, m); \ + else \ + mcl_audit_mcheck_panic(m); \ + } \ +} - (void) kernel_thread(kernel_task, mbuf_expand_thread_init); +#define MBUF_IN_MAP(addr) \ + ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl) - return; -bad: - panic("mbinit"); +#define MRANGE(addr) { \ + if (!MBUF_IN_MAP(addr)) \ + panic("MRANGE: address out of range 0x%p", addr); \ } /* - * Allocate some number of mbuf clusters - * and place on cluster free list. - * Take the mbuf lock (if not already locked) and do not release it + * Macro version of mtod. 
*/ -/* ARGSUSED */ -static int -m_clalloc( - const int num, - const int nowait, - const size_t bufsize, - int locked) -{ - int i; - vm_size_t size = 0; - int numpages = 0; - vm_offset_t page = 0; - - if (locked == 0) - MBUF_LOCK(); - /* - * Honor the caller's wish to block or not block. - * We have a way to grow the pool asynchronously, - * by kicking the dlil_input_thread. - */ - i = m_howmany(num, bufsize); - if (i == 0 || nowait == M_DONTWAIT) - goto out; +#define MTOD(m, t) ((t)((m)->m_data)) - MBUF_UNLOCK(); - size = round_page_32(i * bufsize); - page = kmem_mb_alloc(mb_map, size); +/* + * Macros to obtain cluster index and base cluster address. + */ +#define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT) +#define CLTOM(x) ((union mcluster *)(mbutl + (x))) - if (page == 0) { - size = NBPG; /* Try for 1 if failed */ - page = kmem_mb_alloc(mb_map, size); - } - MBUF_LOCK(); - - if (page) { - numpages = size / NBPG; - for (i = 0; i < numpages; i++, page += NBPG) { - if (((int)page & PGOFSET) == 0) { - ppnum_t offset = ((char *)page - (char *)mbutl)/NBPG; - ppnum_t new_page = pmap_find_phys(kernel_pmap, (vm_address_t) page); - - /* - * In the case of no mapper being available - * the following code nops and returns the - * input page, if there is a mapper the I/O - * page appropriate is returned. - */ - new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page); - mcl_paddr[offset] = new_page << 12; - } - if (bufsize == MCLBYTES) { - union mcluster *mcl = (union mcluster *)page; - - if (++mclrefcnt[mtocl(mcl)] != 0) - panic("m_clalloc already there"); - mcl->mcl_next = mclfree; - mclfree = mcl++; - if (++mclrefcnt[mtocl(mcl)] != 0) - panic("m_clalloc already there"); - mcl->mcl_next = mclfree; - mclfree = mcl++; - } else { - union mbigcluster *mbc = (union mbigcluster *)page; +/* + * Macro to find the mbuf index relative to the cluster base. 
+ */ +#define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8) - if (++mclrefcnt[mtocl(mbc)] != 0) - panic("m_clalloc already there"); - if (++mclrefcnt[mtocl(mbc) + 1] != 0) - panic("m_clalloc already there"); +/* + * Macros used during mbuf and cluster initialization. + */ +#define MBUF_INIT(m, pkthdr, type) { \ + _MCHECK(m); \ + (m)->m_next = (m)->m_nextpkt = NULL; \ + (m)->m_len = 0; \ + (m)->m_type = type; \ + if ((pkthdr) == 0) { \ + (m)->m_data = (m)->m_dat; \ + (m)->m_flags = 0; \ + } else { \ + (m)->m_data = (m)->m_pktdat; \ + (m)->m_flags = M_PKTHDR; \ + (m)->m_pkthdr.rcvif = NULL; \ + (m)->m_pkthdr.len = 0; \ + (m)->m_pkthdr.header = NULL; \ + (m)->m_pkthdr.csum_flags = 0; \ + (m)->m_pkthdr.csum_data = 0; \ + (m)->m_pkthdr.reserved0 = NULL; \ + (m)->m_pkthdr.vlan_tag = 0; \ + (m)->m_pkthdr.socket_id = 0; \ + m_tag_init(m); \ + } \ +} - mbc->mbc_next = mbigfree; - mbigfree = mbc; - } - } - if (bufsize == MCLBYTES) { - int numcl = numpages << 1; - mbstat.m_clfree += numcl; - mbstat.m_clusters += numcl; - return (numcl); - } else { - mbstat.m_bigclfree += numpages; - mbstat.m_bigclusters += numpages; - return (numpages); - } - } /* else ... */ -out: - /* - * When non-blocking we kick a thread if we havve to grow the - * pool or if the number of free clusters is less than requested. 
- */ - if (bufsize == MCLBYTES) { - if (i > 0) { - /* Remember total number of clusters needed at this time */ - i += mbstat.m_clusters; - if (i > mbuf_expand_mcl) { - mbuf_expand_mcl = i; - if (mbuf_expand_thread_initialized) - wakeup((caddr_t)&mbuf_expand_thread_wakeup); - } - } - - if (mbstat.m_clfree >= num) - return 1; - } else { - if (i > 0) { - /* Remember total number of 4KB clusters needed at this time */ - i += mbstat.m_bigclusters; - if (i > mbuf_expand_big) { - mbuf_expand_big = i; - if (mbuf_expand_thread_initialized) - wakeup((caddr_t)&mbuf_expand_thread_wakeup); - } - } - - if (mbstat.m_bigclfree >= num) - return 1; - } - return 0; +#define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \ + (m)->m_data = (m)->m_ext.ext_buf = (buf); \ + (m)->m_flags |= M_EXT; \ + (m)->m_ext.ext_size = (size); \ + (m)->m_ext.ext_free = (free); \ + (m)->m_ext.ext_arg = (arg); \ + (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \ + &(m)->m_ext.ext_refs; \ + MEXT_RFA(m) = (rfa); \ + MEXT_REF(m) = (ref); \ + MEXT_FLAGS(m) = (flag); \ } +#define MBUF_CL_INIT(m, buf, rfa, ref, flag) \ + MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag) + +#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \ + MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag) + +#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \ + MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag) + /* - * Add more free mbufs by cutting up a cluster. + * Macro to convert BSD malloc sleep flag to mcache's */ -static int -m_expand(int canwait) -{ - caddr_t mcl; +#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP) - if (mbstat.m_clfree < (mbstat.m_clusters >> 4)) { - /* - * 1/16th of the total number of cluster mbufs allocated is - * reserved for large packets. The number reserved must - * always be < 1/2, or future allocation will be prevented. 
- */ - (void)m_clalloc(1, canwait, MCLBYTES, 0); - MBUF_UNLOCK(); - if (mbstat.m_clfree < (mbstat.m_clusters >> 4)) - return 0; - } +/* + * The structure that holds all mbuf class statistics exportable via sysctl. + * Similar to mbstat structure, the mb_stat structure is protected by the + * global mbuf lock. It contains additional information about the classes + * that allows for a more accurate view of the state of the allocator. + */ +struct mb_stat *mb_stat; - MCLALLOC(mcl, canwait); - if (mcl) { - struct mbuf *m = (struct mbuf *)mcl; - int i = NMBPCL; - MBUF_LOCK(); - mbstat.m_mtypes[MT_FREE] += i; - mbstat.m_mbufs += i; - while (i--) { - _MFREE_MUNGE(m); - m->m_type = MT_FREE; - m->m_next = mfree; - mfree = m++; - } - i = m_want; - m_want = 0; - MBUF_UNLOCK(); - if (i) wakeup((caddr_t)&mfree); - return 1; - } - return 0; -} +#define MB_STAT_SIZE(n) \ + ((size_t)(&((mb_stat_t *)0)->mbs_class[n])) /* - * When MGET failes, ask protocols to free space when short of memory, - * then re-attempt to allocate an mbuf. + * The legacy structure holding all of the mbuf allocation statistics. + * The actual statistics used by the kernel are stored in the mbuf_table + * instead, and are updated atomically while the global mbuf lock is held. + * They are mirrored in mbstat to support legacy applications (e.g. netstat). + * Unlike before, the kernel no longer relies on the contents of mbstat for + * its operations (e.g. cluster expansion) because the structure is exposed + * to outside and could possibly be modified, therefore making it unsafe. + * With the exception of the mbstat.m_mtypes array (see below), all of the + * statistics are updated as they change. 
*/ -struct mbuf * -m_retry( - int canwait, - int type) -{ - struct mbuf *m; - int wait; +struct mbstat mbstat; - for (;;) { - (void) m_expand(canwait); - _MINTGET(m, type); - if (m) { - (m)->m_next = (m)->m_nextpkt = 0; - (m)->m_type = (type); - (m)->m_data = (m)->m_dat; - (m)->m_flags = 0; - (m)->m_len = 0; - } - if (m || canwait == M_DONTWAIT) - break; - MBUF_LOCK(); - wait = m_want++; - mbuf_expand_mcl++; - if (wait == 0) - mbstat.m_drain++; - else - mbstat.m_wait++; - MBUF_UNLOCK(); - - if (mbuf_expand_thread_initialized) - wakeup((caddr_t)&mbuf_expand_thread_wakeup); - - if (wait == 0) { - m_reclaim(); - } else { - struct timespec ts; - ts.tv_sec = 1; - ts.tv_nsec = 0; - (void) msleep((caddr_t)&mfree, 0, (PZERO-1) | PDROP, "m_retry", &ts); - } - } - if (m == 0) - mbstat.m_drops++; - return (m); -} +#define MBSTAT_MTYPES_MAX \ + (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0])) /* - * As above; retry an MGETHDR. + * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated + * atomically and stored in a per-CPU structure which is lock-free; this is + * done in order to avoid writing to the global mbstat data structure which + * would cause false sharing. During sysctl request for kern.ipc.mbstat, + * the statistics across all CPUs will be converged into the mbstat.m_mtypes + * array and returned to the application. Any updates for types greater or + * equal than MT_MAX would be done atomically to the mbstat; this slows down + * performance but is okay since the kernel uses only up to MT_MAX-1 while + * anything beyond that (up to type 255) is considered a corner case. 
*/ -struct mbuf * -m_retryhdr( - int canwait, - int type) -{ - struct mbuf *m; +typedef struct { + unsigned int cpu_mtypes[MT_MAX]; +} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t; - if ((m = m_retry(canwait, type))) { - m->m_next = m->m_nextpkt = 0; - m->m_flags |= M_PKTHDR; - m->m_data = m->m_pktdat; - _M_CLEAR_PKTHDR(m); - } - return (m); -} +typedef struct { + mtypes_cpu_t mbs_cpu[1]; +} mbuf_mtypes_t; -void -m_reclaim(void) -{ - do_reclaim = 1; /* drain is performed in pfslowtimo(), to avoid deadlocks */ - mbstat.m_drain++; +static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ + +#define MBUF_MTYPES_SIZE(n) \ + ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n])) + +#define MTYPES_CPU(p) \ + ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) + +/* This should be in a header file */ +#define atomic_add_32(a, n) ((void) OSAddAtomic(n, (volatile SInt32 *)a)) + +#define mtype_stat_add(type, n) { \ + if ((unsigned)(type) < MT_MAX) { \ + mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \ + atomic_add_32(&mbs->cpu_mtypes[type], n); \ + } else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) { \ + atomic_add_32(&mbstat.m_mtypes[type], n); \ + } \ } -/* - * Space allocation routines. - * These are also available as macros - * for critical paths. 
- */ -struct mbuf * -m_get( - int nowait, - int type) -{ - struct mbuf *m; +#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n)) +#define mtype_stat_inc(t) mtype_stat_add(t, 1) +#define mtype_stat_dec(t) mtype_stat_sub(t, 1) - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); +static int +mbstat_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int m, n; + mtypes_cpu_t mtc; - _MINTGET(m, type); - if (m) { - m->m_next = m->m_nextpkt = 0; - m->m_type = type; - m->m_data = m->m_dat; - m->m_flags = 0; - m->m_len = 0; - } else - (m) = m_retry(nowait, type); + bzero(&mtc, sizeof (mtc)); + for (m = 0; m < ncpu; m++) { + mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m]; + mtypes_cpu_t temp; - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); + bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes, + sizeof (temp.cpu_mtypes)); + for (n = 0; n < MT_MAX; n++) + mtc.cpu_mtypes[n] += temp.cpu_mtypes[n]; + } + lck_mtx_lock(mbuf_mlock); + for (n = 0; n < MT_MAX; n++) + mbstat.m_mtypes[n] = mtc.cpu_mtypes[n]; + lck_mtx_unlock(mbuf_mlock); - return (m); + return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat))); } -struct mbuf * -m_gethdr( - int nowait, - int type) +static int +mb_stat_sysctl SYSCTL_HANDLER_ARGS { - struct mbuf *m; +#pragma unused(oidp, arg1, arg2) + mcache_t *cp; + mcache_cpu_t *ccp; + mb_class_stat_t *sp; + int k, m, bktsize; + + lck_mtx_lock(mbuf_mlock); + for (k = 0; k < NELEM(mbuf_table); k++) { + cp = m_cache(k); + ccp = &cp->mc_cpu[0]; + bktsize = ccp->cc_bktsize; + sp = mbuf_table[k].mtbl_stats; + + if (cp->mc_flags & MCF_NOCPUCACHE) + sp->mbcl_mc_state = MCS_DISABLED; + else if (cp->mc_purge_cnt > 0) + sp->mbcl_mc_state = MCS_PURGING; + else if (bktsize == 0) + sp->mbcl_mc_state = MCS_OFFLINE; + else + sp->mbcl_mc_state = MCS_ONLINE; + + sp->mbcl_mc_cached = 0; + for (m = 0; m < ncpu; m++) { + ccp = &cp->mc_cpu[m]; + if (ccp->cc_objs > 0) + sp->mbcl_mc_cached += ccp->cc_objs; + if (ccp->cc_pobjs > 0) + 
sp->mbcl_mc_cached += ccp->cc_pobjs; + } + sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize); + sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached - + sp->mbcl_infree; + + sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt; + sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt; + sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt; + + /* Calculate total count specific to each class */ + sp->mbcl_ctotal = sp->mbcl_total; + switch (m_class(k)) { + case MC_MBUF: + /* Deduct mbufs used in composite caches */ + sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + + m_total(MC_MBUF_BIGCL)); + break; - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); + case MC_CL: + /* Deduct clusters used in composite cache and mbufs */ + sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + + (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL)); + break; + case MC_BIGCL: + /* Deduct clusters used in composite cache */ + sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL); + break; - _MINTGET(m, type); - if (m) { - m->m_next = m->m_nextpkt = 0; - m->m_type = type; - m->m_data = m->m_pktdat; - m->m_flags = M_PKTHDR; - m->m_len = 0; - _M_CLEAR_PKTHDR(m) - } else - m = m_retryhdr(nowait, type); + case MC_16KCL: + /* Deduct clusters used in composite cache */ + sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL); + break; + + default: + break; + } + } + lck_mtx_unlock(mbuf_mlock); - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); + return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table)))); +} +static inline void +m_incref(struct mbuf *m) +{ + UInt32 old, new; + volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m); - return m; + do { + old = *addr; + new = old + 1; + ASSERT(new != 0); + } while (!OSCompareAndSwap(old, new, addr)); } -struct mbuf * -m_getclr( - int nowait, - int type) +static inline u_int32_t +m_decref(struct mbuf *m) { - struct mbuf *m; + UInt32 old, new; + volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m); - MGET(m, nowait, type); - if (m == 0) - return (0); - 
bzero(mtod(m, caddr_t), MLEN); - return (m); + do { + old = *addr; + new = old - 1; + ASSERT(old != 0); + } while (!OSCompareAndSwap(old, new, addr)); + + return (new); } -struct mbuf * -m_free( - struct mbuf *m) +static void +mbuf_table_init(void) { - struct mbuf *n = m->m_next; - int i; + int m; - m_range_check(m); - m_range_check(mfree); - m_range_check(mclfree); + MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)), + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(mb_stat != NULL); - if (m->m_type == MT_FREE) - panic("freeing free mbuf"); + mb_stat->mbs_cnt = NELEM(mbuf_table); + for (m = 0; m < NELEM(mbuf_table); m++) + mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m]; - /* Free the aux data if there is any */ - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.aux) - { - m_freem(m->m_pkthdr.aux); - } - if ((m->m_flags & M_PKTHDR) != 0) - m_tag_delete_chain(m, NULL); +#if CONFIG_MBUF_JUMBO + /* + * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do + * this only on platforms where jumbo cluster pool is enabled. + */ + njcl = nmbclusters / 3; + njclbytes = M16KCLBYTES; +#endif /* CONFIG_MBUF_JUMBO */ - MBUF_LOCK(); - if ((m->m_flags & M_EXT)) - { - if (MCLHASREFERENCE(m)) { - remque((queue_t)&m->m_ext.ext_refs); - } else if (m->m_ext.ext_free == NULL) { - union mcluster *mcl= (union mcluster *)m->m_ext.ext_buf; - - m_range_check(mcl); - - if (_MCLUNREF(mcl)) { - mcl->mcl_next = mclfree; - mclfree = mcl; - ++mbstat.m_clfree; - } -#ifdef COMMENT_OUT -/* *** Since m_split() increments "mclrefcnt[mtocl(m->m_ext.ext_buf)]", - and AppleTalk ADSP uses m_split(), this incorrect sanity check - caused a panic. -*** */ - else /* sanity check - not referenced this way */ - panic("m_free m_ext cluster not free"); -#endif - } else { - (*(m->m_ext.ext_free))(m->m_ext.ext_buf, - m->m_ext.ext_size, m->m_ext.ext_arg); - } + /* + * nclusters is going to be split in 2 to hold both the 2K + * and the 4K pools, so make sure each half is even. 
+ */ + nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4); + if (njcl > 0) { + /* + * Each jumbo cluster takes 8 2K clusters, so make + * sure that the pool size is evenly divisible by 8. + */ + njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8); } - mbstat.m_mtypes[m->m_type]--; - (void) _MCLUNREF(m); - _MFREE_MUNGE(m); - m->m_type = MT_FREE; - mbstat.m_mtypes[m->m_type]++; - m->m_flags = 0; - m->m_next = mfree; - m->m_len = 0; - mfree = m; - i = m_want; - m_want = 0; - MBUF_UNLOCK(); - if (i) wakeup((caddr_t)&mfree); - return (n); -} -/* m_mclget() add an mbuf cluster to a normal mbuf */ -struct mbuf * -m_mclget( - struct mbuf *m, - int nowait) -{ - MCLALLOC(m->m_ext.ext_buf, nowait); - if (m->m_ext.ext_buf) { - m->m_data = m->m_ext.ext_buf; - m->m_flags |= M_EXT; - m->m_ext.ext_size = MCLBYTES; - m->m_ext.ext_free = 0; - m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = - &m->m_ext.ext_refs; +#if CONFIG_MBUF_NOEXPAND + /* Only use 4k clusters if we're setting aside more than 256k */ + if (nmbclusters <= 128) { + maxmbufcl = nmbclusters / 4; + } else { + /* Half to big clusters, half to small */ + maxmbufcl = (nmbclusters / 4) * 3; } - - return m; -} +#endif /* CONFIG_MBUF_NOEXPAND */ + + /* + * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th + * of the total number of 2K clusters allocated is reserved and cannot + * be turned into mbufs. It can only be used for pure cluster objects. + */ + m_minlimit(MC_CL) = (nclusters >> 5); + m_maxlimit(MC_CL) = (nclusters >> 1); + m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES; + (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl"); + + /* + * The remaining (15/16th) can be turned into mbufs. + */ + m_minlimit(MC_MBUF) = 0; + m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL; + m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE; + (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf"); + + /* + * The other 1/2 of the map is reserved for 4K clusters. 
+ */ + m_minlimit(MC_BIGCL) = 0; + m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1; + m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG; + (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl"); + + /* + * Set limits for the composite classes. + */ + m_minlimit(MC_MBUF_CL) = 0; + m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL); + m_maxsize(MC_MBUF_CL) = MCLBYTES; + m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL); + (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl"); + + m_minlimit(MC_MBUF_BIGCL) = 0; + m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL); + m_maxsize(MC_MBUF_BIGCL) = NBPG; + m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL); + (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl"); + + /* + * And for jumbo classes. + */ + m_minlimit(MC_16KCL) = 0; + m_maxlimit(MC_16KCL) = (njcl >> 3); + m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES; + (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl"); + + m_minlimit(MC_MBUF_16KCL) = 0; + m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL); + m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES; + m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL); + (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl"); + + /* + * Initialize the legacy mbstat structure. 
+ */ + bzero(&mbstat, sizeof (mbstat)); + mbstat.m_msize = m_maxsize(MC_MBUF); + mbstat.m_mclbytes = m_maxsize(MC_CL); + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL); +} + +__private_extern__ void +mbinit(void) +{ + unsigned int m; + int initmcl = MINCL; + int mcl_pages; + void *buf; + + if (nmbclusters == 0) + nmbclusters = NMBCLUSTERS; + + /* Setup the mbuf table */ + mbuf_table_init(); + + /* Global lock for common layer */ + mbuf_mlock_grp_attr = lck_grp_attr_alloc_init(); + mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr); + mbuf_mlock_attr = lck_attr_alloc_init(); + mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr); + + /* Allocate cluster slabs table */ + maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB; + MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *), + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(slabstbl != NULL); + + /* Allocate audit structures if needed */ + PE_parse_boot_arg("mbuf_debug", &mbuf_debug); + mbuf_debug |= mcache_getflags(); + if (mbuf_debug & MCF_AUDIT) { + MALLOC(mclaudit, mcl_audit_t *, + nmbclusters * sizeof (*mclaudit), M_TEMP, + M_WAITOK | M_ZERO); + VERIFY(mclaudit != NULL); + + mcl_audit_con_cache = mcache_create("mcl_audit_contents", + AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP); + VERIFY(mcl_audit_con_cache != NULL); + } + + /* Calculate the number of pages assigned to the cluster pool */ + mcl_pages = nmbclusters/(NBPG/CLBYTES); + MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK); + VERIFY(mcl_paddr != NULL); + + /* Register with the I/O Bus mapper */ + mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); + bzero((char *)mcl_paddr, mcl_pages * sizeof (int)); + + embutl = (union mcluster *) + ((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); + + PE_parse_boot_arg("initmcl", &initmcl); + + lck_mtx_lock(mbuf_mlock); + + if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 
0) + panic("mbinit: m_clalloc failed\n"); + + lck_mtx_unlock(mbuf_mlock); + + (void) kernel_thread(kernel_task, mbuf_worker_thread_init); + + ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref), + 0, 0, MCR_SLEEP); + + /* Create the cache for each class */ + for (m = 0; m < NELEM(mbuf_table); m++) { + void *allocfunc, *freefunc, *auditfunc; + u_int32_t flags; + + flags = mbuf_debug; + if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL || + m_class(m) == MC_MBUF_16KCL) { + allocfunc = mbuf_cslab_alloc; + freefunc = mbuf_cslab_free; + auditfunc = mbuf_cslab_audit; + } else { + allocfunc = mbuf_slab_alloc; + freefunc = mbuf_slab_free; + auditfunc = mbuf_slab_audit; + } + + /* + * Disable per-CPU caches for jumbo classes if there + * is no jumbo cluster pool available in the system. + * The cache itself is still created (but will never + * be populated) since it simplifies the code. + */ + if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) && + njcl == 0) + flags |= MCF_NOCPUCACHE; + + m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m), + allocfunc, freefunc, auditfunc, mbuf_slab_notify, + (void *)m, flags, MCR_SLEEP); + } + + /* + * Allocate structure for per-CPU statistics that's aligned + * on the CPU cache boundary; this code assumes that we never + * uninitialize this framework, since the original address + * before alignment is not saved. + */ + ncpu = ml_get_max_cpus(); + MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE, + M_TEMP, M_WAITOK); + VERIFY(buf != NULL); + + mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE); + bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu)); + + printf("mbinit: done\n"); +} + +/* + * Obtain a slab of object(s) from the class's freelist. 
+ */ +static mcache_obj_t * +slab_alloc(mbuf_class_t class, int wait) +{ + mcl_slab_t *sp; + mcache_obj_t *buf; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(class != MC_16KCL || njcl > 0); + + /* This should always be NULL for us */ + VERIFY(m_cobjlist(class) == NULL); + + /* + * Treat composite objects as having longer lifespan by using + * a slab from the reverse direction, in hoping that this could + * reduce the probability of fragmentation for slabs that hold + * more than one buffer chunks (e.g. mbuf slabs). For other + * slabs, this probably doesn't make much of a difference. + */ + if (class == MC_MBUF && (wait & MCR_COMP)) + sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); + else + sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); + + if (sp == NULL) { + VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0); + /* The slab list for this class is empty */ + return (NULL); + } + + VERIFY(m_infree(class) > 0); + VERIFY(!slab_is_detached(sp)); + VERIFY(sp->sl_class == class && + (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); + buf = sp->sl_head; + VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf)); + + if (class == MC_MBUF) { + sp->sl_head = buf->obj_next; + VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1)); + } else { + sp->sl_head = NULL; + } + if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) { + slab_nextptr_panic(sp, sp->sl_head); + /* In case sl_head is in the map but not in the slab */ + VERIFY(slab_inrange(sp, sp->sl_head)); + /* NOTREACHED */ + } + + /* Increment slab reference */ + sp->sl_refcnt++; + + if (mclaudit != NULL) { + mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); + mca->mca_uflags = 0; + /* Save contents on mbuf objects only */ + if (class == MC_MBUF) + mca->mca_uflags |= MB_SCVALID; + } + + if (class == MC_CL) { + mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + /* + * A 2K cluster slab can have at most 1 reference. 
+ */ + VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && + sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL); + } else if (class == MC_BIGCL) { + mcl_slab_t *nsp = sp->sl_next; + mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); + /* + * Increment 2nd slab. A 4K big cluster takes + * 2 slabs, each having at most 1 reference. + */ + VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && + sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL); + /* Next slab must already be present */ + VERIFY(nsp != NULL); + nsp->sl_refcnt++; + VERIFY(!slab_is_detached(nsp)); + VERIFY(nsp->sl_class == MC_BIGCL && + nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && + nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && + nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && + nsp->sl_head == NULL); + } else if (class == MC_16KCL) { + mcl_slab_t *nsp; + int k; + + --m_infree(MC_16KCL); + VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && + sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); + /* + * Increment 2nd-8th slab. A 16K big cluster takes + * 8 cluster slabs, each having at most 1 reference. + */ + for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + nsp->sl_refcnt++; + VERIFY(!slab_is_detached(nsp)); + VERIFY(nsp->sl_class == MC_16KCL && + nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && + nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && + nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && + nsp->sl_head == NULL); + } + } else { + ASSERT(class == MC_MBUF); + --m_infree(MC_MBUF); + /* + * If auditing is turned on, this check is + * deferred until later in mbuf_slab_audit(). + */ + if (mclaudit == NULL) + _MCHECK((struct mbuf *)buf); + /* + * Since we have incremented the reference count above, + * an mbuf slab (formerly a 2K cluster slab that was cut + * up into mbufs) must have a reference count between 1 + * and NMBPCL at this point. 
+ */ + VERIFY(sp->sl_refcnt >= 1 && + (unsigned short)sp->sl_refcnt <= NMBPCL && + sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); + VERIFY((unsigned short)sp->sl_refcnt < NMBPCL || + sp->sl_head == NULL); + } + + /* If empty, remove this slab from the class's freelist */ + if (sp->sl_head == NULL) { + VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL); + slab_remove(sp, class); + } + + return (buf); +} + +/* + * Place a slab of object(s) back into a class's slab list. + */ +static void +slab_free(mbuf_class_t class, mcache_obj_t *buf) +{ + mcl_slab_t *sp; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(class != MC_16KCL || njcl > 0); + VERIFY(buf->obj_next == NULL); + sp = slab_get(buf); + VERIFY(sp->sl_class == class && slab_inrange(sp, buf) && + (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); + + /* Decrement slab reference */ + sp->sl_refcnt--; + + if (class == MC_CL || class == MC_BIGCL) { + VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); + /* + * A 2K cluster slab can have at most 1 reference + * which must be 0 at this point. + */ + VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); + VERIFY(slab_is_detached(sp)); + if (class == MC_BIGCL) { + mcl_slab_t *nsp = sp->sl_next; + VERIFY(IS_P2ALIGNED(buf, NBPG)); + /* Next slab must already be present */ + VERIFY(nsp != NULL); + /* Decrement 2nd slab reference */ + nsp->sl_refcnt--; + /* + * A 4K big cluster takes 2 slabs, both + * must now have 0 reference. + */ + VERIFY(slab_is_detached(nsp)); + VERIFY(nsp->sl_class == MC_BIGCL && + (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && + nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && + nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && + nsp->sl_head == NULL); + } + } else if (class == MC_16KCL) { + mcl_slab_t *nsp; + int k; + /* + * A 16K cluster takes 8 cluster slabs, all must + * now have 0 reference. 
+ */ + VERIFY(IS_P2ALIGNED(buf, NBPG)); + VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && + sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); + VERIFY(slab_is_detached(sp)); + for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + nsp->sl_refcnt--; + VERIFY(slab_is_detached(nsp)); + VERIFY(nsp->sl_class == MC_16KCL && + (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && + nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && + nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && + nsp->sl_head == NULL); + } + } else { + /* + * An mbuf slab has a total of NMBPL reference counts. + * Since we have decremented the reference above, it + * must now be between 0 and NMBPCL-1. + */ + VERIFY(sp->sl_refcnt >= 0 && + (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) && + sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); + VERIFY(sp->sl_refcnt < (NMBPCL - 1) || + (slab_is_detached(sp) && sp->sl_head == NULL)); + } + + /* + * When auditing is enabled, ensure that the buffer still + * contains the free pattern. Otherwise it got corrupted + * while at the CPU cache layer. 
+ */ + if (mclaudit != NULL) { + mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); + mcache_audit_free_verify(mca, buf, 0, m_maxsize(class)); + mca->mca_uflags &= ~MB_SCVALID; + } + + if (class == MC_CL) { + mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + } else if (class == MC_BIGCL) { + mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); + } else if (class == MC_16KCL) { + ++m_infree(MC_16KCL); + } else { + ++m_infree(MC_MBUF); + buf->obj_next = sp->sl_head; + } + sp->sl_head = buf; + + /* All mbufs are freed; return the cluster that we stole earlier */ + if (sp->sl_refcnt == 0 && class == MC_MBUF) { + int i = NMBPCL; + + m_total(MC_MBUF) -= NMBPCL; + mbstat.m_mbufs = m_total(MC_MBUF); + m_infree(MC_MBUF) -= NMBPCL; + mtype_stat_add(MT_FREE, -NMBPCL); + + while (i--) { + struct mbuf *m = sp->sl_head; + VERIFY(m != NULL); + sp->sl_head = m->m_next; + m->m_next = NULL; + } + VERIFY(sp->sl_head == NULL); + + /* Remove the slab from the mbuf class's slab list */ + slab_remove(sp, class); + + /* Reinitialize it as a 2K cluster slab */ + slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base, + sp->sl_len, 0, 1); + + if (mclaudit != NULL) + mcache_set_pattern(MCACHE_FREE_PATTERN, + (caddr_t)sp->sl_head, m_maxsize(MC_CL)); + + mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + + VERIFY(slab_is_detached(sp)); + /* And finally switch class */ + class = MC_CL; + } + + /* Reinsert the slab to the class's slab list */ + if (slab_is_detached(sp)) + slab_insert(sp, class); +} + +/* + * Common allocator for rudimentary objects called by the CPU cache layer + * during an allocation request whenever there is no available element in the + * bucket layer. It returns one or more elements from the appropriate global + * freelist. If the freelist is empty, it will attempt to populate it and + * retry the allocation. 
+ */ +static unsigned int +mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) +{ + mbuf_class_t class = (mbuf_class_t)arg; + unsigned int need = num; + mcache_obj_t **list = *plist; + + ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); + ASSERT(need > 0); + + lck_mtx_lock(mbuf_mlock); + + for (;;) { + if ((*list = slab_alloc(class, wait)) != NULL) { + (*list)->obj_next = NULL; + list = *plist = &(*list)->obj_next; + + if (--need == 0) { + /* + * If the number of elements in freelist has + * dropped below low watermark, asynchronously + * populate the freelist now rather than doing + * it later when we run out of elements. + */ + if (!mbuf_cached_above(class, wait) && + m_infree(class) < m_total(class) >> 5) { + (void) freelist_populate(class, 1, + M_DONTWAIT); + } + break; + } + } else { + VERIFY(m_infree(class) == 0 || class == MC_CL); + + (void) freelist_populate(class, 1, + (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT); + + if (m_infree(class) > 0) + continue; + + /* Check if there's anything at the cache layer */ + if (mbuf_cached_above(class, wait)) + break; + + /* We have nothing and cannot block; give up */ + if (wait & MCR_NOSLEEP) { + if (!(wait & MCR_TRYHARD)) { + m_fail_cnt(class)++; + mbstat.m_drops++; + break; + } + } + + /* + * If the freelist is still empty and the caller is + * willing to be blocked, sleep on the wait channel + * until an element is available. Otherwise, if + * MCR_TRYHARD is set, do our best to satisfy the + * request without having to go to sleep. + */ + if (mbuf_worker_ready && + mbuf_sleep(class, need, wait)) + break; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + } + } + + m_alloc_cnt(class) += num - need; + lck_mtx_unlock(mbuf_mlock); + + return (num - need); +} + +/* + * Common de-allocator for rudimentary objects called by the CPU cache + * layer when one or more elements need to be returned to the appropriate + * global freelist. 
+ */ +static void +mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) +{ + mbuf_class_t class = (mbuf_class_t)arg; + mcache_obj_t *nlist; + unsigned int num = 0; + int w; + + ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); + + lck_mtx_lock(mbuf_mlock); + + for (;;) { + nlist = list->obj_next; + list->obj_next = NULL; + slab_free(class, list); + ++num; + if ((list = nlist) == NULL) + break; + } + m_free_cnt(class) += num; + + if ((w = mb_waiters) > 0) + mb_waiters = 0; + + lck_mtx_unlock(mbuf_mlock); + + if (w != 0) + wakeup(mb_waitchan); +} + +/* + * Common auditor for rudimentary objects called by the CPU cache layer + * during an allocation or free request. For the former, this is called + * after the objects are obtained from either the bucket or slab layer + * and before they are returned to the caller. For the latter, this is + * called immediately during free and before placing the objects into + * the bucket or slab layer. + */ +static void +mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) +{ + mbuf_class_t class = (mbuf_class_t)arg; + mcache_audit_t *mca; + + ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); + + while (list != NULL) { + lck_mtx_lock(mbuf_mlock); + mca = mcl_audit_buf2mca(class, list); + + /* Do the sanity checks */ + if (class == MC_MBUF) { + mcl_audit_mbuf(mca, list, FALSE, alloc); + ASSERT(mca->mca_uflags & MB_SCVALID); + } else { + mcl_audit_cluster(mca, list, m_maxsize(class), + alloc, TRUE); + ASSERT(!(mca->mca_uflags & MB_SCVALID)); + } + /* Record this transaction */ + mcache_buffer_log(mca, list, m_cache(class)); + if (alloc) + mca->mca_uflags |= MB_INUSE; + else + mca->mca_uflags &= ~MB_INUSE; + /* Unpair the object (unconditionally) */ + mca->mca_uptr = NULL; + lck_mtx_unlock(mbuf_mlock); + + list = list->obj_next; + } +} + +/* + * Common notify routine for all caches. It is called by mcache when + * one or more objects get freed. 
We use this indication to trigger + * the wakeup of any sleeping threads so that they can retry their + * allocation requests. + */ +static void +mbuf_slab_notify(void *arg, u_int32_t reason) +{ + mbuf_class_t class = (mbuf_class_t)arg; + int w; + + ASSERT(MBUF_CLASS_VALID(class)); + + if (reason != MCN_RETRYALLOC) + return; + + lck_mtx_lock(mbuf_mlock); + if ((w = mb_waiters) > 0) { + m_notified(class)++; + mb_waiters = 0; + } + lck_mtx_unlock(mbuf_mlock); + + if (w != 0) + wakeup(mb_waitchan); +} + +/* + * Obtain object(s) from the composite class's freelist. + */ +static unsigned int +cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) +{ + unsigned int need = num; + mcl_slab_t *sp, *clsp, *nsp; + struct mbuf *m; + mcache_obj_t **list = *plist; + void *cl; + + VERIFY(need > 0); + VERIFY(class != MC_MBUF_16KCL || njcl > 0); + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + /* Get what we can from the freelist */ + while ((*list = m_cobjlist(class)) != NULL) { + MRANGE(*list); + + m = (struct mbuf *)*list; + sp = slab_get(m); + cl = m->m_ext.ext_buf; + clsp = slab_get(cl); + VERIFY(m->m_flags == M_EXT && cl != NULL); + VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m)); + VERIFY(clsp->sl_refcnt == 1); + if (class == MC_MBUF_BIGCL) { + nsp = clsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + VERIFY(nsp->sl_refcnt == 1); + } else if (class == MC_MBUF_16KCL) { + int k; + for (nsp = clsp, k = 1; + k < (M16KCLBYTES / MCLBYTES); k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + VERIFY(nsp->sl_refcnt == 1); + } + } + + if ((m_cobjlist(class) = (*list)->obj_next) != NULL && + !MBUF_IN_MAP(m_cobjlist(class))) { + slab_nextptr_panic(sp, m_cobjlist(class)); + /* NOTREACHED */ + } + (*list)->obj_next = NULL; + list = *plist = &(*list)->obj_next; + + if (--need == 0) + break; + } + m_infree(class) -= (num - need); + + return (num - need); +} + +/* + * Place object(s) back 
into a composite class's freelist.
+ */
+static unsigned int
+cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
+{
+	mcache_obj_t *o, *tail;
+	unsigned int num = 0;	/* objects processed; this is the return value */
+	struct mbuf *m, *ms;
+	mcache_audit_t *mca = NULL;
+	mcache_obj_t *ref_list = NULL;	/* ext_ref objects collected while purging */
+	mcl_slab_t *clsp, *nsp;
+	void *cl;
+
+	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
+	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
+	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+
+	o = tail = list;	/* tail follows o and ends up on the last object */
+
+	while ((m = ms = (struct mbuf *)o) != NULL) {
+		mcache_obj_t *rfa, *nexto = o->obj_next;	/* saved now; o->obj_next is cleared when purging */
+
+		/* Do the mbuf sanity checks */
+		if (mclaudit != NULL) {
+			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
+			mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
+			ms = (struct mbuf *)mca->mca_contents;	/* checks below read the shadow copy */
+		}
+
+		/* Do the cluster sanity checks */
+		cl = ms->m_ext.ext_buf;
+		clsp = slab_get(cl);
+		if (mclaudit != NULL) {
+			size_t size;
+			if (class == MC_MBUF_CL)
+				size = m_maxsize(MC_CL);
+			else if (class == MC_MBUF_BIGCL)
+				size = m_maxsize(MC_BIGCL);
+			else
+				size = m_maxsize(MC_16KCL);
+			mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
+			    (mcache_obj_t *)cl), cl, 0, size);
+		}
+		VERIFY(ms->m_type == MT_FREE);
+		VERIFY(ms->m_flags == M_EXT);
+		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
+		VERIFY(clsp->sl_refcnt == 1);
+		if (class == MC_MBUF_BIGCL) {
+			nsp = clsp->sl_next;
+			/* Next slab must already be present */
+			VERIFY(nsp != NULL);
+			VERIFY(nsp->sl_refcnt == 1);
+		} else if (class == MC_MBUF_16KCL) {
+			int k;
+			for (nsp = clsp, k = 1;
+			    k < (M16KCLBYTES / MCLBYTES); k++) {
+				nsp = nsp->sl_next;
+				/* Next slab must already be present */
+				VERIFY(nsp != NULL);
+				VERIFY(nsp->sl_refcnt == 1);
+			}
+		}
+
+		/*
+		 * If we're asked to purge, restore the actual mbuf using
+		 * contents of the shadow structure (if auditing is enabled)
+		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
+		 * about to free it and the attached cluster into their caches.
+		 */
+		if (purged) {
+			/* Restore constructed mbuf fields */
+			if (mclaudit != NULL)
+				mcl_audit_restore_mbuf(m, mca, TRUE);
+
+			MEXT_REF(m) = 0;
+			MEXT_FLAGS(m) = 0;
+
+			rfa = (mcache_obj_t *)MEXT_RFA(m);
+			rfa->obj_next = ref_list;	/* chain the ext_ref for the batched free at the end */
+			ref_list = rfa;
+			MEXT_RFA(m) = NULL;
+
+			m->m_type = MT_FREE;
+			m->m_flags = m->m_len = 0;
+			m->m_next = m->m_nextpkt = NULL;
+
+			/* Save mbuf fields and make auditing happy */
+			if (mclaudit != NULL)
+				mcl_audit_mbuf(mca, o, FALSE, FALSE);
+
+			VERIFY(m_total(class) > 0);
+			m_total(class)--;	/* one fewer constructed object in this class */
+
+			/* Free the mbuf */
+			o->obj_next = NULL;
+			slab_free(MC_MBUF, o);
+
+			/* And free the cluster */
+			((mcache_obj_t *)cl)->obj_next = NULL;
+			if (class == MC_MBUF_CL)
+				slab_free(MC_CL, cl);
+			else if (class == MC_MBUF_BIGCL)
+				slab_free(MC_BIGCL, cl);
+			else
+				slab_free(MC_16KCL, cl);
+		}
+
+		++num;
+		tail = o;
+		o = nexto;
+	}
+
+	if (!purged) {	/* objects stay constructed: push the whole chain on the freelist */
+		tail->obj_next = m_cobjlist(class);	/* NOTE(review): tail equals list, so a NULL list would fault here; callers presumably never pass one -- confirm */
+		m_cobjlist(class) = list;
+		m_infree(class) += num;
+	} else if (ref_list != NULL) {	/* purged: hand back the collected ext_ref objects in one call */
+		mcache_free_ext(ref_cache, ref_list);
+	}
+
+	return (num);
+}
+
+/*
+ * Common allocator for composite objects called by the CPU cache layer
+ * during an allocation request whenever there is no available element in
+ * the bucket layer.  It returns one or more composite elements from the
+ * appropriate global freelist.  If the freelist is empty, it will attempt
+ * to obtain the rudimentary objects from their caches and construct them
+ * into composite mbuf + cluster objects.
+ */
+static unsigned int
+mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
+    int wait)
+{
+	mbuf_class_t class = (mbuf_class_t)arg;
+	mcache_t *cp = NULL;	/* cluster cache matching class; set only if the mbuf allocation succeeds */
+	unsigned int num = 0, cnum = 0, want = needed;	/* num: taken from the freelist; cnum: newly constructed */
+	mcache_obj_t *ref_list = NULL;
+	mcache_obj_t *mp_list = NULL;
+	mcache_obj_t *clp_list = NULL;
+	mcache_obj_t **list;
+	struct ext_ref *rfa;
+	struct mbuf *m;
+	void *cl;
+
+	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
+	ASSERT(needed > 0);
+
+	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
+
+	/* There should not be any slab for this class */
+	VERIFY(m_slab_cnt(class) == 0 &&
+	    m_slablist(class).tqh_first == NULL &&
+	    m_slablist(class).tqh_last == NULL);
+
+	lck_mtx_lock(mbuf_mlock);
+
+	/* Try using the freelist first */
+	num = cslab_alloc(class, plist, needed);
+	list = *plist;	/* cslab_alloc() advanced *plist past what it handed out */
+	if (num == needed) {
+		m_alloc_cnt(class) += num;
+		lck_mtx_unlock(mbuf_mlock);
+		return (needed);
+	}
+
+	lck_mtx_unlock(mbuf_mlock);
+
+	/*
+	 * We could not satisfy the request using the freelist alone;
+	 * allocate from the appropriate rudimentary caches and use
+	 * whatever we can get to construct the composite objects.
+	 */
+	needed -= num;
+
+	/*
+	 * Mark these allocation requests as coming from a composite cache.
+	 * Also, if the caller is willing to be blocked, mark the request
+	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
+	 * slab layer waiting for the individual object when one or more
+	 * of the already-constructed composite objects are available.
+	 */
+	wait |= MCR_COMP;
+	if (!(wait & MCR_NOSLEEP))
+		wait |= MCR_FAILOK;
+
+	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);	/* needed shrinks to what was obtained */
+	if (needed == 0) {
+		ASSERT(mp_list == NULL);
+		goto fail;
+	}
+	if (class == MC_MBUF_CL)
+		cp = m_cache(MC_CL);
+	else if (class == MC_MBUF_BIGCL)
+		cp = m_cache(MC_BIGCL);
+	else
+		cp = m_cache(MC_16KCL);
+	needed = mcache_alloc_ext(cp, &clp_list, needed, wait);
+	if (needed == 0) {
+		ASSERT(clp_list == NULL);
+		goto fail;
+	}
+	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
+	if (needed == 0) {
+		ASSERT(ref_list == NULL);
+		goto fail;
+	}
+
+	/*
+	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
+	 * overs will get freed accordingly before we return to caller.
+	 */
+	for (cnum = 0; cnum < needed; cnum++) {
+		struct mbuf *ms;
+
+		m = ms = (struct mbuf *)mp_list;	/* pop one mbuf, one cluster and one ext_ref */
+		mp_list = mp_list->obj_next;
+
+		cl = clp_list;
+		clp_list = clp_list->obj_next;
+		((mcache_obj_t *)cl)->obj_next = NULL;
+
+		rfa = (struct ext_ref *)ref_list;
+		ref_list = ref_list->obj_next;
+		((mcache_obj_t *)rfa)->obj_next = NULL;
+
+		/*
+		 * If auditing is enabled, construct the shadow mbuf
+		 * in the audit structure instead of in the actual one.
+		 * mbuf_cslab_audit() will take care of restoring the
+		 * contents after the integrity check.
+		 */
+		if (mclaudit != NULL) {
+			mcache_audit_t *mca, *cl_mca;
+			size_t size;
+
+			lck_mtx_lock(mbuf_mlock);
+			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
+			ms = ((struct mbuf *)mca->mca_contents);
+			cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
+
+			/*
+			 * Pair them up.  Note that this is done at the time
+			 * the mbuf+cluster objects are constructed.  This
+			 * information should be treated as "best effort"
+			 * debugging hint since more than one mbuf can refer
+			 * to a cluster.  In that case, the cluster might not
+			 * be freed along with the mbuf it was paired with.
+			 */
+			mca->mca_uptr = cl_mca;
+			cl_mca->mca_uptr = mca;
+
+			ASSERT(mca->mca_uflags & MB_SCVALID);
+			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
+			lck_mtx_unlock(mbuf_mlock);
+
+			/* Technically, they are in the freelist */
+			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
+			    m_maxsize(MC_MBUF));
+			if (class == MC_MBUF_CL)
+				size = m_maxsize(MC_CL);
+			else if (class == MC_MBUF_BIGCL)
+				size = m_maxsize(MC_BIGCL);
+			else
+				size = m_maxsize(MC_16KCL);
+			mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size);
+		}
+
+		MBUF_INIT(ms, 0, MT_FREE);	/* ms is the shadow copy when auditing, otherwise m itself */
+		if (class == MC_MBUF_16KCL) {
+			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
+		} else if (class == MC_MBUF_BIGCL) {
+			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
+		} else {
+			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
+		}
+		VERIFY(ms->m_flags == M_EXT);
+		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
+
+		*list = (mcache_obj_t *)m;	/* append the real object (not the shadow) to the result chain */
+		(*list)->obj_next = NULL;
+		list = *plist = &(*list)->obj_next;
+	}
+
+fail:
+	/*
+	 * Free up what's left of the above.
+	 */
+	if (mp_list != NULL)
+		mcache_free_ext(m_cache(MC_MBUF), mp_list);
+	if (clp_list != NULL)
+		mcache_free_ext(cp, clp_list);
+	if (ref_list != NULL)
+		mcache_free_ext(ref_cache, ref_list);
+
+	lck_mtx_lock(mbuf_mlock);
+	if (num > 0 || cnum > 0) {
+		m_total(class) += cnum;	/* only newly constructed objects grow the class total */
+		VERIFY(m_total(class) <= m_maxlimit(class));
+		m_alloc_cnt(class) += num + cnum;
+	}
+	if ((num + cnum) < want)
+		m_fail_cnt(class) += (want - (num + cnum));	/* the shortfall is counted as failures */
+	lck_mtx_unlock(mbuf_mlock);
+
+	return (num + cnum);
+}
+
+/*
+ * Common de-allocator for composite objects called by the CPU cache
+ * layer when one or more elements need to be returned to the appropriate
+ * global freelist.
+ */
+static void
+mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
+{
+	mbuf_class_t class = (mbuf_class_t)arg;
+	unsigned int num;
+	int w;
+
+	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
+
+	lck_mtx_lock(mbuf_mlock);
+
+	num = cslab_free(class, list, purged);
+	m_free_cnt(class) += num;
+
+	if ((w = mb_waiters) > 0)	/* snapshot and reset the waiter count under the lock */
+		mb_waiters = 0;
+
+	lck_mtx_unlock(mbuf_mlock);
+
+	if (w != 0)	/* the wakeup is issued after the lock has been dropped */
+		wakeup(mb_waitchan);
+}
+
+/*
+ * Common auditor for composite objects called by the CPU cache layer
+ * during an allocation or free request.  For the former, this is called
+ * after the objects are obtained from either the bucket or slab layer
+ * and before they are returned to the caller.  For the latter, this is
+ * called immediately during free and before placing the objects into
+ * the bucket or slab layer.
+ *
+ * Both the mbuf and its attached cluster are audited and logged, and
+ * MB_COMP_INUSE is set (alloc) or cleared (free) in each of the two
+ * audit records.  The mbuf lock is taken and released once per object.
+ */
+static void
+mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
+{
+	mbuf_class_t class = (mbuf_class_t)arg;
+	mcache_audit_t *mca;
+	struct mbuf *m, *ms;
+	mcl_slab_t *clsp, *nsp;
+	size_t size;
+	void *cl;
+
+	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
+
+	while ((m = ms = (struct mbuf *)list) != NULL) {
+		lck_mtx_lock(mbuf_mlock);
+		/* Do the mbuf sanity checks and record its transaction */
+		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
+		mcl_audit_mbuf(mca, m, TRUE, alloc);
+		mcache_buffer_log(mca, m, m_cache(class));
+		if (alloc)
+			mca->mca_uflags |= MB_COMP_INUSE;
+		else
+			mca->mca_uflags &= ~MB_COMP_INUSE;
+
+		/*
+		 * Use the shadow mbuf in the audit structure if we are
+		 * freeing, since the contents of the actual mbuf have been
+		 * pattern-filled by the above call to mcl_audit_mbuf().
+		 */
+		if (!alloc)
+			ms = (struct mbuf *)mca->mca_contents;
+
+		/* Do the cluster sanity checks and record its transaction */
+		cl = ms->m_ext.ext_buf;
+		clsp = slab_get(cl);
+		VERIFY(ms->m_flags == M_EXT && cl != NULL);
+		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
+		VERIFY(clsp->sl_refcnt == 1);
+		if (class == MC_MBUF_BIGCL) {
+			nsp = clsp->sl_next;
+			/* Next slab must already be present */
+			VERIFY(nsp != NULL);
+			VERIFY(nsp->sl_refcnt == 1);
+		} else if (class == MC_MBUF_16KCL) {
+			int k;
+			for (nsp = clsp, k = 1;
+			    k < (M16KCLBYTES / MCLBYTES); k++) {
+				nsp = nsp->sl_next;
+				/* Next slab must already be present */
+				VERIFY(nsp != NULL);
+				VERIFY(nsp->sl_refcnt == 1);
+			}
+		}
+
+		mca = mcl_audit_buf2mca(MC_CL, cl);	/* from here on mca is the audit record of the cluster */
+		if (class == MC_MBUF_CL)
+			size = m_maxsize(MC_CL);
+		else if (class == MC_MBUF_BIGCL)
+			size = m_maxsize(MC_BIGCL);
+		else
+			size = m_maxsize(MC_16KCL);
+		mcl_audit_cluster(mca, cl, size, alloc, FALSE);
+		mcache_buffer_log(mca, cl, m_cache(class));
+		if (alloc)
+			mca->mca_uflags |= MB_COMP_INUSE;
+		else
+			mca->mca_uflags &= ~MB_COMP_INUSE;
+		lck_mtx_unlock(mbuf_mlock);
+
+		list = list->obj_next;
+	}
+}
+
+/*
+ * Allocate some number of mbuf clusters and place on cluster freelist.
+ *
+ * bufsize selects the cluster class and must equal m_maxsize() of MC_CL,
+ * MC_BIGCL or MC_16KCL.  The mbuf lock must be held on entry; it is
+ * dropped while pages are obtained from mb_map and is held again on
+ * every return.  When pages are mapped, the return value is the number
+ * of clusters that were created.  Otherwise (nothing to grow, M_DONTWAIT
+ * set in wait, or the page allocation failed) the function returns 1 if
+ * the class already has at least num free clusters and 0 if not, after
+ * raising the matching mbuf_expand_* target and issuing a wakeup on
+ * mbuf_worker_run when growth is wanted and mbuf_worker_ready is set.
+ */
+static int
+m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
+{
+	int i;
+	vm_size_t size = 0;
+	int numpages = 0;
+	vm_offset_t page = 0;
+	mcache_audit_t *mca_list = NULL;
+	mcache_obj_t *con_list = NULL;
+	mcl_slab_t *sp;
+
+	VERIFY(bufsize == m_maxsize(MC_CL) ||
+	    bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
+
+	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+
+	/*
+	 * Multiple threads may attempt to populate the cluster map one
+	 * after another.  Since we drop the lock below prior to acquiring
+	 * the physical page(s), our view of the cluster map may no longer
+	 * be accurate, and we could end up over-committing the pages beyond
+	 * the maximum allowed for each class.  To prevent it, this entire
+	 * operation (including the page mapping) is serialized.
+	 */
+	while (mb_clalloc_busy) {
+		mb_clalloc_waiters++;
+		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
+		    (PZERO-1), "m_clalloc", NULL);
+		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+	}
+
+	/* We are busy now; tell everyone else to go away */
+	mb_clalloc_busy = TRUE;
+
+	/*
+	 * Honor the caller's wish to block or not block.  We have a way
+	 * to grow the pool asynchronously using the mbuf worker thread.
+	 */
+	i = m_howmany(num, bufsize);	/* i is read again at out, so it must be set before any goto */
+	if (i == 0 || (wait & M_DONTWAIT))
+		goto out;
+
+	lck_mtx_unlock(mbuf_mlock);
+
+	size = round_page_32(i * bufsize);
+	page = kmem_mb_alloc(mb_map, size);
+
+	if (page == 0) {
+		if (bufsize <= m_maxsize(MC_BIGCL)) {
+			/* Try for 1 page if failed, only for 2KB/4KB request */
+			size = NBPG;
+			page = kmem_mb_alloc(mb_map, size);
+		}
+
+		if (page == 0) {
+			lck_mtx_lock(mbuf_mlock);	/* out expects the lock to be held */
+			goto out;
+		}
+	}
+
+	VERIFY(IS_P2ALIGNED(page, NBPG));
+	numpages = size / NBPG;
+
+	/* If auditing is enabled, allocate the audit structures now */
+	if (mclaudit != NULL) {
+		int needed;
+
+		/*
+		 * Yes, I realize this is a waste of memory for clusters
+		 * that never get transformed into mbufs, as we may end
+		 * up with NMBPCL-1 unused audit structures per cluster.
+		 * But doing so tremendously simplifies the allocation
+		 * strategy, since at this point we are not holding the
+		 * mbuf lock and the caller is okay to be blocked.  For
+		 * the case of big clusters, we allocate one structure
+		 * for each as we never turn them into mbufs.
+		 */
+		if (bufsize == m_maxsize(MC_CL)) {
+			needed = numpages * 2 * NMBPCL;	/* two clusters per page, NMBPCL records each */
+
+			i = mcache_alloc_ext(mcl_audit_con_cache,
+			    &con_list, needed, MCR_SLEEP);
+
+			VERIFY(con_list != NULL && i == needed);
+		} else if (bufsize == m_maxsize(MC_BIGCL)) {
+			needed = numpages;
+		} else {
+			needed = numpages / (M16KCLBYTES / NBPG);
+		}
+
+		i = mcache_alloc_ext(mcache_audit_cache,
+		    (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
+
+		VERIFY(mca_list != NULL && i == needed);
+	}
+
+	lck_mtx_lock(mbuf_mlock);
+
+	for (i = 0; i < numpages; i++, page += NBPG) {
+		ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;	/* page index relative to mbutl */
+		ppnum_t new_page = pmap_find_phys(kernel_pmap,
+		    (vm_address_t)page);
+
+		/*
+		 * In the case of no mapper being available the following
+		 * code noops and returns the input page; if there is a
+		 * mapper the appropriate I/O page is returned.
+		 */
+		new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
+		mcl_paddr[offset] = new_page << PGSHIFT;
+
+		/* Pattern-fill this fresh page */
+		if (mclaudit != NULL)
+			mcache_set_pattern(MCACHE_FREE_PATTERN,
+			    (caddr_t)page, NBPG);
+
+		if (bufsize == m_maxsize(MC_CL)) {
+			union mcluster *mcl = (union mcluster *)page;
+
+			/* 1st cluster in the page */
+			sp = slab_get(mcl);
+			if (mclaudit != NULL)
+				mcl_audit_init(mcl, &mca_list, &con_list,
+				    AUDIT_CONTENTS_SIZE, NMBPCL);
+
+			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
+			slab_init(sp, MC_CL, SLF_MAPPED,
+			    mcl, mcl, bufsize, 0, 1);
+
+			/* Insert this slab */
+			slab_insert(sp, MC_CL);
+
+			/* Update stats now since slab_get() drops the lock */
+			mbstat.m_clfree = ++m_infree(MC_CL) +
+			    m_infree(MC_MBUF_CL);
+			mbstat.m_clusters = ++m_total(MC_CL);
+			VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
+
+			/* 2nd cluster in the page */
+			sp = slab_get(++mcl);
+			if (mclaudit != NULL)
+				mcl_audit_init(mcl, &mca_list, &con_list,
+				    AUDIT_CONTENTS_SIZE, NMBPCL);
+
+			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
+			slab_init(sp, MC_CL, SLF_MAPPED,
+			    mcl, mcl, bufsize, 0, 1);
+
+			/* Insert this slab */
+			slab_insert(sp, MC_CL);
+
+			/* Update stats now since slab_get() drops the lock */
+			mbstat.m_clfree = ++m_infree(MC_CL) +
+			    m_infree(MC_MBUF_CL);
+			mbstat.m_clusters = ++m_total(MC_CL);
+			VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
+		} else if (bufsize == m_maxsize(MC_BIGCL)) {
+			union mbigcluster *mbc = (union mbigcluster *)page;
+			mcl_slab_t *nsp;
+
+			/* One for the entire page */
+			sp = slab_get(mbc);
+			if (mclaudit != NULL)
+				mcl_audit_init(mbc, &mca_list, NULL, 0, 1);
+
+			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
+			slab_init(sp, MC_BIGCL, SLF_MAPPED,
+			    mbc, mbc, bufsize, 0, 1);
+
+			/* 2nd cluster's slab is part of the previous one */
+			nsp = slab_get(((union mcluster *)page) + 1);
+			slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL,
+			    mbc, NULL, 0, 0, 0);
+
+			/* Insert this slab */
+			slab_insert(sp, MC_BIGCL);
+
+			/* Update stats now since slab_get() drops the lock */
+			mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
+			    m_infree(MC_MBUF_BIGCL);
+			mbstat.m_bigclusters = ++m_total(MC_BIGCL);
+			VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
+		} else if ((i % (M16KCLBYTES / NBPG)) == 0) {	/* act only on the first page of each 16KB run */
+			union m16kcluster *m16kcl = (union m16kcluster *)page;
+			mcl_slab_t *nsp;
+			int k;
+
+			VERIFY(njcl > 0);
+			/* One for the entire 16KB */
+			sp = slab_get(m16kcl);
+			if (mclaudit != NULL)
+				mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
+
+			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
+			slab_init(sp, MC_16KCL, SLF_MAPPED,
+			    m16kcl, m16kcl, bufsize, 0, 1);
+
+			/* 2nd-8th cluster's slab is part of the first one */
+			for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
+				nsp = slab_get(((union mcluster *)page) + k);
+				VERIFY(nsp->sl_refcnt == 0 &&
+				    nsp->sl_flags == 0);
+				slab_init(nsp, MC_16KCL,
+				    SLF_MAPPED | SLF_PARTIAL,
+				    m16kcl, NULL, 0, 0, 0);
+			}
+
+			/* Insert this slab */
+			slab_insert(sp, MC_16KCL);
+
+			/* Update stats now since slab_get() drops the lock */
+			m_infree(MC_16KCL)++;
+			m_total(MC_16KCL)++;
+			VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
+		}
+	}
+	VERIFY(mca_list == NULL && con_list == NULL);	/* every preallocated audit structure must have been consumed */
+
+	/* We're done; let others enter */
+	mb_clalloc_busy = FALSE;
+	if (mb_clalloc_waiters > 0) {
+		mb_clalloc_waiters = 0;
+		wakeup(mb_clalloc_waitchan);
+	}
+
+	if (bufsize == m_maxsize(MC_CL))
+		return (numpages << 1);	/* two 2KB clusters were set up per page */
+	else if (bufsize == m_maxsize(MC_BIGCL))
+		return (numpages);
+
+	VERIFY(bufsize == m_maxsize(MC_16KCL));
+	return (numpages / (M16KCLBYTES / NBPG));
+
+out:
+	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+
+	/* We're done; let others enter */
+	mb_clalloc_busy = FALSE;
+	if (mb_clalloc_waiters > 0) {
+		mb_clalloc_waiters = 0;
+		wakeup(mb_clalloc_waitchan);
+	}
+
+	/*
+	 * When non-blocking we kick a thread if we have to grow the
+	 * pool or if the number of free clusters is less than requested.
+	 */
+	if (bufsize == m_maxsize(MC_CL)) {
+		if (i > 0) {
+			/*
+			 * Remember total number of clusters needed
+			 * at this time.
+			 */
+			i += m_total(MC_CL);
+			if (i > mbuf_expand_mcl) {
+				mbuf_expand_mcl = i;
+				if (mbuf_worker_ready)
+					wakeup((caddr_t)&mbuf_worker_run);
+			}
+		}
+
+		if (m_infree(MC_CL) >= num)
+			return (1);
+	} else if (bufsize == m_maxsize(MC_BIGCL)) {
+		if (i > 0) {
+			/*
+			 * Remember total number of 4KB clusters needed
+			 * at this time.
+			 */
+			i += m_total(MC_BIGCL);
+			if (i > mbuf_expand_big) {
+				mbuf_expand_big = i;
+				if (mbuf_worker_ready)
+					wakeup((caddr_t)&mbuf_worker_run);
+			}
+		}
+
+		if (m_infree(MC_BIGCL) >= num)
+			return (1);
+	} else {
+		if (i > 0) {
+			/*
+			 * Remember total number of 16KB clusters needed
+			 * at this time.
+			 */
+			i += m_total(MC_16KCL);
+			if (i > mbuf_expand_16k) {
+				mbuf_expand_16k = i;
+				if (mbuf_worker_ready)
+					wakeup((caddr_t)&mbuf_worker_run);
+			}
+		}
+
+		if (m_infree(MC_16KCL) >= num)
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Populate the global freelist of the corresponding buffer class.
+ */
+static int
+freelist_populate(mbuf_class_t class, unsigned int num, int wait)
+{
+	mcache_obj_t *o = NULL;
+	int i;
+
+	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
+	    class == MC_16KCL);
+
+#if CONFIG_MBUF_NOEXPAND
+	if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) {
+#if DEBUG
+		static int printonce = 1;	/* the message below is printed at most once */
+		if (printonce == 1) {
+			printonce = 0;
+			printf("m_expand failed, allocated %ld out of %d "
+			    "clusters\n", mbstat.m_mbufs / NMBPCL,
+			    nmbclusters);
+		}
+#endif /* DEBUG */
+		return (0);
+	}
+#endif /* CONFIG_MBUF_NOEXPAND */
+
+	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+
+	switch (class) {
+	case MC_MBUF:
+	case MC_CL:
+		i = m_clalloc(num, wait, m_maxsize(MC_CL));	/* mbufs are carved out of 2KB clusters below */
+
+		/* Respect the 2K clusters minimum limit */
+		if (m_total(MC_CL) == m_maxlimit(MC_CL) &&
+		    m_infree(MC_CL) <= m_minlimit(MC_CL)) {
+			if (class != MC_CL || (wait & MCR_COMP))
+				return (0);
+		}
+		if (class == MC_CL)
+			return (i != 0);
+		break;	/* only MC_MBUF continues past the switch */
+
+	case MC_BIGCL:
+	case MC_16KCL:
+		return (m_clalloc(num, wait, m_maxsize(class)) != 0);
+		/* NOTREACHED */
+
+	default:
+		VERIFY(0);
+		/* NOTREACHED */
+	}
+
+	/* Steal a cluster and cut it up to create NMBPCL mbufs */
+	if ((o = slab_alloc(MC_CL, wait)) != NULL) {
+		struct mbuf *m = (struct mbuf *)o;
+		mcache_audit_t *mca = NULL;
+		mcl_slab_t *sp = slab_get(o);
+
+		VERIFY(slab_is_detached(sp) &&
+		    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
+
+		/* Make sure that the cluster is unmolested while in freelist */
+		if (mclaudit != NULL) {
+			mca = mcl_audit_buf2mca(MC_CL, o);
+			mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL));
+		}
+
+		/* Reinitialize it as an mbuf slab */
+		slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL,
+		    sp->sl_len, 0, NMBPCL);
+
+		VERIFY(m == (struct mbuf *)sp->sl_base);
+		VERIFY(sp->sl_head == NULL);
+
+		m_total(MC_MBUF) += NMBPCL;
+		mbstat.m_mbufs = m_total(MC_MBUF);
+		m_infree(MC_MBUF) += NMBPCL;
+		mtype_stat_add(MT_FREE, NMBPCL);
+
+		i = NMBPCL;
+		while (i--) {
+			/*
+			 * If auditing is enabled, construct the shadow mbuf
+			 * in the audit structure instead of the actual one.
+			 * mbuf_slab_audit() will take care of restoring the
+			 * contents after the integrity check.
+			 */
+			if (mclaudit != NULL) {
+				struct mbuf *ms;
+				mca = mcl_audit_buf2mca(MC_MBUF,
+				    (mcache_obj_t *)m);
+				ms = ((struct mbuf *)mca->mca_contents);
+				ms->m_type = MT_FREE;
+			} else {
+				m->m_type = MT_FREE;
+			}
+			m->m_next = sp->sl_head;	/* push m on the slab list through m_next ... */
+			sp->sl_head = (void *)m++;	/* ... then step to the next mbuf in the cluster */
+		}
+
+		/* Insert it into the mbuf class's slab list */
+		slab_insert(sp, MC_MBUF);
+
+		if ((i = mb_waiters) > 0)	/* i is reused here as the waiter snapshot */
+			mb_waiters = 0;
+		if (i != 0)
+			wakeup(mb_waitchan);
+
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * (Inaccurately) check if it might be worth a trip back to the
+ * mcache layer due to the availability of objects there.  We'll
+ * end up back here if there's nothing up there.
+ *
+ * For a rudimentary class with MCR_COMP set in wait, the answer comes
+ * from the bucket layer of the related composite cache(s) alone.  In
+ * every other case the bucket layer of the cache of the class itself
+ * is consulted.
+ */
+static boolean_t
+mbuf_cached_above(mbuf_class_t class, int wait)
+{
+	switch (class) {
+	case MC_MBUF:
+		if (wait & MCR_COMP)
+			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
+			    !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
+		break;
+
+	case MC_CL:
+		if (wait & MCR_COMP)
+			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
+		break;
+
+	case MC_BIGCL:
+		if (wait & MCR_COMP)
+			return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
+		break;
+
+	case MC_16KCL:
+		if (wait & MCR_COMP)
+			return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
+		break;
+
+	case MC_MBUF_CL:
+	case MC_MBUF_BIGCL:
+	case MC_MBUF_16KCL:
+		break;
+
+	default:
+		VERIFY(0);
+		/* NOTREACHED */
+	}
+
+	return (!mcache_bkt_isempty(m_cache(class)));
+}
+
+/*
+ * If possible, convert constructed objects to raw ones.
+ */ +static boolean_t +mbuf_steal(mbuf_class_t class, unsigned int num) +{ + mcache_obj_t *top = NULL; + mcache_obj_t **list = ⊤ + unsigned int tot = 0; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + switch (class) { + case MC_MBUF: + case MC_CL: + case MC_BIGCL: + case MC_16KCL: + return (FALSE); + + case MC_MBUF_CL: + case MC_MBUF_BIGCL: + case MC_MBUF_16KCL: + /* Get the required number of constructed objects if possible */ + if (m_infree(class) > m_minlimit(class)) { + tot = cslab_alloc(class, &list, + MIN(num, m_infree(class))); + } + + /* And destroy them to get back the raw objects */ + if (top != NULL) + (void) cslab_free(class, top, 1); + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (tot == num); +} + +static void +m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) +{ + int m, bmap = 0; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); + VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); + VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); + + /* + * This logic can be made smarter; for now, simply mark + * all other related classes as potential victims. + */ + switch (class) { + case MC_MBUF: + m_wantpurge(MC_CL)++; + m_wantpurge(MC_MBUF_CL)++; + m_wantpurge(MC_MBUF_BIGCL)++; + break; + + case MC_CL: + m_wantpurge(MC_MBUF)++; + if (!comp) + m_wantpurge(MC_MBUF_CL)++; + break; + + case MC_BIGCL: + if (!comp) + m_wantpurge(MC_MBUF_BIGCL)++; + break; + + case MC_16KCL: + if (!comp) + m_wantpurge(MC_MBUF_16KCL)++; + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + /* + * Run through each marked class and check if we really need to + * purge (and therefore temporarily disable) the per-CPU caches + * layer used by the class. If so, remember the classes since + * we are going to drop the lock below prior to purging. 
+ */ + for (m = 0; m < NELEM(mbuf_table); m++) { + if (m_wantpurge(m) > 0) { + m_wantpurge(m) = 0; + /* + * Try hard to steal the required number of objects + * from the freelist of other mbuf classes. Only + * purge and disable the per-CPU caches layer when + * we don't have enough; it's the last resort. + */ + if (!mbuf_steal(m, num)) + bmap |= (1 << m); + } + } + + lck_mtx_unlock(mbuf_mlock); + + if (bmap != 0) { + /* drain is performed in pfslowtimo(), to avoid deadlocks */ + do_reclaim = 1; + + /* Sigh; we have no other choices but to ask mcache to purge */ + for (m = 0; m < NELEM(mbuf_table); m++) { + if ((bmap & (1 << m)) && + mcache_purge_cache(m_cache(m))) { + lck_mtx_lock(mbuf_mlock); + m_purge_cnt(m)++; + mbstat.m_drain++; + lck_mtx_unlock(mbuf_mlock); + } + } + } else { + /* + * Request mcache to reap extra elements from all of its caches; + * note that all reaps are serialized and happen only at a fixed + * interval. + */ + mcache_reap(); + } + lck_mtx_lock(mbuf_mlock); +} + +static inline struct mbuf * +m_get_common(int wait, short type, int hdr) +{ + struct mbuf *m; + int mcflags = MSLEEPF(wait); + + /* Is this due to a non-blocking retry? If so, then try harder */ + if (mcflags & MCR_NOSLEEP) + mcflags |= MCR_TRYHARD; + + m = mcache_alloc(m_cache(MC_MBUF), mcflags); + if (m != NULL) { + MBUF_INIT(m, hdr, type); + mtype_stat_inc(type); + mtype_stat_dec(MT_FREE); +#if CONFIG_MACF_NET + if (hdr && mac_init_mbuf(m, wait) != 0) { + m_free(m); + return (NULL); + } +#endif /* MAC_NET */ + } + return (m); +} + +/* + * Space allocation routines; these are also available as macros + * for critical paths. 
+ */
+#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
+#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
+#define	_M_RETRY(wait, type)	_M_GET(wait, type)
+#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
+#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
+#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
+/*
+ * Function form of _M_GET: m_get_common() with a hdr argument of 0.
+ */
+struct mbuf *
+m_get(int wait, int type)
+{
+	return (_M_GET(wait, type));
+}
+/*
+ * Function form of _M_GETHDR: m_get_common() with a hdr argument of 1.
+ */
+struct mbuf *
+m_gethdr(int wait, int type)
+{
+	return (_M_GETHDR(wait, type));
+}
+/*
+ * Same as m_get(); _M_RETRY expands to _M_GET.
+ */
+struct mbuf *
+m_retry(int wait, int type)
+{
+	return (_M_RETRY(wait, type));
+}
+/*
+ * Same as m_gethdr(); _M_RETRYHDR expands to _M_GETHDR.
+ */
+struct mbuf *
+m_retryhdr(int wait, int type)
+{
+	return (_M_RETRYHDR(wait, type));
+}
+/*
+ * Like m_get(), but additionally zeroes MLEN bytes starting at
+ * MTOD(m, caddr_t) when the allocation succeeds.
+ */
+struct mbuf *
+m_getclr(int wait, int type)
+{
+	struct mbuf *m;
+
+	_MGET(m, wait, type);
+	if (m != NULL)
+		bzero(MTOD(m, caddr_t), MLEN);
+	return (m);
+}
+/*
+ * Free a single mbuf and return its successor, i.e. the m_next value
+ * read on entry.  Panics when the mbuf is already marked MT_FREE.
+ *
+ * For an mbuf with M_EXT whose reference count drops to zero there are
+ * two cases.  With no ext flags set, the cluster is released to its
+ * cache (or through the ext_free routine) together with the ext_ref,
+ * and the mbuf then goes back to the MC_MBUF cache.  With
+ * EXTF_COMPOSITE set, the mbuf + cluster pair is returned as a unit to
+ * the matching composite cache and the function returns right away.
+ */
+struct mbuf *
+m_free(struct mbuf *m)
+{
+	struct mbuf *n = m->m_next;	/* m must not be NULL; it is dereferenced here */
+
+	if (m->m_type == MT_FREE)
+		panic("m_free: freeing an already freed mbuf");
+
+	/* Free the aux data and tags if there is any */
+	if (m->m_flags & M_PKTHDR) {
+		m_tag_delete_chain(m, NULL);
+	}
+
+	if (m->m_flags & M_EXT) {
+		u_int32_t refcnt;
+		u_int32_t flags;
+
+		refcnt = m_decref(m);
+		flags = MEXT_FLAGS(m);
+		if (refcnt == 0 && flags == 0) {
+			if (m->m_ext.ext_free == NULL) {	/* NULL ext_free goes to the MC_CL cache */
+				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
+			} else if (m->m_ext.ext_free == m_bigfree) {
+				mcache_free(m_cache(MC_BIGCL),
+				    m->m_ext.ext_buf);
+			} else if (m->m_ext.ext_free == m_16kfree) {
+				mcache_free(m_cache(MC_16KCL),
+				    m->m_ext.ext_buf);
+			} else {
+				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
+				    m->m_ext.ext_size, m->m_ext.ext_arg);
+			}
+			mcache_free(ref_cache, MEXT_RFA(m));
+			MEXT_RFA(m) = NULL;
+		} else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
+			VERIFY(m->m_type != MT_FREE);
+
+			mtype_stat_dec(m->m_type);
+			mtype_stat_inc(MT_FREE);
+
+			m->m_type = MT_FREE;
+			m->m_flags = M_EXT;	/* the pair stays attached, so M_EXT is kept */
+			m->m_len = 0;
+			m->m_next = m->m_nextpkt = NULL;
+
+			/* "Free" into the intermediate cache */
+			if (m->m_ext.ext_free == NULL) {
+				mcache_free(m_cache(MC_MBUF_CL), m);
+			} else if (m->m_ext.ext_free == m_bigfree) {
+				mcache_free(m_cache(MC_MBUF_BIGCL), m);
+			} else {
+				VERIFY(m->m_ext.ext_free == m_16kfree);
+				mcache_free(m_cache(MC_MBUF_16KCL), m);
+			}
+			return (n);
+		}
+	}
+
+	if (m->m_type != MT_FREE) {	/* always true here given the panic check at the top */
+		mtype_stat_dec(m->m_type);
+		mtype_stat_inc(MT_FREE);
+	}
+
+	m->m_type = MT_FREE;
+	m->m_flags = m->m_len = 0;
+	m->m_next = m->m_nextpkt = NULL;
+
+	mcache_free(m_cache(MC_MBUF), m);
+
+	return (n);
+}
+/*
+ * Attach the external buffer extbuf (size extsize, free routine extfree,
+ * argument extarg) to m, allocating a header mbuf of the given type
+ * first when m is NULL.  A cluster that is already attached to m and
+ * whose reference count drops to zero is released the same way m_free()
+ * does it; for a composite mbuf the whole pair is freed and a fresh
+ * mbuf is allocated, so the returned mbuf may differ from m.
+ *
+ * Returns the mbuf that carries the new buffer, or NULL when an
+ * allocation fails.
+ */
+__private_extern__ struct mbuf *
+m_clattach(struct mbuf *m, int type, caddr_t extbuf,
+    void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
+    int wait)
+{
+	struct ext_ref *rfa = NULL;	/* stays NULL unless the old ext_ref can be reused */
+
+	if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
+		return (NULL);
+
+	if (m->m_flags & M_EXT) {
+		u_int32_t refcnt;
+		u_int32_t flags;
+
+		refcnt = m_decref(m);
+		flags = MEXT_FLAGS(m);
+		if (refcnt == 0 && flags == 0) {
+			if (m->m_ext.ext_free == NULL) {
+				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
+			} else if (m->m_ext.ext_free == m_bigfree) {
+				mcache_free(m_cache(MC_BIGCL),
+				    m->m_ext.ext_buf);
+			} else if (m->m_ext.ext_free == m_16kfree) {
+				mcache_free(m_cache(MC_16KCL),
+				    m->m_ext.ext_buf);
+			} else {
+				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
+				    m->m_ext.ext_size, m->m_ext.ext_arg);
+			}
+			/* Re-use the reference structure */
+			rfa = MEXT_RFA(m);
+		} else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
+			VERIFY(m->m_type != MT_FREE);
+
+			mtype_stat_dec(m->m_type);
+			mtype_stat_inc(MT_FREE);
+
+			m->m_type = MT_FREE;
+			m->m_flags = M_EXT;
+			m->m_len = 0;
+			m->m_next = m->m_nextpkt = NULL;
+			/* "Free" into the intermediate cache */
+			if (m->m_ext.ext_free == NULL) {
+				mcache_free(m_cache(MC_MBUF_CL), m);
+			} else if (m->m_ext.ext_free == m_bigfree) {
+				mcache_free(m_cache(MC_MBUF_BIGCL), m);
+			} else {
+				VERIFY(m->m_ext.ext_free == m_16kfree);
+				mcache_free(m_cache(MC_MBUF_16KCL), m);
+			}
+			/*
+			 * Allocate a new mbuf, since we didn't divorce
+			 * the composite mbuf + cluster pair above.
+			 */
+			if ((m = _M_GETHDR(wait, type)) == NULL)
+				return (NULL);
+		}
+	}
+
+	if (rfa == NULL &&
+	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
+		m_free(m);	/* NOTE(review): if m still has M_EXT set with a shared cluster, m_free() calls m_decref() again after the m_decref() above -- confirm this cannot over-release */
+		return (NULL);
+	}
+
+	MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
+
+	return (m);
+}
+
+/*
+ * m_mclget() adds an mbuf cluster to a normal mbuf.
+ *
+ * m is returned in every case; when either the ext_ref or the cluster
+ * allocation fails it is returned without MBUF_CL_INIT having been
+ * applied to it.
+ */
+struct mbuf *
+m_mclget(struct mbuf *m, int wait)
+{
+	struct ext_ref *rfa;
+
+	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
+		return (m);
+
+	m->m_ext.ext_buf = m_mclalloc(wait);
+	if (m->m_ext.ext_buf != NULL) {
+		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
+	} else {
+		mcache_free(ref_cache, rfa);	/* no cluster: give the unused ext_ref back */
+	}
+	return (m);
+}
+
+/* Allocate an mbuf cluster from the MC_CL cache; returns NULL on failure */
+caddr_t
+m_mclalloc(int wait)
+{
+	int mcflags = MSLEEPF(wait);
+
+	/* Is this due to a non-blocking retry?  If so, then try harder */
+	if (mcflags & MCR_NOSLEEP)
+		mcflags |= MCR_TRYHARD;
+
+	return (mcache_alloc(m_cache(MC_CL), mcflags));
+}
+
+/* Free an mbuf cluster back into the MC_CL cache */
+void
+m_mclfree(caddr_t p)
+{
+	mcache_free(m_cache(MC_CL), p);
+}
+
+/*
+ * m_mclhasreference() checks if a cluster of an mbuf is referenced by
+ * another mbuf.  Returns 0 when m has no M_EXT flag, otherwise nonzero
+ * exactly when MEXT_REF(m) is greater than 1.
+ */
+int
+m_mclhasreference(struct mbuf *m)
+{
+	if (!(m->m_flags & M_EXT))
+		return (0);
 
-/* m_mclalloc() allocate an mbuf cluster */
-caddr_t
-m_mclalloc(
-	int nowait)
-{
-	caddr_t p;
-
-	(void)m_clalloc(1, nowait, MCLBYTES, 0);
-	if ((p = (caddr_t)mclfree)) {
-		++mclrefcnt[mtocl(p)];
-		mbstat.m_clfree--;
-		mclfree = ((union mcluster *)p)->mcl_next;
-	} else {
-		mbstat.m_drops++;
-	}
-	MBUF_UNLOCK();
-
-	return p;
+	ASSERT(MEXT_RFA(m) != NULL);
+
+	return (MEXT_REF(m) > 1);
 }
 
-/* m_mclfree() releases a reference to a cluster allocated by MCLALLOC,
- * freeing the cluster if the reference count has reached 0.
*/ -void -m_mclfree( - caddr_t p) +__private_extern__ caddr_t +m_bigalloc(int wait) { - MBUF_LOCK(); + int mcflags = MSLEEPF(wait); - m_range_check(p); + /* Is this due to a non-blocking retry? If so, then try harder */ + if (mcflags & MCR_NOSLEEP) + mcflags |= MCR_TRYHARD; - if (--mclrefcnt[mtocl(p)] == 0) { - ((union mcluster *)(p))->mcl_next = mclfree; - mclfree = (union mcluster *)(p); - mbstat.m_clfree++; - } - MBUF_UNLOCK(); + return (mcache_alloc(m_cache(MC_BIGCL), mcflags)); } -/* mcl_hasreference() checks if a cluster of an mbuf is referenced by another mbuf */ -int -m_mclhasreference( - struct mbuf *m) +__private_extern__ void +m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) { - return (m->m_ext.ext_refs.forward != &(m->m_ext.ext_refs)); + mcache_free(m_cache(MC_BIGCL), p); } -__private_extern__ caddr_t -m_bigalloc(int nowait) -{ - caddr_t p; - - (void)m_clalloc(1, nowait, NBPG, 0); - if ((p = (caddr_t)mbigfree)) { - if (mclrefcnt[mtocl(p)] != mclrefcnt[mtocl(p) + 1]) - panic("m_bigalloc mclrefcnt %x mismatch %d != %d", - p, mclrefcnt[mtocl(p)], mclrefcnt[mtocl(p) + 1]); - if (mclrefcnt[mtocl(p)] || mclrefcnt[mtocl(p) + 1]) - panic("m_bigalloc mclrefcnt %x not null %d != %d", - p, mclrefcnt[mtocl(p)], mclrefcnt[mtocl(p) + 1]); - ++mclrefcnt[mtocl(p)]; - ++mclrefcnt[mtocl(p) + 1]; - mbstat.m_bigclfree--; - mbigfree = ((union mbigcluster *)p)->mbc_next; +/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */ +__private_extern__ struct mbuf * +m_mbigget(struct mbuf *m, int wait) +{ + struct ext_ref *rfa; + + if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) + return (m); + + m->m_ext.ext_buf = m_bigalloc(wait); + if (m->m_ext.ext_buf != NULL) { + MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); } else { - mbstat.m_drops++; + mcache_free(ref_cache, rfa); } - MBUF_UNLOCK(); - return p; + return (m); +} + +__private_extern__ caddr_t +m_16kalloc(int wait) +{ + int mcflags = MSLEEPF(wait); + + /* Is this due to a non-blocking retry? 
If so, then try harder */ + if (mcflags & MCR_NOSLEEP) + mcflags |= MCR_TRYHARD; + + return (mcache_alloc(m_cache(MC_16KCL), mcflags)); } __private_extern__ void -m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) +m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg) { - m_range_check(p); - - if (mclrefcnt[mtocl(p)] != mclrefcnt[mtocl(p) + 1]) - panic("m_bigfree mclrefcnt %x mismatch %d != %d", - p, mclrefcnt[mtocl(p)], mclrefcnt[mtocl(p) + 1]); - --mclrefcnt[mtocl(p)]; - --mclrefcnt[mtocl(p) + 1]; - if (mclrefcnt[mtocl(p)] == 0) { - ((union mbigcluster *)(p))->mbc_next = mbigfree; - mbigfree = (union mbigcluster *)(p); - mbstat.m_bigclfree++; - } + mcache_free(m_cache(MC_16KCL), p); } -/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */ +/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */ __private_extern__ struct mbuf * -m_mbigget(struct mbuf *m, int nowait) +m_m16kget(struct mbuf *m, int wait) { - m->m_ext.ext_buf = m_bigalloc(nowait); - if (m->m_ext.ext_buf) { - m->m_data = m->m_ext.ext_buf; - m->m_flags |= M_EXT; - m->m_ext.ext_size = NBPG; - m->m_ext.ext_free = m_bigfree; - m->m_ext.ext_arg = 0; - m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = - &m->m_ext.ext_refs; + struct ext_ref *rfa; + + if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) + return (m); + + m->m_ext.ext_buf = m_16kalloc(wait); + if (m->m_ext.ext_buf != NULL) { + MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); + } else { + mcache_free(ref_cache, rfa); } - - return m; + return (m); } - /* */ void -m_copy_pkthdr( - struct mbuf *to, - struct mbuf *from) +m_copy_pkthdr(struct mbuf *to, struct mbuf *from) { - to->m_pkthdr = from->m_pkthdr; - from->m_pkthdr.aux = (struct mbuf *)NULL; - SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ +#if CONFIG_MACF_NET + /* We will be taking over the tags of 'to' */ + if (to->m_flags & M_PKTHDR) + m_tag_delete_chain(to, NULL); +#endif /* MAC_NET */ + to->m_pkthdr = from->m_pkthdr; /* especially 
tags */ + m_tag_init(from); /* purge tags from src */ to->m_flags = from->m_flags & M_COPYFLAGS; to->m_data = (to)->m_pktdat; } -/* - * "Move" mbuf pkthdr from "from" to "to". - * "from" must have M_PKTHDR set, and "to" must be empty. - */ -#ifndef __APPLE__ -void -m_move_pkthdr(struct mbuf *to, struct mbuf *from) -{ - KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster")); - - to->m_flags = from->m_flags & M_COPYFLAGS; - to->m_data = to->m_pktdat; - to->m_pkthdr = from->m_pkthdr; /* especially tags */ - SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ - from->m_flags &= ~M_PKTHDR; -} -#endif - /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. @@ -813,353 +3023,501 @@ m_move_pkthdr(struct mbuf *to, struct mbuf *from) static int m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) { - to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); - if ((to->m_flags & M_EXT) == 0) - to->m_data = to->m_pktdat; - if (to->m_pkthdr.aux != NULL) - m_freem(to->m_pkthdr.aux); - to->m_pkthdr = from->m_pkthdr; - to->m_pkthdr.aux = NULL; - (void) m_aux_copy(to, from); - SLIST_INIT(&to->m_pkthdr.tags); - return (m_tag_copy_chain(to, from, how)); +#if CONFIG_MACF_NET + if (to->m_flags & M_PKTHDR) + m_tag_delete_chain(to, NULL); +#endif /* MAC_NET */ + to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + if ((to->m_flags & M_EXT) == 0) + to->m_data = to->m_pktdat; + to->m_pkthdr = from->m_pkthdr; + m_tag_init(to); + return (m_tag_copy_chain(to, from, how)); } /* - * return a list of mbuf hdrs that point to clusters... - * try for num_needed, if wantall is not set, return whatever - * number were available... set up the first num_with_pkthdrs - * with mbuf hdrs configured as packet headers... these are - * chained on the m_nextpkt field... any packets requested beyond - * this are chained onto the last packet header's m_next field. 
- * The size of the cluster is controlled by the paramter bufsize. + * Return a list of mbuf hdrs that point to clusters. Try for num_needed; + * if wantall is not set, return whatever number were available. Set up the + * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these + * are chained on the m_nextpkt field. Any packets requested beyond this + * are chained onto the last packet header's m_next field. The size of + * the cluster is controlled by the parameter bufsize. */ __private_extern__ struct mbuf * -m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, int how, int wantall, size_t bufsize) +m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, + int wait, int wantall, size_t bufsize) { struct mbuf *m; struct mbuf **np, *top; - unsigned int num, needed = *num_needed; - - if (bufsize != MCLBYTES && bufsize != NBPG) - return 0; - + unsigned int pnum, needed = *num_needed; + mcache_obj_t *mp_list = NULL; + int mcflags = MSLEEPF(wait); + u_int32_t flag; + struct ext_ref *rfa; + mcache_t *cp; + void *cl; + + ASSERT(bufsize == m_maxsize(MC_CL) || + bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); + + /* + * Caller must first check for njcl because this + * routine is internal and not exposed/used via KPI. + */ + VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0); + top = NULL; np = ⊤ - - (void)m_clalloc(needed, how, bufsize, 0); /* takes the MBUF_LOCK, but doesn't release it... 
*/ - - for (num = 0; num < needed; num++) { - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); - - if (mfree && ((bufsize == NBPG && mbigfree) || (bufsize == MCLBYTES && mclfree))) { - /* mbuf + cluster are available */ - m = mfree; - MCHECK(m); - mfree = m->m_next; - ++mclrefcnt[mtocl(m)]; - mbstat.m_mtypes[MT_FREE]--; - mbstat.m_mtypes[MT_DATA]++; - if (bufsize == NBPG) { - m->m_ext.ext_buf = (caddr_t)mbigfree; /* get the big cluster */ - ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; - ++mclrefcnt[mtocl(m->m_ext.ext_buf) + 1]; - mbstat.m_bigclfree--; - mbigfree = ((union mbigcluster *)(m->m_ext.ext_buf))->mbc_next; - m->m_ext.ext_free = m_bigfree; - m->m_ext.ext_size = NBPG; - } else { - m->m_ext.ext_buf = (caddr_t)mclfree; /* get the cluster */ - ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; - mbstat.m_clfree--; - mclfree = ((union mcluster *)(m->m_ext.ext_buf))->mcl_next; - m->m_ext.ext_free = 0; - m->m_ext.ext_size = MCLBYTES; - } - m->m_ext.ext_arg = 0; - m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = &m->m_ext.ext_refs; - m->m_next = m->m_nextpkt = 0; - m->m_type = MT_DATA; - m->m_data = m->m_ext.ext_buf; - m->m_len = 0; + pnum = 0; - if (num_with_pkthdrs == 0) - m->m_flags = M_EXT; - else { - m->m_flags = M_PKTHDR | M_EXT; - _M_CLEAR_PKTHDR(m); - - num_with_pkthdrs--; - } + /* + * The caller doesn't want all the requested buffers; only some. + * Try hard to get what we can, but don't block. This effectively + * overrides MCR_SLEEP, since this thread will not go to sleep + * if we can't get all the buffers. 
+ */ + if (!wantall || (mcflags & MCR_NOSLEEP)) + mcflags |= MCR_TRYHARD; + + /* Allocate the composite mbuf + cluster elements from the cache */ + if (bufsize == m_maxsize(MC_CL)) + cp = m_cache(MC_MBUF_CL); + else if (bufsize == m_maxsize(MC_BIGCL)) + cp = m_cache(MC_MBUF_BIGCL); + else + cp = m_cache(MC_MBUF_16KCL); + needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags); + + for (pnum = 0; pnum < needed; pnum++) { + m = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + + VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); + cl = m->m_ext.ext_buf; + rfa = MEXT_RFA(m); + + ASSERT(cl != NULL && rfa != NULL); + VERIFY(MBUF_IS_COMPOSITE(m)); + + flag = MEXT_FLAGS(m); + + MBUF_INIT(m, num_with_pkthdrs, MT_DATA); + if (bufsize == m_maxsize(MC_16KCL)) { + MBUF_16KCL_INIT(m, cl, rfa, 1, flag); + } else if (bufsize == m_maxsize(MC_BIGCL)) { + MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); } else { - MBUF_UNLOCK(); - - if (num_with_pkthdrs == 0) { - MGET(m, how, MT_DATA ); - } else { - MGETHDR(m, how, MT_DATA); - - num_with_pkthdrs--; - } - if (m == 0) - goto fail; - - if (bufsize == NBPG) - m = m_mbigget(m, how); - else - m = m_mclget(m, how); - if ((m->m_flags & M_EXT) == 0) { + MBUF_CL_INIT(m, cl, rfa, 1, flag); + } + + if (num_with_pkthdrs > 0) { + --num_with_pkthdrs; +#if CONFIG_MACF_NET + if (mac_mbuf_label_init(m, wait) != 0) { m_free(m); - goto fail; + break; } - MBUF_LOCK(); +#endif /* MAC_NET */ } - *np = m; - - if (num_with_pkthdrs) + + *np = m; + if (num_with_pkthdrs > 0) np = &m->m_nextpkt; else np = &m->m_next; } - MBUF_UNLOCK(); - - *num_needed = num; - return (top); -fail: - if (wantall && top) { - m_freem(top); - return 0; + ASSERT(pnum != *num_needed || mp_list == NULL); + if (mp_list != NULL) + mcache_free_ext(cp, mp_list); + + if (pnum > 0) { + mtype_stat_add(MT_DATA, pnum); + mtype_stat_sub(MT_FREE, pnum); + } + + if (wantall && (pnum != *num_needed)) { + if (top != NULL) + m_freem_list(top); + return (NULL); } - return top; -} + *num_needed = pnum; 
+ return (top); +} /* - * Return list of mbuf linked by m_nextpkt - * Try for num_needed, and if wantall is not set, return whatever - * number were available - * The size of each mbuf in the list is controlled by the parameter packetlen. - * Each mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf in - * the chain is called a segment. - * If maxsegments is not null and the value pointed to is not null, this specify - * the maximum number of segments for a chain of mbufs. - * If maxsegments is zero or the value pointed to is zero the - * caller does not have any restriction on the number of segments. - * The actual number of segments of a mbuf chain is return in the value pointed - * to by maxsegments. - * When possible the allocation is done under a single lock. + * Return list of mbuf linked by m_nextpkt. Try for numlist, and if + * wantall is not set, return whatever number were available. The size of + * each mbuf in the list is controlled by the parameter packetlen. Each + * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf + * in the chain is called a segment. If maxsegments is not null and the + * value pointed to is not null, this specify the maximum number of segments + * for a chain of mbufs. If maxsegments is zero or the value pointed to + * is zero the caller does not have any restriction on the number of segments. + * The actual number of segments of a mbuf chain is return in the value + * pointed to by maxsegments. 
*/ - __private_extern__ struct mbuf * -m_allocpacket_internal(unsigned int *num_needed, size_t packetlen, unsigned int * maxsegments, - int how, int wantall, size_t wantsize) +m_allocpacket_internal(unsigned int *numlist, size_t packetlen, + unsigned int *maxsegments, int wait, int wantall, size_t wantsize) { - struct mbuf **np, *top; - size_t bufsize; - unsigned int num; - unsigned int numchunks = 0; + struct mbuf **np, *top, *first = NULL; + size_t bufsize, r_bufsize; + unsigned int num = 0; + unsigned int nsegs = 0; + unsigned int needed, resid; + int mcflags = MSLEEPF(wait); + mcache_obj_t *mp_list = NULL, *rmp_list = NULL; + mcache_t *cp = NULL, *rcp = NULL; + + if (*numlist == 0) + return (NULL); top = NULL; np = ⊤ - + if (wantsize == 0) { - if (packetlen <= MINCLSIZE) + if (packetlen <= MINCLSIZE) { bufsize = packetlen; - else if (packetlen > MCLBYTES) - bufsize = NBPG; - else - bufsize = MCLBYTES; - } else if (wantsize == MCLBYTES || wantsize == NBPG) + } else if (packetlen > m_maxsize(MC_CL)) { + /* Use 4KB if jumbo cluster pool isn't available */ + if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) + bufsize = m_maxsize(MC_BIGCL); + else + bufsize = m_maxsize(MC_16KCL); + } else { + bufsize = m_maxsize(MC_CL); + } + } else if (wantsize == m_maxsize(MC_CL) || + wantsize == m_maxsize(MC_BIGCL) || + (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) { bufsize = wantsize; - else - return 0; + } else { + return (NULL); + } if (bufsize <= MHLEN) { - numchunks = 1; + nsegs = 1; } else if (bufsize <= MINCLSIZE) { if (maxsegments != NULL && *maxsegments == 1) { - bufsize = MCLBYTES; - numchunks = 1; + bufsize = m_maxsize(MC_CL); + nsegs = 1; } else { - numchunks = 2; + nsegs = 2; } - } else if (bufsize == NBPG) { - numchunks = ((packetlen - 1) >> PGSHIFT) + 1; + } else if (bufsize == m_maxsize(MC_16KCL)) { + VERIFY(njcl > 0); + nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1; + } else if (bufsize == m_maxsize(MC_BIGCL)) { + nsegs = ((packetlen - 1) >> PGSHIFT) + 1; } 
else { - numchunks = ((packetlen - 1) >> MCLSHIFT) + 1; + nsegs = ((packetlen - 1) >> MCLSHIFT) + 1; } if (maxsegments != NULL) { - if (*maxsegments && numchunks > *maxsegments) { - *maxsegments = numchunks; - return 0; + if (*maxsegments && nsegs > *maxsegments) { + *maxsegments = nsegs; + return (NULL); } - *maxsegments = numchunks; + *maxsegments = nsegs; } - /* m_clalloc takes the MBUF_LOCK, but do not release it */ - (void)m_clalloc(numchunks, how, (bufsize == NBPG) ? NBPG : MCLBYTES, 0); - for (num = 0; num < *num_needed; num++) { - struct mbuf **nm, *pkt = 0; - size_t len; - nm = &pkt; + /* + * The caller doesn't want all the requested buffers; only some. + * Try hard to get what we can, but don't block. This effectively + * overrides MCR_SLEEP, since this thread will not go to sleep + * if we can't get all the buffers. + */ + if (!wantall || (mcflags & MCR_NOSLEEP)) + mcflags |= MCR_TRYHARD; + + /* + * Simple case where all elements in the lists/chains are mbufs. + * Unless bufsize is greater than MHLEN, each segment chain is made + * up of exactly 1 mbuf. Otherwise, each segment chain is made up + * of 2 mbufs; the second one is used for the residual data, i.e. + * the remaining data that cannot fit into the first mbuf. + */ + if (bufsize <= MINCLSIZE) { + /* Allocate the elements in one shot from the mbuf cache */ + ASSERT(bufsize <= MHLEN || nsegs == 2); + cp = m_cache(MC_MBUF); + needed = mcache_alloc_ext(cp, &mp_list, + (*numlist) * nsegs, mcflags); + + /* + * The number of elements must be even if we are to use an + * mbuf (instead of a cluster) to store the residual data. + * If we couldn't allocate the requested number of mbufs, + * trim the number down (if it's odd) in order to avoid + * creating a partial segment chain. 
+ */ + if (bufsize > MHLEN && (needed & 0x1)) + needed--; - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); + while (num < needed) { + struct mbuf *m; - for (len = 0; len < packetlen; ) { - struct mbuf *m = NULL; + m = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + ASSERT(m != NULL); - if (wantsize == 0 && packetlen > MINCLSIZE) { - if (packetlen - len > MCLBYTES) - bufsize = NBPG; - else - bufsize = MCLBYTES; + MBUF_INIT(m, 1, MT_DATA); +#if CONFIG_MACF_NET + if (mac_init_mbuf(m, wait) != 0) { + m_free(m); + break; } - len += bufsize; - - if (mfree && ((bufsize == NBPG && mbigfree) || (bufsize == MCLBYTES && mclfree))) { - /* mbuf + cluster are available */ - m = mfree; - MCHECK(m); - mfree = m->m_next; - ++mclrefcnt[mtocl(m)]; - mbstat.m_mtypes[MT_FREE]--; - mbstat.m_mtypes[MT_DATA]++; - if (bufsize == NBPG) { - m->m_ext.ext_buf = (caddr_t)mbigfree; /* get the big cluster */ - ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; - ++mclrefcnt[mtocl(m->m_ext.ext_buf) + 1]; - mbstat.m_bigclfree--; - mbigfree = ((union mbigcluster *)(m->m_ext.ext_buf))->mbc_next; - m->m_ext.ext_free = m_bigfree; - m->m_ext.ext_size = NBPG; - } else { - m->m_ext.ext_buf = (caddr_t)mclfree; /* get the cluster */ - ++mclrefcnt[mtocl(m->m_ext.ext_buf)]; - mbstat.m_clfree--; - mclfree = ((union mcluster *)(m->m_ext.ext_buf))->mcl_next; - m->m_ext.ext_free = 0; - m->m_ext.ext_size = MCLBYTES; - } - m->m_ext.ext_arg = 0; - m->m_ext.ext_refs.forward = m->m_ext.ext_refs.backward = &m->m_ext.ext_refs; - m->m_next = m->m_nextpkt = 0; - m->m_type = MT_DATA; - m->m_data = m->m_ext.ext_buf; - m->m_len = 0; - - if (pkt == 0) { - pkt = m; - m->m_flags = M_PKTHDR | M_EXT; - _M_CLEAR_PKTHDR(m); - } else { - m->m_flags = M_EXT; - } - } else { - MBUF_UNLOCK(); - - if (pkt == 0) { - MGETHDR(m, how, MT_DATA); - } else { - MGET(m, how, MT_DATA ); - } - if (m == 0) { - m_freem(pkt); - goto fail; - } - if (bufsize <= MINCLSIZE) { - if (bufsize > MHLEN) { - MGET(m->m_next, how, 
MT_DATA); - if (m->m_next == 0) { - m_free(m); - m_freem(pkt); - goto fail; - } - } - } else { - if (bufsize == NBPG) - m = m_mbigget(m, how); - else - m = m_mclget(m, how); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - m_freem(pkt); - goto fail; - } - } - MBUF_LOCK(); +#endif /* MAC_NET */ + num++; + if (bufsize > MHLEN) { + /* A second mbuf for this segment chain */ + m->m_next = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + ASSERT(m->m_next != NULL); + + MBUF_INIT(m->m_next, 0, MT_DATA); + num++; } - *nm = m; - nm = &m->m_next; + *np = m; + np = &m->m_nextpkt; + } + ASSERT(num != *numlist || mp_list == NULL); + + if (num > 0) { + mtype_stat_add(MT_DATA, num); + mtype_stat_sub(MT_FREE, num); + } + num /= nsegs; + + /* We've got them all; return to caller */ + if (num == *numlist) + return (top); + + goto fail; + } + + /* + * Complex cases where elements are made up of one or more composite + * mbufs + cluster, depending on packetlen. Each N-segment chain can + * be illustrated as follows: + * + * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N] + * + * Every composite mbuf + cluster element comes from the intermediate + * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency, + * the last composite element will come from the MC_MBUF_CL cache, + * unless the residual data is larger than 2KB where we use the + * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual + * data is defined as extra data beyond the first element that cannot + * fit into the previous element, i.e. there is no residual data if + * the chain only has 1 segment. + */ + r_bufsize = bufsize; + resid = packetlen > bufsize ? packetlen % bufsize : 0; + if (resid > 0) { + /* There is residual data; figure out the cluster size */ + if (wantsize == 0 && packetlen > MINCLSIZE) { + /* + * Caller didn't request that all of the segments + * in the chain use the same cluster size; use the + * smaller of the cluster sizes. 
+ */ + if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) + r_bufsize = m_maxsize(MC_16KCL); + else if (resid > m_maxsize(MC_CL)) + r_bufsize = m_maxsize(MC_BIGCL); + else + r_bufsize = m_maxsize(MC_CL); + } else { + /* Use the same cluster size as the other segments */ + resid = 0; + } + } + + needed = *numlist; + if (resid > 0) { + /* + * Attempt to allocate composite mbuf + cluster elements for + * the residual data in each chain; record the number of such + * elements that can be allocated so that we know how many + * segment chains we can afford to create. + */ + if (r_bufsize <= m_maxsize(MC_CL)) + rcp = m_cache(MC_MBUF_CL); + else if (r_bufsize <= m_maxsize(MC_BIGCL)) + rcp = m_cache(MC_MBUF_BIGCL); + else + rcp = m_cache(MC_MBUF_16KCL); + needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags); + + if (needed == 0) + goto fail; + + /* This is temporarily reduced for calculation */ + ASSERT(nsegs > 1); + nsegs--; + } + + /* + * Attempt to allocate the rest of the composite mbuf + cluster + * elements for the number of segment chains that we need. + */ + if (bufsize <= m_maxsize(MC_CL)) + cp = m_cache(MC_MBUF_CL); + else if (bufsize <= m_maxsize(MC_BIGCL)) + cp = m_cache(MC_MBUF_BIGCL); + else + cp = m_cache(MC_MBUF_16KCL); + needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags); + + /* Round it down to avoid creating a partial segment chain */ + needed = (needed / nsegs) * nsegs; + if (needed == 0) + goto fail; + + if (resid > 0) { + /* + * We're about to construct the chain(s); take into account + * the number of segments we have created above to hold the + * residual data for each chain, as well as restore the + * original count of segments per chain. 
+ */ + ASSERT(nsegs > 0); + needed += needed / nsegs; + nsegs++; + } + + for (;;) { + struct mbuf *m; + u_int32_t flag; + struct ext_ref *rfa; + void *cl; + int pkthdr; + + ++num; + if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) { + m = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + } else { + m = (struct mbuf *)rmp_list; + rmp_list = rmp_list->obj_next; + } + ASSERT(m != NULL); + VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); + VERIFY(m->m_ext.ext_free == NULL || + m->m_ext.ext_free == m_bigfree || + m->m_ext.ext_free == m_16kfree); + + cl = m->m_ext.ext_buf; + rfa = MEXT_RFA(m); + + ASSERT(cl != NULL && rfa != NULL); + VERIFY(MBUF_IS_COMPOSITE(m)); + + flag = MEXT_FLAGS(m); + + pkthdr = (nsegs == 1 || (num % nsegs) == 1); + if (pkthdr) + first = m; + MBUF_INIT(m, pkthdr, MT_DATA); + if (m->m_ext.ext_free == m_16kfree) { + MBUF_16KCL_INIT(m, cl, rfa, 1, flag); + } else if (m->m_ext.ext_free == m_bigfree) { + MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); + } else { + MBUF_CL_INIT(m, cl, rfa, 1, flag); + } +#if CONFIG_MACF_NET + if (pkthdr && mac_init_mbuf(m, wait) != 0) { + --num; + m_free(m); + break; } - *np = pkt; - np = &pkt->m_nextpkt; +#endif /* MAC_NET */ + + *np = m; + if ((num % nsegs) == 0) + np = &first->m_nextpkt; + else + np = &m->m_next; + + if (num == needed) + break; + } + + if (num > 0) { + mtype_stat_add(MT_DATA, num); + mtype_stat_sub(MT_FREE, num); } - MBUF_UNLOCK(); - *num_needed = num; - - return top; + + num /= nsegs; + + /* We've got them all; return to caller */ + if (num == *numlist) { + ASSERT(mp_list == NULL && rmp_list == NULL); + return (top); + } + fail: - if (wantall && top) { + /* Free up what's left of the above */ + if (mp_list != NULL) + mcache_free_ext(cp, mp_list); + if (rmp_list != NULL) + mcache_free_ext(rcp, rmp_list); + if (wantall && top != NULL) { m_freem(top); - return 0; + return (NULL); } - *num_needed = num; - - return top; + *numlist = num; + return (top); } - -/* Best effort to get a mbuf cluster + 
pkthdr under one lock. - * If we don't have them avail, just bail out and use the regular - * path. - * Used by drivers to allocated packets on receive ring. +/* + * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated + * packets on receive ring. */ __private_extern__ struct mbuf * -m_getpacket_how(int how) +m_getpacket_how(int wait) { unsigned int num_needed = 1; - - return m_getpackets_internal(&num_needed, 1, how, 1, MCLBYTES); + + return (m_getpackets_internal(&num_needed, 1, wait, 1, + m_maxsize(MC_CL))); } -/* Best effort to get a mbuf cluster + pkthdr under one lock. - * If we don't have them avail, just bail out and use the regular - * path. - * Used by drivers to allocated packets on receive ring. +/* + * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated + * packets on receive ring. */ struct mbuf * m_getpacket(void) { unsigned int num_needed = 1; - return m_getpackets_internal(&num_needed, 1, M_WAITOK, 1, MCLBYTES); + return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1, + m_maxsize(MC_CL))); } - /* - * return a list of mbuf hdrs that point to clusters... - * try for num_needed, if this can't be met, return whatever - * number were available... set up the first num_with_pkthdrs - * with mbuf hdrs configured as packet headers... these are - * chained on the m_nextpkt field... any packets requested beyond - * this are chained onto the last packet header's m_next field. + * Return a list of mbuf hdrs that point to clusters. Try for num_needed; + * if this can't be met, return whatever number were available. Set up the + * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These + * are chained on the m_nextpkt field. Any packets requested beyond this are + * chained onto the last packet header's m_next field. 
*/ struct mbuf * m_getpackets(int num_needed, int num_with_pkthdrs, int how) { unsigned int n = num_needed; - - return m_getpackets_internal(&n, num_with_pkthdrs, how, 0, MCLBYTES); -} + return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0, + m_maxsize(MC_CL))); +} /* - * return a list of mbuf hdrs set up as packet hdrs - * chained together on the m_nextpkt field + * Return a list of mbuf hdrs set up as packet hdrs chained together + * on the m_nextpkt field */ struct mbuf * m_getpackethdrs(int num_needed, int how) @@ -1170,168 +3528,202 @@ m_getpackethdrs(int num_needed, int how) top = NULL; np = ⊤ - MBUF_LOCK(); - while (num_needed--) { - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); - - if ((m = mfree)) { /* mbufs are available */ - MCHECK(m); - mfree = m->m_next; - ++mclrefcnt[mtocl(m)]; - mbstat.m_mtypes[MT_FREE]--; - mbstat.m_mtypes[MT_DATA]++; - - m->m_next = m->m_nextpkt = 0; - m->m_type = MT_DATA; - m->m_flags = M_PKTHDR; - m->m_len = 0; - m->m_data = m->m_pktdat; - _M_CLEAR_PKTHDR(m); - - } else { - - MBUF_UNLOCK(); - m = m_retryhdr(how, MT_DATA); - if (m == 0) - return(top); - MBUF_LOCK(); - } - *np = m; - np = &m->m_nextpkt; - } - MBUF_UNLOCK(); + m = _M_RETRYHDR(how, MT_DATA); + if (m == NULL) + break; + + *np = m; + np = &m->m_nextpkt; + } return (top); } - -/* free and mbuf list (m_nextpkt) while following m_next under one lock. - * returns the count for mbufs packets freed. Used by the drivers. +/* + * Free an mbuf list (m_nextpkt) while following m_next. Returns the count + * for mbufs packets freed. Used by the drivers. 
*/ -int -m_freem_list( - struct mbuf *m) +int +m_freem_list(struct mbuf *m) { struct mbuf *nextpkt; - int i, count=0; - - MBUF_LOCK(); + mcache_obj_t *mp_list = NULL; + mcache_obj_t *mcl_list = NULL; + mcache_obj_t *mbc_list = NULL; + mcache_obj_t *m16k_list = NULL; + mcache_obj_t *m_mcl_list = NULL; + mcache_obj_t *m_mbc_list = NULL; + mcache_obj_t *m_m16k_list = NULL; + mcache_obj_t *ref_list = NULL; + int pktcount = 0; + int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0; + + while (m != NULL) { + pktcount++; + + nextpkt = m->m_nextpkt; + m->m_nextpkt = NULL; + + while (m != NULL) { + struct mbuf *next = m->m_next; + mcache_obj_t *o, *rfa; + u_int32_t refcnt, flags; - while (m) { - if (m) - nextpkt = m->m_nextpkt; /* chain of linked mbufs from driver */ - else - nextpkt = 0; - - count++; - - while (m) { /* free the mbuf chain (like mfreem) */ - - struct mbuf *n; + if (m->m_type == MT_FREE) + panic("m_free: freeing an already freed mbuf"); - m_range_check(m); - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); - + if (m->m_type != MT_FREE) + mt_free++; - /* Free the aux data if there is any */ - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.aux) { - /* - * Treat the current m as the nextpkt and set m - * to the aux data. Preserve nextpkt in m->m_nextpkt. - * This lets us free the aux data in this loop - * without having to call m_freem recursively, - * which wouldn't work because we've still got - * the lock. 
- */ - m->m_nextpkt = nextpkt; - nextpkt = m; - m = nextpkt->m_pkthdr.aux; - nextpkt->m_pkthdr.aux = NULL; - } - - if ((m->m_flags & M_PKTHDR) != 0 && !SLIST_EMPTY(&m->m_pkthdr.tags)) { - /* A quick (albeit inefficient) expedient */ - MBUF_UNLOCK(); + if (m->m_flags & M_PKTHDR) { m_tag_delete_chain(m, NULL); - MBUF_LOCK(); } - n = m->m_next; - - if (n && n->m_nextpkt) - panic("m_freem_list: m_nextpkt of m_next != NULL"); - if (m->m_type == MT_FREE) - panic("freeing free mbuf"); + if (!(m->m_flags & M_EXT)) + goto simple_free; + + o = (mcache_obj_t *)m->m_ext.ext_buf; + refcnt = m_decref(m); + flags = MEXT_FLAGS(m); + if (refcnt == 0 && flags == 0) { + if (m->m_ext.ext_free == NULL) { + o->obj_next = mcl_list; + mcl_list = o; + } else if (m->m_ext.ext_free == m_bigfree) { + o->obj_next = mbc_list; + mbc_list = o; + } else if (m->m_ext.ext_free == m_16kfree) { + o->obj_next = m16k_list; + m16k_list = o; + } else { + (*(m->m_ext.ext_free))((caddr_t)o, + m->m_ext.ext_size, + m->m_ext.ext_arg); + } + rfa = (mcache_obj_t *)MEXT_RFA(m); + rfa->obj_next = ref_list; + ref_list = rfa; + MEXT_RFA(m) = NULL; + } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { + VERIFY(m->m_type != MT_FREE); + /* + * Amortize the costs of atomic operations + * by doing them at the end, if possible. 
+ */ + if (m->m_type == MT_DATA) + mt_data++; + else if (m->m_type == MT_HEADER) + mt_header++; + else if (m->m_type == MT_SONAME) + mt_soname++; + else if (m->m_type == MT_TAG) + mt_tag++; + else + mtype_stat_dec(m->m_type); - if (m->m_flags & M_EXT) { - if (MCLHASREFERENCE(m)) { - remque((queue_t)&m->m_ext.ext_refs); - } else if (m->m_ext.ext_free == NULL) { - union mcluster *mcl= (union mcluster *)m->m_ext.ext_buf; - - m_range_check(mcl); - - if (_MCLUNREF(mcl)) { - mcl->mcl_next = mclfree; - mclfree = mcl; - ++mbstat.m_clfree; - } + m->m_type = MT_FREE; + m->m_flags = M_EXT; + m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; + + /* "Free" into the intermediate cache */ + o = (mcache_obj_t *)m; + if (m->m_ext.ext_free == NULL) { + o->obj_next = m_mcl_list; + m_mcl_list = o; + } else if (m->m_ext.ext_free == m_bigfree) { + o->obj_next = m_mbc_list; + m_mbc_list = o; } else { - (*(m->m_ext.ext_free))(m->m_ext.ext_buf, - m->m_ext.ext_size, m->m_ext.ext_arg); + VERIFY(m->m_ext.ext_free == m_16kfree); + o->obj_next = m_m16k_list; + m_m16k_list = o; } + m = next; + continue; } - mbstat.m_mtypes[m->m_type]--; - (void) _MCLUNREF(m); - _MFREE_MUNGE(m); - mbstat.m_mtypes[MT_FREE]++; +simple_free: + /* + * Amortize the costs of atomic operations + * by doing them at the end, if possible. 
+ */ + if (m->m_type == MT_DATA) + mt_data++; + else if (m->m_type == MT_HEADER) + mt_header++; + else if (m->m_type == MT_SONAME) + mt_soname++; + else if (m->m_type == MT_TAG) + mt_tag++; + else if (m->m_type != MT_FREE) + mtype_stat_dec(m->m_type); + m->m_type = MT_FREE; - m->m_flags = 0; - m->m_len = 0; - m->m_next = mfree; - mfree = m; - m = n; - } - m = nextpkt; /* bump m with saved nextpkt if any */ - } - if ((i = m_want)) - m_want = 0; + m->m_flags = m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; - MBUF_UNLOCK(); + ((mcache_obj_t *)m)->obj_next = mp_list; + mp_list = (mcache_obj_t *)m; + + m = next; + } - if (i) - wakeup((caddr_t)&mfree); + m = nextpkt; + } - return (count); + if (mt_free > 0) + mtype_stat_add(MT_FREE, mt_free); + if (mt_data > 0) + mtype_stat_sub(MT_DATA, mt_data); + if (mt_header > 0) + mtype_stat_sub(MT_HEADER, mt_header); + if (mt_soname > 0) + mtype_stat_sub(MT_SONAME, mt_soname); + if (mt_tag > 0) + mtype_stat_sub(MT_TAG, mt_tag); + + if (mp_list != NULL) + mcache_free_ext(m_cache(MC_MBUF), mp_list); + if (mcl_list != NULL) + mcache_free_ext(m_cache(MC_CL), mcl_list); + if (mbc_list != NULL) + mcache_free_ext(m_cache(MC_BIGCL), mbc_list); + if (m16k_list != NULL) + mcache_free_ext(m_cache(MC_16KCL), m16k_list); + if (m_mcl_list != NULL) + mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list); + if (m_mbc_list != NULL) + mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list); + if (m_m16k_list != NULL) + mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list); + if (ref_list != NULL) + mcache_free_ext(ref_cache, ref_list); + + return (pktcount); } void -m_freem( - struct mbuf *m) +m_freem(struct mbuf *m) { - while (m) + while (m != NULL) m = m_free(m); } /* * Mbuffer utility routines. */ + /* - * Compute the amount of space available - * before the current start of data in an mbuf. + * Compute the amount of space available before the current start + * of data in an mbuf. 
*/ int -m_leadingspace( - struct mbuf *m) +m_leadingspace(struct mbuf *m) { if (m->m_flags & M_EXT) { if (MCLHASREFERENCE(m)) - return(0); + return (0); return (m->m_data - m->m_ext.ext_buf); } if (m->m_flags & M_PKTHDR) @@ -1340,40 +3732,33 @@ m_leadingspace( } /* - * Compute the amount of space available - * after the end of data in an mbuf. + * Compute the amount of space available after the end of data in an mbuf. */ int -m_trailingspace( - struct mbuf *m) +m_trailingspace(struct mbuf *m) { if (m->m_flags & M_EXT) { if (MCLHASREFERENCE(m)) - return(0); + return (0); return (m->m_ext.ext_buf + m->m_ext.ext_size - - (m->m_data + m->m_len)); + (m->m_data + m->m_len)); } return (&m->m_dat[MLEN] - (m->m_data + m->m_len)); } /* - * Lesser-used path for M_PREPEND: - * allocate new mbuf to prepend to chain, - * copy junk along. - * Does not adjust packet header length. + * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain, + * copy junk along. Does not adjust packet header length. */ struct mbuf * -m_prepend( - struct mbuf *m, - int len, - int how) +m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; - MGET(mn, how, m->m_type); - if (mn == (struct mbuf *)NULL) { + _MGET(mn, how, m->m_type); + if (mn == NULL) { m_freem(m); - return ((struct mbuf *)NULL); + return (NULL); } if (m->m_flags & M_PKTHDR) { M_COPY_PKTHDR(mn, m); @@ -1388,26 +3773,21 @@ m_prepend( } /* - * Replacement for old M_PREPEND macro: - * allocate new mbuf to prepend to chain, - * copy junk along, and adjust length. - * + * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to + * chain, copy junk along, and adjust length. 
*/ struct mbuf * -m_prepend_2( - struct mbuf *m, - int len, - int how) -{ - if (M_LEADINGSPACE(m) >= len) { - m->m_data -= len; - m->m_len += len; - } else { +m_prepend_2(struct mbuf *m, int len, int how) +{ + if (M_LEADINGSPACE(m) >= len) { + m->m_data -= len; + m->m_len += len; + } else { m = m_prepend(m, len, how); - } - if ((m) && (m->m_flags & M_PKTHDR)) - m->m_pkthdr.len += len; - return (m); + } + if ((m) && (m->m_flags & M_PKTHDR)) + m->m_pkthdr.len += len; + return (m); } /* @@ -1418,64 +3798,45 @@ m_prepend_2( int MCFail; struct mbuf * -m_copym( - struct mbuf *m, - int off0, - int len, - int wait) +m_copym(struct mbuf *m, int off0, int len, int wait) { - struct mbuf *n, **np; + struct mbuf *n, *mhdr = NULL, **np; int off = off0; struct mbuf *top; int copyhdr = 0; if (off < 0 || len < 0) - panic("m_copym"); - if (off == 0 && m->m_flags & M_PKTHDR) + panic("m_copym: invalid offset %d or len %d", off, len); + + if (off == 0 && (m->m_flags & M_PKTHDR)) { + mhdr = m; copyhdr = 1; + } while (off >= m->m_len) { - if (m == 0) - panic("m_copym"); + if (m->m_next == NULL) + panic("m_copym: invalid mbuf chain"); off -= m->m_len; m = m->m_next; } np = &top; - top = 0; - - MBUF_LOCK(); + top = NULL; while (len > 0) { - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); - - if (m == 0) { + if (m == NULL) { if (len != M_COPYALL) - panic("m_copym"); + panic("m_copym: len != M_COPYALL"); break; } - if ((n = mfree)) { - MCHECK(n); - ++mclrefcnt[mtocl(n)]; - mbstat.m_mtypes[MT_FREE]--; - mbstat.m_mtypes[m->m_type]++; - mfree = n->m_next; - n->m_next = n->m_nextpkt = 0; - n->m_type = m->m_type; - n->m_data = n->m_dat; - n->m_flags = 0; - } else { - MBUF_UNLOCK(); - n = m_retry(wait, m->m_type); - MBUF_LOCK(); - } + + n = _M_RETRY(wait, m->m_type); *np = n; - if (n == 0) + if (n == NULL) goto nospace; - if (copyhdr) { - M_COPY_PKTHDR(n, m); + + if (copyhdr != 0) { + M_COPY_PKTHDR(n, mhdr); if (len == M_COPYALL) n->m_pkthdr.len -= off0; else @@ -1483,24
+3844,24 @@ m_copym( copyhdr = 0; } if (len == M_COPYALL) { - if (min(len, (m->m_len - off)) == len) { - printf("m->m_len %d - off %d = %d, %d\n", - m->m_len, off, m->m_len - off, - min(len, (m->m_len - off))); - } + if (MIN(len, (m->m_len - off)) == len) { + printf("m->m_len %ld - off %d = %ld, %ld\n", + m->m_len, off, m->m_len - off, + MIN(len, (m->m_len - off))); + } } - n->m_len = min(len, (m->m_len - off)); + n->m_len = MIN(len, (m->m_len - off)); if (n->m_len == M_COPYALL) { - printf("n->m_len == M_COPYALL, fixing\n"); - n->m_len = MHLEN; + printf("n->m_len == M_COPYALL, fixing\n"); + n->m_len = MHLEN; } if (m->m_flags & M_EXT) { n->m_ext = m->m_ext; - insque((queue_t)&n->m_ext.ext_refs, (queue_t)&m->m_ext.ext_refs); + m_incref(m); n->m_data = m->m_data + off; n->m_flags |= M_EXT; } else { - bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), (unsigned)n->m_len); } if (len != M_COPYALL) @@ -1509,176 +3870,172 @@ m_copym( m = m->m_next; np = &n->m_next; } - MBUF_UNLOCK(); - if (top == 0) + if (top == NULL) MCFail++; return (top); nospace: - MBUF_UNLOCK(); m_freem(top); MCFail++; - return (0); + return (NULL); } - /* - * equivilent to m_copym except that all necessary - * mbuf hdrs are allocated within this routine - * also, the last mbuf and offset accessed are passed - * out and can be passed back in to avoid having to - * rescan the entire mbuf list (normally hung off of the socket) + * Equivalent to m_copym except that all necessary mbuf hdrs are allocated + * within this routine also, the last mbuf and offset accessed are passed + * out and can be passed back in to avoid having to rescan the entire mbuf + * list (normally hung off of the socket) */ struct mbuf * -m_copym_with_hdrs( - struct mbuf *m, - int off0, - int len, - int wait, - struct mbuf **m_last, - int *m_off) -{ - struct mbuf *n, **np = 0; - int off = off0; - struct mbuf *top = 0; +m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, + struct 
mbuf **m_last, int *m_off) +{ + struct mbuf *n, **np = NULL; + int off = off0, len = len0; + struct mbuf *top = NULL; + int mcflags = MSLEEPF(wait); int copyhdr = 0; - int type; + int type = 0; + mcache_obj_t *list = NULL; + int needed = 0; - if (off == 0 && m->m_flags & M_PKTHDR) + if (off == 0 && (m->m_flags & M_PKTHDR)) copyhdr = 1; - if (*m_last) { - m = *m_last; + if (*m_last != NULL) { + m = *m_last; off = *m_off; } else { - while (off >= m->m_len) { - off -= m->m_len; + while (off >= m->m_len) { + off -= m->m_len; m = m->m_next; } } - MBUF_LOCK(); + n = m; + while (len > 0) { + needed++; + ASSERT(n != NULL); + len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0))); + n = n->m_next; + } + needed++; + len = len0; + + /* + * If the caller doesn't want to be put to sleep, mark it with + * MCR_TRYHARD so that we may reclaim buffers from other places + * before giving up. + */ + if (mcflags & MCR_NOSLEEP) + mcflags |= MCR_TRYHARD; + + if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed, + mcflags) != needed) + goto nospace; + needed = 0; while (len > 0) { - m_range_check(mfree); - m_range_check(mclfree); - m_range_check(mbigfree); - - if (top == 0) - type = MT_HEADER; - else { - if (m == 0) - panic("m_gethdr_and_copym"); - type = m->m_type; - } - if ((n = mfree)) { - MCHECK(n); - ++mclrefcnt[mtocl(n)]; - mbstat.m_mtypes[MT_FREE]--; - mbstat.m_mtypes[type]++; - mfree = n->m_next; - n->m_next = n->m_nextpkt = 0; - n->m_type = type; - - if (top) { - n->m_data = n->m_dat; - n->m_flags = 0; - } else { - n->m_data = n->m_pktdat; - n->m_flags = M_PKTHDR; - _M_CLEAR_PKTHDR(n); - } - } else { - MBUF_UNLOCK(); - if (top) - n = m_retry(wait, type); - else - n = m_retryhdr(wait, type); - MBUF_LOCK(); - } - if (n == 0) + n = (struct mbuf *)list; + list = list->obj_next; + ASSERT(n != NULL && m != NULL); + + type = (top == NULL) ? 
MT_HEADER : m->m_type; + MBUF_INIT(n, (top == NULL), type); +#if CONFIG_MACF_NET + if (top == NULL && mac_mbuf_label_init(n, wait) != 0) { + mtype_stat_inc(MT_HEADER); + mtype_stat_dec(MT_FREE); + m_free(n); goto nospace; - if (top == 0) { - top = n; + } +#endif /* MAC_NET */ + + if (top == NULL) { + top = n; np = &top->m_next; continue; - } else - *np = n; + } else { + needed++; + *np = n; + } if (copyhdr) { M_COPY_PKTHDR(n, m); n->m_pkthdr.len = len; copyhdr = 0; } - n->m_len = min(len, (m->m_len - off)); + n->m_len = MIN(len, (m->m_len - off)); if (m->m_flags & M_EXT) { n->m_ext = m->m_ext; - insque((queue_t)&n->m_ext.ext_refs, (queue_t)&m->m_ext.ext_refs); + m_incref(m); n->m_data = m->m_data + off; n->m_flags |= M_EXT; } else { - bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), (unsigned)n->m_len); } len -= n->m_len; - + if (len == 0) { - if ((off + n->m_len) == m->m_len) { - *m_last = m->m_next; - *m_off = 0; + if ((off + n->m_len) == m->m_len) { + *m_last = m->m_next; + *m_off = 0; } else { - *m_last = m; - *m_off = off + n->m_len; + *m_last = m; + *m_off = off + n->m_len; } - break; + break; } off = 0; m = m->m_next; np = &n->m_next; } - MBUF_UNLOCK(); + mtype_stat_inc(MT_HEADER); + mtype_stat_add(type, needed); + mtype_stat_sub(MT_FREE, needed + 1); + + ASSERT(list == NULL); return (top); -nospace: - MBUF_UNLOCK(); - if (top) - m_freem(top); +nospace: + if (list != NULL) + mcache_free_ext(m_cache(MC_MBUF), list); + if (top != NULL) + m_freem(top); MCFail++; - return (0); + return (NULL); } - /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. 
*/ -void m_copydata( - struct mbuf *m, - int off, - int len, - caddr_t cp) +void +m_copydata(struct mbuf *m, int off, int len, caddr_t cp) { unsigned count; if (off < 0 || len < 0) - panic("m_copydata"); + panic("m_copydata: invalid offset %d or len %d", off, len); + while (off > 0) { - if (m == 0) - panic("m_copydata"); + if (m == NULL) + panic("m_copydata: invalid mbuf chain"); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { - if (m == 0) - panic("m_copydata"); - count = min(m->m_len - off, len); - bcopy(mtod(m, caddr_t) + off, cp, count); + if (m == NULL) + panic("m_copydata: invalid mbuf chain"); + count = MIN(m->m_len - off, len); + bcopy(MTOD(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; @@ -1687,24 +4044,23 @@ void m_copydata( } /* - * Concatenate mbuf chain n to m. - * Both chains must be of the same type (e.g. MT_DATA). - * Any m_pkthdr is not updated. + * Concatenate mbuf chain n to m. Both chains must be of the same type + * (e.g. MT_DATA). Any m_pkthdr is not updated. 
*/ -void m_cat( - struct mbuf *m, struct mbuf *n) +void +m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { - if (m->m_flags & M_EXT || + if ((m->m_flags & M_EXT) || m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ - bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); @@ -1712,9 +4068,7 @@ void m_cat( } void -m_adj( - struct mbuf *mp, - int req_len) +m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; @@ -1797,9 +4151,7 @@ m_adj( int MPFail; struct mbuf * -m_pullup( - struct mbuf *n, - int len) +m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; @@ -1820,7 +4172,7 @@ m_pullup( } else { if (len > MHLEN) goto bad; - MGET(m, M_DONTWAIT, n->m_type); + _MGET(m, M_DONTWAIT, n->m_type); if (m == 0) goto bad; m->m_len = 0; @@ -1831,9 +4183,9 @@ m_pullup( } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { - count = min(min(max(len, max_protohdr), space), n->m_len); - bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, - (unsigned)count); + count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len); + bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, + (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; @@ -1861,23 +4213,20 @@ bad: * attempts to restore the chain to its original state. 
*/ struct mbuf * -m_split( - struct mbuf *m0, - int len0, - int wait) +m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; unsigned len = len0, remain; for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; - if (m == 0) - return (0); + if (m == NULL) + return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR) { - MGETHDR(n, wait, m0->m_type); - if (n == 0) - return (0); + _MGETHDR(n, wait, m0->m_type); + if (n == NULL) + return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; @@ -1887,53 +4236,48 @@ m_split( /* m can't be the lead packet */ MH_ALIGN(n, 0); n->m_next = m_split(m, len, wait); - if (n->m_next == 0) { + if (n->m_next == NULL) { (void) m_free(n); - return (0); + return (NULL); } else return (n); } else MH_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; - m->m_next = 0; + m->m_next = NULL; return (n); } else { - MGET(n, wait, m->m_type); - if (n == 0) - return (0); + _MGET(n, wait, m->m_type); + if (n == NULL) + return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_flags |= M_EXT; - MBUF_LOCK(); n->m_ext = m->m_ext; - insque((queue_t)&n->m_ext.ext_refs, (queue_t)&m->m_ext.ext_refs); - MBUF_UNLOCK(); + m_incref(m); n->m_data = m->m_data + len; } else { - bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); + bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; - m->m_next = 0; + m->m_next = NULL; return (n); } + /* * Routine to copy from device local memory into mbufs. 
*/ struct mbuf * -m_devget( - char *buf, - int totlen, - int off0, - struct ifnet *ifp, - void (*copy)(const void *, void *, size_t)) +m_devget(char *buf, int totlen, int off0, struct ifnet *ifp, + void (*copy)(const void *, void *, size_t)) { struct mbuf *m; - struct mbuf *top = 0, **mp = &top; + struct mbuf *top = NULL, **mp = &top; int off = off0, len; char *cp; char *epkt; @@ -1945,52 +4289,54 @@ m_devget( * If 'off' is non-zero, packet is trailer-encapsulated, * so we have to skip the type and length fields. */ - cp += off + 2 * sizeof(u_int16_t); - totlen -= 2 * sizeof(u_int16_t); + cp += off + 2 * sizeof (u_int16_t); + totlen -= 2 * sizeof (u_int16_t); } - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == 0) - return (0); + _MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (NULL); m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; m->m_len = MHLEN; while (totlen > 0) { - if (top) { - MGET(m, M_DONTWAIT, MT_DATA); - if (m == 0) { + if (top != NULL) { + _MGET(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { m_freem(top); - return (0); + return (NULL); } m->m_len = MLEN; } - len = min(totlen, epkt - cp); + len = MIN(totlen, epkt - cp); if (len >= MINCLSIZE) { MCLGET(m, M_DONTWAIT); - if (m->m_flags & M_EXT) - m->m_len = len = min(len, MCLBYTES); - else { - /* give up when it's out of cluster mbufs */ - if (top) - m_freem(top); + if (m->m_flags & M_EXT) { + m->m_len = len = MIN(len, m_maxsize(MC_CL)); + } else { + /* give up when it's out of cluster mbufs */ + if (top != NULL) + m_freem(top); m_freem(m); - return (0); + return (NULL); } } else { /* * Place initial small packet/header at end of mbuf.
*/ if (len < m->m_len) { - if (top == 0 && len + max_linkhdr <= m->m_len) + if (top == NULL && + len + max_linkhdr <= m->m_len) m->m_data += max_linkhdr; m->m_len = len; - } else + } else { len = m->m_len; + } } if (copy) - copy(cp, mtod(m, caddr_t), (unsigned)len); + copy(cp, MTOD(m, caddr_t), (unsigned)len); else - bcopy(cp, mtod(m, caddr_t), (unsigned)len); + bcopy(cp, MTOD(m, caddr_t), (unsigned)len); cp += len; *mp = m; mp = &m->m_next; @@ -2002,49 +4348,93 @@ m_devget( } /* - * Cluster freelist allocation check. The mbuf lock must be held. - * Ensure hysteresis between hi/lo. + * Cluster freelist allocation check. */ static int m_howmany(int num, size_t bufsize) { - int i = 0; - + int i = 0, j = 0; + u_int32_t m_clusters, m_bigclusters, m_16kclusters; + u_int32_t m_clfree, m_bigclfree, m_16kclfree; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + m_clusters = m_total(MC_CL); + m_bigclusters = m_total(MC_BIGCL); + m_16kclusters = m_total(MC_16KCL); + m_clfree = m_infree(MC_CL); + m_bigclfree = m_infree(MC_BIGCL); + m_16kclfree = m_infree(MC_16KCL); + /* Bail if we've maxed out the mbuf memory map */ - if (mbstat.m_clusters + (mbstat.m_bigclusters << 1) < nmbclusters) { - int j = 0; - - if (bufsize == MCLBYTES) { - /* Under minimum */ - if (mbstat.m_clusters < MINCL) - return (MINCL - mbstat.m_clusters); - /* Too few (free < 1/2 total) and not over maximum */ - if (mbstat.m_clusters < (nmbclusters >> 1)) { - if (num >= mbstat.m_clfree) - i = num - mbstat.m_clfree; - if (((mbstat.m_clusters + num) >> 1) > mbstat.m_clfree) - j = ((mbstat.m_clusters + num) >> 1) - mbstat.m_clfree; - i = max(i, j); - if (i + mbstat.m_clusters >= (nmbclusters >> 1)) - i = (nmbclusters >> 1) - mbstat.m_clusters; - } - } else { - /* Under minimum */ - if (mbstat.m_bigclusters < MINCL) - return (MINCL - mbstat.m_bigclusters); - /* Too few (free < 1/2 total) and not over maximum */ - if (mbstat.m_bigclusters < (nmbclusters >> 2)) { - if (num >= mbstat.m_bigclfree) - i = num - 
mbstat.m_bigclfree; - if (((mbstat.m_bigclusters + num) >> 1) > mbstat.m_bigclfree) - j = ((mbstat.m_bigclusters + num) >> 1) - mbstat.m_bigclfree; - i = max(i, j); - if (i + mbstat.m_bigclusters >= (nmbclusters >> 2)) - i = (nmbclusters >> 2) - mbstat.m_bigclusters; - } + if ((bufsize != m_maxsize(MC_16KCL) && + (m_clusters + (m_bigclusters << 1) >= nclusters)) || + (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && + (m_16kclusters << 3) >= njcl)) { +#if DEBUG + if (bufsize == MCLBYTES && num > m_clfree) { + printf("m_howmany - out of small clusters, " + "%d short\n", num - mbstat.m_clfree); + } +#endif /* DEBUG */ + return (0); + } + + if (bufsize == m_maxsize(MC_CL)) { + /* Under minimum */ + if (m_clusters < MINCL) + return (MINCL - m_clusters); + /* Too few (free < 1/16 total) and not over maximum */ + if (m_clusters < m_maxlimit(MC_CL)) { + if (m_clfree >= MCL_LOWAT) + return (0); + if (num >= m_clfree) + i = num - m_clfree; + if (((m_clusters + num) >> 4) > m_clfree) + j = ((m_clusters + num) >> 4) - m_clfree; + i = MAX(i, j); + if (i + m_clusters >= m_maxlimit(MC_CL)) + i = m_maxlimit(MC_CL) - m_clusters; } + VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL)); + } else if (bufsize == m_maxsize(MC_BIGCL)) { + /* Under minimum */ + if (m_bigclusters < MINBIGCL) + return (MINBIGCL - m_bigclusters); + /* Too few (free < 1/16 total) and not over maximum */ + if (m_bigclusters < m_maxlimit(MC_BIGCL)) { + if (m_bigclfree >= MBIGCL_LOWAT) + return (0); + if (num >= m_bigclfree) + i = num - m_bigclfree; + if (((m_bigclusters + num) >> 4) > m_bigclfree) + j = ((m_bigclusters + num) >> 4) - m_bigclfree; + i = MAX(i, j); + if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) + i = m_maxlimit(MC_BIGCL) - m_bigclusters; + } + VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); + } else { + VERIFY(njcl > 0); + /* Under minimum */ + if (m_16kclusters < MIN16KCL) + return (MIN16KCL - m_16kclusters); + /* Too few (free < 1/16 total) and not over maximum */ + if (m_16kclusters < 
m_maxlimit(MC_16KCL)) { + if (m_16kclfree >= M16KCL_LOWAT) + return (0); + if (num >= m_16kclfree) + i = num - m_16kclfree; + if (((m_16kclusters + num) >> 4) > m_16kclfree) + j = ((m_16kclusters + num) >> 4) - m_16kclfree; + i = MAX(i, j); + if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) + i = m_maxlimit(MC_16KCL) - m_16kclusters; + } + VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL)); } - return i; + + return (i); } /* @@ -2053,33 +4443,29 @@ m_howmany(int num, size_t bufsize) * chain if necessary. */ void -m_copyback( - struct mbuf *m0, - int off, - int len, - caddr_t cp) +m_copyback(struct mbuf *m0, int off, int len, caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; - if (m0 == 0) + if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; - if (m->m_next == 0) { + if (m->m_next == NULL) { n = m_getclr(M_DONTWAIT, m->m_type); - if (n == 0) + if (n == NULL) goto out; - n->m_len = min(MLEN, len + off); + n->m_len = MIN(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { - mlen = min (m->m_len - off, len); - bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + mlen = MIN(m->m_len - off, len); + bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen); cp += mlen; len -= mlen; mlen += off; @@ -2087,51 +4473,52 @@ m_copyback( totlen += mlen; if (len == 0) break; - if (m->m_next == 0) { - n = m_get(M_DONTWAIT, m->m_type); - if (n == 0) + if (m->m_next == NULL) { + n = _M_GET(M_DONTWAIT, m->m_type); + if (n == NULL) break; - n->m_len = min(MLEN, len); + n->m_len = MIN(MLEN, len); m->m_next = n; } m = m->m_next; } -out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) +out: + if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } +char * +mcl_to_paddr(char *addr) +{ + int base_phys; -char *mcl_to_paddr(char *addr) { - int base_phys; - - if (addr < (char *)mbutl || addr >= (char *)embutl) - return (0); + if (!MBUF_IN_MAP(addr)) + return (NULL); base_phys 
= mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT]; if (base_phys == 0) - return (0); + return (NULL); return ((char *)((int)base_phys | ((int)addr & PGOFSET))); } /* * Dup the mbuf chain passed in. The whole thing. No cute additional cruft. * And really copy the thing. That way, we don't "precompute" checksums - * for unsuspecting consumers. - * Assumption: m->m_nextpkt == 0. - * Trick: for small packets, don't dup into a cluster. That way received - * packets don't take up too much room in the sockbuf (cf. sbspace()). + * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for + * small packets, don't dup into a cluster. That way received packets + * don't take up too much room in the sockbuf (cf. sbspace()). */ int MDFail; struct mbuf * m_dup(struct mbuf *m, int how) -{ +{ struct mbuf *n, **np; struct mbuf *top; int copyhdr = 0; np = &top; - top = 0; + top = NULL; if (m->m_flags & M_PKTHDR) copyhdr = 1; @@ -2139,172 +4526,766 @@ m_dup(struct mbuf *m, int how) * Quick check: if we have one mbuf and its data fits in an * mbuf with packet header, just copy and go. */ - if (m->m_next == NULL) - { /* Then just move the data into an mbuf and be done... */ + if (m->m_next == NULL) { + /* Then just move the data into an mbuf and be done...
*/ + if (copyhdr) { + if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) { + if ((n = _M_GETHDR(how, m->m_type)) == NULL) + return (NULL); n->m_len = m->m_len; m_dup_pkthdr(n, m, how); bcopy(m->m_data, n->m_data, m->m_len); - return(n); + return (n); } - } else if (m->m_len <= MLEN) - { if ((n = m_get(how, m->m_type)) == NULL) - return(NULL); + } else if (m->m_len <= MLEN) { + if ((n = _M_GET(how, m->m_type)) == NULL) + return (NULL); bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; - return(n); + return (n); } } - while (m) - { + while (m != NULL) { #if BLUE_DEBUG kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len, - m->m_data); + m->m_data); #endif if (copyhdr) - n = m_gethdr(how, m->m_type); + n = _M_GETHDR(how, m->m_type); else - n = m_get(how, m->m_type); - if (n == 0) + n = _M_GET(how, m->m_type); + if (n == NULL) goto nospace; - if (m->m_flags & M_EXT) - { MCLGET(n, how); - if ((n->m_flags & M_EXT) == 0) + if (m->m_flags & M_EXT) { + if (m->m_len <= m_maxsize(MC_CL)) + MCLGET(n, how); + else if (m->m_len <= m_maxsize(MC_BIGCL)) + n = m_mbigget(n, how); + else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) + n = m_m16kget(n, how); + if (!(n->m_flags & M_EXT)) { + (void) m_free(n); goto nospace; + } } *np = n; - if (copyhdr) - { /* Don't use M_COPY_PKTHDR: preserve m_data */ + if (copyhdr) { + /* Don't use M_COPY_PKTHDR: preserve m_data */ m_dup_pkthdr(n, m, how); copyhdr = 0; - if ((n->m_flags & M_EXT) == 0) + if (!(n->m_flags & M_EXT)) n->m_data = n->m_pktdat; } n->m_len = m->m_len; /* * Get the dup on the same bdry as the original * Assume that the two mbufs have the same offset to data area - * (up to word bdries) + * (up to word boundaries) */ - bcopy(mtod(m, caddr_t), mtod(n, caddr_t), (unsigned)n->m_len); + bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len); m = m->m_next; np = &n->m_next; #if BLUE_DEBUG kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len, - n->m_data); + n->m_data); #endif } - if (top == 0) + if (top == NULL) 
MDFail++; return (top); - nospace: + +nospace: m_freem(top); MDFail++; - return (0); + return (NULL); } -int -m_mclref(struct mbuf *p) +#define MBUF_MULTIPAGES(m) \ + (((m)->m_flags & M_EXT) && \ + ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \ + (!IS_P2ALIGNED((m)->m_data, NBPG) && \ + P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len)))) + +static struct mbuf * +m_expand(struct mbuf *m, struct mbuf **last) { - return (_MCLREF(p)); + struct mbuf *top = NULL; + struct mbuf **nm = &top; + uintptr_t data0, data; + unsigned int len0, len; + + VERIFY(MBUF_MULTIPAGES(m)); + VERIFY(m->m_next == NULL); + data0 = (uintptr_t)m->m_data; + len0 = m->m_len; + *last = top; + + for (;;) { + struct mbuf *n; + + data = data0; + if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG) + len = NBPG; + else if (!IS_P2ALIGNED(data, NBPG) && + P2ROUNDUP(data, NBPG) < (data + len0)) + len = P2ROUNDUP(data, NBPG) - data; + else + len = len0; + + VERIFY(len > 0); + VERIFY(m->m_flags & M_EXT); + m->m_data = (void *)data; + m->m_len = len; + + *nm = *last = m; + nm = &m->m_next; + m->m_next = NULL; + + data0 += len; + len0 -= len; + if (len0 == 0) + break; + + n = _M_RETRY(M_DONTWAIT, MT_DATA); + if (n == NULL) { + m_freem(top); + top = *last = NULL; + break; + } + + n->m_ext = m->m_ext; + m_incref(m); + n->m_flags |= M_EXT; + m = n; + } + return (top); } -int -m_mclunref(struct mbuf *p) +struct mbuf * +m_normalize(struct mbuf *m) { - return (_MCLUNREF(p)); + struct mbuf *top = NULL; + struct mbuf **nm = &top; + boolean_t expanded = FALSE; + + while (m != NULL) { + struct mbuf *n; + + n = m->m_next; + m->m_next = NULL; + + /* Does the data cross one or more page boundaries?
*/ + if (MBUF_MULTIPAGES(m)) { + struct mbuf *last; + if ((m = m_expand(m, &last)) == NULL) { + m_freem(n); + m_freem(top); + top = NULL; + break; + } + *nm = m; + nm = &last->m_next; + expanded = TRUE; + } else { + *nm = m; + nm = &m->m_next; + } + m = n; + } + if (expanded) + atomic_add_32(&mb_normalized, 1); + return (top); } -/* change mbuf to new type */ void m_mchtype(struct mbuf *m, int t) { - MBUF_LOCK(); - mbstat.m_mtypes[(m)->m_type]--; - mbstat.m_mtypes[t]++; - (m)->m_type = t; - MBUF_UNLOCK(); + mtype_stat_inc(t); + mtype_stat_dec(m->m_type); + (m)->m_type = t; } -void *m_mtod(struct mbuf *m) +void * +m_mtod(struct mbuf *m) { - return ((m)->m_data); + return (MTOD(m, void *)); } -struct mbuf *m_dtom(void *x) +struct mbuf * +m_dtom(void *x) { return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1))); } -int m_mtocl(void *x) +void +m_mcheck(struct mbuf *m) { - return (((char *)(x) - (char *)mbutl) / sizeof(union mcluster)); + _MCHECK(m); } -union mcluster *m_cltom(int x) +/* + * Inform the corresponding mcache(s) that there's a waiter below. + */ +static void +mbuf_waiter_inc(mbuf_class_t class, boolean_t comp) { - return ((union mcluster *)(mbutl + (x))); + mcache_waiter_inc(m_cache(class)); + if (comp) { + if (class == MC_CL) { + mcache_waiter_inc(m_cache(MC_MBUF_CL)); + } else if (class == MC_BIGCL) { + mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); + } else if (class == MC_16KCL) { + mcache_waiter_inc(m_cache(MC_MBUF_16KCL)); + } else { + mcache_waiter_inc(m_cache(MC_MBUF_CL)); + mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); + } + } } +/* + * Inform the corresponding mcache(s) that there's no more waiter below. 
+ */ +static void +mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) +{ + mcache_waiter_dec(m_cache(class)); + if (comp) { + if (class == MC_CL) { + mcache_waiter_dec(m_cache(MC_MBUF_CL)); + } else if (class == MC_BIGCL) { + mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); + } else if (class == MC_16KCL) { + mcache_waiter_dec(m_cache(MC_MBUF_16KCL)); + } else { + mcache_waiter_dec(m_cache(MC_MBUF_CL)); + mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); + } + } +} -void m_mcheck(struct mbuf *m) +/* + * Called during blocking allocation. Returns TRUE if one or more objects + * are available at the per-CPU caches layer and that allocation should be + * retried at that level. + */ +static boolean_t +mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) { - if (m->m_type != MT_FREE) - panic("mget MCHECK: m_type=%x m=%x", m->m_type, m); + boolean_t mcache_retry = FALSE; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + /* Check if there's anything at the cache layer */ + if (mbuf_cached_above(class, wait)) { + mcache_retry = TRUE; + goto done; + } + + /* Nothing? Then try hard to get it from somewhere */ + m_reclaim(class, num, (wait & MCR_COMP)); + + /* We tried hard and got something? */ + if (m_infree(class) > 0) { + mbstat.m_wait++; + goto done; + } else if (mbuf_cached_above(class, wait)) { + mbstat.m_wait++; + mcache_retry = TRUE; + goto done; + } else if (wait & MCR_TRYHARD) { + mcache_retry = TRUE; + goto done; + } + + /* + * There's really nothing for us right now; inform the + * cache(s) that there is a waiter below and go to sleep. 
+ */ + mbuf_waiter_inc(class, (wait & MCR_COMP)); + + VERIFY(!(wait & MCR_NOSLEEP)); + mb_waiters++; + (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL); + + /* We are now up; stop getting notified until next round */ + mbuf_waiter_dec(class, (wait & MCR_COMP)); + + /* We waited and got something */ + if (m_infree(class) > 0) { + mbstat.m_wait++; + goto done; + } else if (mbuf_cached_above(class, wait)) { + mbstat.m_wait++; + mcache_retry = TRUE; + } +done: + return (mcache_retry); } static void -mbuf_expand_thread(void) +mbuf_worker_thread(void) { + int mbuf_expand; + while (1) { - MBUF_LOCK(); + lck_mtx_lock(mbuf_mlock); + + mbuf_expand = 0; if (mbuf_expand_mcl) { int n; - - /* Adjust to the current number of cluster in use */ - n = mbuf_expand_mcl - (mbstat.m_clusters - mbstat.m_clfree); + + /* Adjust to current number of cluster in use */ + n = mbuf_expand_mcl - + (m_total(MC_CL) - m_infree(MC_CL)); + if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) + n = m_maxlimit(MC_CL) - m_total(MC_CL); mbuf_expand_mcl = 0; - - if (n > 0) - (void)m_clalloc(n, M_WAIT, MCLBYTES, 1); + + if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0) + mbuf_expand++; } if (mbuf_expand_big) { int n; - - /* Adjust to the current number of 4 KB cluster in use */ - n = mbuf_expand_big - (mbstat.m_bigclusters - mbstat.m_bigclfree); + + /* Adjust to current number of 4 KB cluster in use */ + n = mbuf_expand_big - + (m_total(MC_BIGCL) - m_infree(MC_BIGCL)); + if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) + n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL); mbuf_expand_big = 0; - - if (n > 0) - (void)m_clalloc(n, M_WAIT, NBPG, 1); - } - MBUF_UNLOCK(); - /* - * Because we can run out of memory before filling the mbuf map, we - * should not allocate more clusters than they are mbufs -- otherwise - * we could have a large number of useless clusters allocated. 
+ + if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0) + mbuf_expand++; + } + if (mbuf_expand_16k) { + int n; + + /* Adjust to current number of 16 KB cluster in use */ + n = mbuf_expand_16k - + (m_total(MC_16KCL) - m_infree(MC_16KCL)); + if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) + n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); + mbuf_expand_16k = 0; + + if (n > 0) + (void) freelist_populate(MC_16KCL, n, M_WAIT); + } + + /* + * Because we can run out of memory before filling the mbuf + * map, we should not allocate more clusters than they are + * mbufs -- otherwise we could have a large number of useless + * clusters allocated. */ - while (mbstat.m_mbufs < mbstat.m_bigclusters + mbstat.m_clusters) { - if (m_expand(M_WAIT) == 0) - break; + if (mbuf_expand) { + while (m_total(MC_MBUF) < + (m_total(MC_BIGCL) + m_total(MC_CL))) { + if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) + break; + } } - - assert_wait(&mbuf_expand_thread_wakeup, THREAD_UNINT); - (void) thread_block((thread_continue_t)mbuf_expand_thread); + + lck_mtx_unlock(mbuf_mlock); + + assert_wait(&mbuf_worker_run, THREAD_UNINT); + (void) thread_block((thread_continue_t)mbuf_worker_thread); } } static void -mbuf_expand_thread_init(void) +mbuf_worker_thread_init(void) { - mbuf_expand_thread_initialized++; - mbuf_expand_thread(); + mbuf_worker_ready++; + mbuf_worker_thread(); } -SYSCTL_DECL(_kern_ipc); -SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, ""); +static mcl_slab_t * +slab_get(void *buf) +{ + mcl_slabg_t *slg; + unsigned int ix, k; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(MBUF_IN_MAP(buf)); + ix = ((char *)buf - (char *)mbutl) >> MBSHIFT; + VERIFY(ix < maxslabgrp); + + if ((slg = slabstbl[ix]) == NULL) { + /* + * In the current implementation, we never shrink the memory + * pool (hence the cluster map); if we attempt to reallocate + * a cluster group when it's already allocated, panic since + * this is a sign of a memory corruption 
(slabstbl[ix] got + * nullified). This also means that there shouldn't be any + * hole in the kernel sub-map for the mbuf pool. + */ + ++slabgrp; + VERIFY(ix < slabgrp); + /* + * Slabs expansion can only be done single threaded; when + * we get here, it must be as a result of m_clalloc() which + * is serialized and therefore mb_clalloc_busy must be set. + */ + VERIFY(mb_clalloc_busy); + lck_mtx_unlock(mbuf_mlock); + + /* This is a new buffer; create the slabs group for it */ + MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP, + M_WAITOK | M_ZERO); + VERIFY(slg != NULL); + + lck_mtx_lock(mbuf_mlock); + /* + * No other thread could have gone into m_clalloc() after + * we dropped the lock above, so verify that it's true. + */ + VERIFY(mb_clalloc_busy); + + slabstbl[ix] = slg; + + /* Chain each slab in the group to its forward neighbor */ + for (k = 1; k < NSLABSPMB; k++) + slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k]; + VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL); + + /* And chain the last slab in the previous group to this */ + if (ix > 0) { + VERIFY(slabstbl[ix - 1]-> + slg_slab[NSLABSPMB - 1].sl_next == NULL); + slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next = + &slg->slg_slab[0]; + } + } + + ix = MTOCL(buf) % NSLABSPMB; + VERIFY(ix < NSLABSPMB); + + return (&slg->slg_slab[ix]); +} + +static void +slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags, + void *base, void *head, unsigned int len, int refcnt, int chunks) +{ + sp->sl_class = class; + sp->sl_flags = flags; + sp->sl_base = base; + sp->sl_head = head; + sp->sl_len = len; + sp->sl_refcnt = refcnt; + sp->sl_chunks = chunks; + slab_detach(sp); +} + +static void +slab_insert(mcl_slab_t *sp, mbuf_class_t class) +{ + VERIFY(slab_is_detached(sp)); + m_slab_cnt(class)++; + TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); + sp->sl_flags &= ~SLF_DETACHED; + if (class == MC_BIGCL) { + sp = sp->sl_next; + /* Next slab must already be present */ + VERIFY(sp != NULL); + 
VERIFY(slab_is_detached(sp)); + sp->sl_flags &= ~SLF_DETACHED; + } else if (class == MC_16KCL) { + int k; + for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + sp = sp->sl_next; + /* Next slab must already be present */ + VERIFY(sp != NULL); + VERIFY(slab_is_detached(sp)); + sp->sl_flags &= ~SLF_DETACHED; + } + } +} + +static void +slab_remove(mcl_slab_t *sp, mbuf_class_t class) +{ + VERIFY(!slab_is_detached(sp)); + VERIFY(m_slab_cnt(class) > 0); + m_slab_cnt(class)--; + TAILQ_REMOVE(&m_slablist(class), sp, sl_link); + slab_detach(sp); + if (class == MC_BIGCL) { + sp = sp->sl_next; + /* Next slab must already be present */ + VERIFY(sp != NULL); + VERIFY(!slab_is_detached(sp)); + slab_detach(sp); + } else if (class == MC_16KCL) { + int k; + for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + sp = sp->sl_next; + /* Next slab must already be present */ + VERIFY(sp != NULL); + VERIFY(!slab_is_detached(sp)); + slab_detach(sp); + } + } +} + +static boolean_t +slab_inrange(mcl_slab_t *sp, void *buf) +{ + return ((uintptr_t)buf >= (uintptr_t)sp->sl_base && + (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len)); +} + +#undef panic(...) 
+ +static void +slab_nextptr_panic(mcl_slab_t *sp, void *addr) +{ + int i; + unsigned int chunk_len = sp->sl_len / sp->sl_chunks; + uintptr_t buf = (uintptr_t)sp->sl_base; + + for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) { + void *next = ((mcache_obj_t *)buf)->obj_next; + if (next != addr) + continue; + if (mclaudit == NULL) { + if (next != NULL && !MBUF_IN_MAP(next)) { + mcache_t *cp = m_cache(sp->sl_class); + panic("%s: %s buffer %p in slab %p modified " + "after free at offset 0: %p out of range " + "[%p-%p)\n", __func__, cp->mc_name, + (void *)buf, sp, next, mbutl, embutl); + /* NOTREACHED */ + } + } else { + mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class, + (mcache_obj_t *)buf); + mcl_audit_verify_nextptr(next, mca); + } + } +} + +static void +slab_detach(mcl_slab_t *sp) +{ + sp->sl_link.tqe_next = (mcl_slab_t *)-1; + sp->sl_link.tqe_prev = (mcl_slab_t **)-1; + sp->sl_flags |= SLF_DETACHED; +} + +static boolean_t +slab_is_detached(mcl_slab_t *sp) +{ + return ((intptr_t)sp->sl_link.tqe_next == -1 && + (intptr_t)sp->sl_link.tqe_prev == -1 && + (sp->sl_flags & SLF_DETACHED)); +} + +static void +mcl_audit_init(void *buf, mcache_audit_t **mca_list, + mcache_obj_t **con_list, size_t con_size, unsigned int num) +{ + mcache_audit_t *mca, *mca_tail; + mcache_obj_t *con = NULL; + boolean_t save_contents = (con_list != NULL); + unsigned int i, ix; + + ASSERT(num <= NMBPCL); + ASSERT(con_list == NULL || con_size != 0); + + ix = MTOCL(buf); + /* Make sure we haven't been here before */ + for (i = 0; i < NMBPCL; i++) + VERIFY(mclaudit[ix].cl_audit[i] == NULL); + + mca = mca_tail = *mca_list; + if (save_contents) + con = *con_list; + + for (i = 0; i < num; i++) { + mcache_audit_t *next; + + next = mca->mca_next; + bzero(mca, sizeof (*mca)); + mca->mca_next = next; + mclaudit[ix].cl_audit[i] = mca; + + /* Attach the contents buffer if requested */ + if (save_contents) { + VERIFY(con != NULL); + mca->mca_contents_size = con_size; + mca->mca_contents = con; + con 
= con->obj_next; + bzero(mca->mca_contents, mca->mca_contents_size); + } + + mca_tail = mca; + mca = mca->mca_next; + } + if (save_contents) + *con_list = con; + + *mca_list = mca_tail->mca_next; + mca_tail->mca_next = NULL; +} + +/* + * Given an address of a buffer (mbuf/cluster/big cluster), return + * the corresponding audit structure for that buffer. + */ +static mcache_audit_t * +mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o) +{ + mcache_audit_t *mca = NULL; + int ix = MTOCL(o); + + VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG))); + + switch (class) { + case MC_MBUF: + /* + * For the mbuf case, find the index of the cluster + * used by the mbuf and use that index to locate the + * base address of the cluster. Then find out the + * mbuf index relative to the cluster base and use + * it to locate the audit structure. + */ + VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL); + mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)]; + break; + + case MC_CL: + case MC_BIGCL: + case MC_16KCL: + /* + * Same as above, but only return the first element. 
+ */ + mca = mclaudit[ix].cl_audit[0]; + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (mca); +} + +static void +mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, + boolean_t alloc) +{ + struct mbuf *m = addr; + mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next; + + VERIFY(mca->mca_contents != NULL && + mca->mca_contents_size == AUDIT_CONTENTS_SIZE); + + mcl_audit_verify_nextptr(next, mca); + + if (!alloc) { + /* Save constructed mbuf fields */ + mcl_audit_save_mbuf(m, mca); + mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF)); + ((mcache_obj_t *)m)->obj_next = next; + return; + } + + /* Check if the buffer has been corrupted while in freelist */ + mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); + + /* Restore constructed mbuf fields */ + mcl_audit_restore_mbuf(m, mca, composite); +} + +static void +mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite) +{ + struct mbuf *ms = (struct mbuf *)mca->mca_contents; + + if (composite) { + struct mbuf *next = m->m_next; + VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL && + MBUF_IS_COMPOSITE(ms)); + /* + * We could have hand-picked the mbuf fields and restore + * them individually, but that will be a maintenance + * headache. Instead, restore everything that was saved; + * the mbuf layer will recheck and reinitialize anyway. + */ + bcopy(ms, m, mca->mca_contents_size); + m->m_next = next; + } else { + /* + * For a regular mbuf (no cluster attached) there's nothing + * to restore other than the type field, which is expected + * to be MT_FREE. 
+ */ + m->m_type = ms->m_type; + } + _MCHECK(m); +} + +static void +mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca) +{ + _MCHECK(m); + bcopy(m, mca->mca_contents, mca->mca_contents_size); +} + +static void +mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, + boolean_t save_next) +{ + mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; + + if (!alloc) { + mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); + if (save_next) { + mcl_audit_verify_nextptr(next, mca); + ((mcache_obj_t *)addr)->obj_next = next; + } + } else { + /* Check if the buffer has been corrupted while in freelist */ + mcl_audit_verify_nextptr(next, mca); + mcache_audit_free_verify_set(mca, addr, 0, size); + } +} + +static void +mcl_audit_mcheck_panic(struct mbuf *m) +{ + mcache_audit_t *mca; + + MRANGE(m); + mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); + + panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n", + m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca)); + /* NOTREACHED */ +} + +static void +mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) +{ + if (next != NULL && next != (void *)MCACHE_FREE_PATTERN && + !MBUF_IN_MAP(next)) { + panic("mcl_audit: buffer %p modified after free at offset 0: " + "%p out of range [%p-%p)\n%s\n", + mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca)); + /* NOTREACHED */ + } +} + +SYSCTL_DECL(_kern_ipc); +SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mbstat_sysctl, "S,mbstat", ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mb_stat_sysctl, "S,mb_stat", ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED, + &mb_normalized, 0, "");