git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1998-2020 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1982, 1986, 1988, 1991, 1993
	31	* The Regents of the University of California. All rights reserved.
	32	*
	33	* Redistribution and use in source and binary forms, with or without
	34	* modification, are permitted provided that the following conditions
	35	* are met:
	36	* 1. Redistributions of source code must retain the above copyright
	37	* notice, this list of conditions and the following disclaimer.
	38	* 2. Redistributions in binary form must reproduce the above copyright
	39	* notice, this list of conditions and the following disclaimer in the
	40	* documentation and/or other materials provided with the distribution.
	41	* 3. All advertising materials mentioning features or use of this software
	42	* must display the following acknowledgement:
	43	* This product includes software developed by the University of
	44	* California, Berkeley and its contributors.
	45	* 4. Neither the name of the University nor the names of its contributors
	46	* may be used to endorse or promote products derived from this software
	47	* without specific prior written permission.
	48	*
	49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	59	* SUCH DAMAGE.
	60	*
	61	* @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
	62	*/
	63	/*
	64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
	65	* support for mandatory and extensible security protections. This notice
	66	* is included in support of clause 2.2 (b) of the Apple Public License,
	67	* Version 2.0.
	68	*/
	69
	70	#include <ptrauth.h>
	71
	72	#include <sys/param.h>
	73	#include <sys/systm.h>
	74	#include <sys/malloc.h>
	75	#include <sys/mbuf.h>
	76	#include <sys/kernel.h>
	77	#include <sys/sysctl.h>
	78	#include <sys/syslog.h>
	79	#include <sys/protosw.h>
	80	#include <sys/domain.h>
	81	#include <sys/queue.h>
	82	#include <sys/proc.h>
	83	#include <sys/filedesc.h>
	84	#include <sys/file_internal.h>
	85
	86	#include <dev/random/randomdev.h>
	87
	88	#include <kern/kern_types.h>
	89	#include <kern/simple_lock.h>
	90	#include <kern/queue.h>
	91	#include <kern/sched_prim.h>
	92	#include <kern/backtrace.h>
	93	#include <kern/percpu.h>
	94	#include <kern/zalloc.h>
	95
	96	#include <libkern/OSAtomic.h>
	97	#include <libkern/OSDebug.h>
	98	#include <libkern/libkern.h>
	99
	100	#include <os/log.h>
	101	#include <os/ptrtools.h>
	102
	103	#include <IOKit/IOMapper.h>
	104
	105	#include <machine/limits.h>
	106	#include <machine/machine_routines.h>
	107
	108	#include <sys/mcache.h>
	109	#include <net/ntstat.h>
	110
	111	/*
	112	* MBUF IMPLEMENTATION NOTES.
	113	*
	114	* There is a total of 5 per-CPU caches:
	115	*
	116	* MC_MBUF:
	117	* This is a cache of rudimentary objects of MSIZE in size; each
	118	* object represents an mbuf structure. This cache preserves only
	119	* the m_type field of the mbuf during its transactions.
	120	*
	121	* MC_CL:
	122	* This is a cache of rudimentary objects of MCLBYTES in size; each
	123	* object represents a mcluster structure. This cache does not
	124	* preserve the contents of the objects during its transactions.
	125	*
	126	* MC_BIGCL:
	127	* This is a cache of rudimentary objects of MBIGCLBYTES in size; each
	128	* object represents a mbigcluster structure. This cache does not
	129	* preserve the contents of the objects during its transaction.
	130	*
	131	* MC_MBUF_CL:
	132	* This is a cache of mbufs each having a cluster attached to it.
	133	* It is backed by MC_MBUF and MC_CL rudimentary caches. Several
	134	* fields of the mbuf related to the external cluster are preserved
	135	* during transactions.
	136	*
	137	* MC_MBUF_BIGCL:
	138	* This is a cache of mbufs each having a big cluster attached to it.
	139	* It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
	140	* fields of the mbuf related to the external cluster are preserved
	141	* during transactions.
	142	*
	143	* OBJECT ALLOCATION:
	144	*
	145	* Allocation requests are handled first at the per-CPU (mcache) layer
	146	* before falling back to the slab layer. Performance is optimal when
	147	* the request is satisfied at the CPU layer because global data/lock
	148	* never gets accessed. When the slab layer is entered for allocation,
	149	* the slab freelist will be checked first for available objects before
	150	* the VM backing store is invoked. Slab layer operations are serialized
	151	* for all of the caches as the mbuf global lock is held most of the time.
	152	* Allocation paths are different depending on the class of objects:
	153	*
	154	* a. Rudimentary object:
	155	*
	156	*         { m_get_common(), m_clattach(), m_mclget(),
	157	*           m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
	158	*           composite object allocation }
	159	*                         |       ^
	160	*                         |       |
	161	*                         |       +-----------------------+
	162	*                         v                               |
	163	*          mcache_alloc/mcache_alloc_ext()        mbuf_slab_audit()
	164	*                         |                               ^
	165	*                         v                               |
	166	*                    [CPU cache] -------> (found?) -------+
	167	*                         |                               |
	168	*                         v                               |
	169	*                 mbuf_slab_alloc()                       |
	170	*                         |                               |
	171	*                         v                               |
	172	*         +---------> [freelist] -------> (found?) -------+
	173	*         |               |
	174	*         |               v
	175	*         |          m_clalloc()
	176	*         |               |
	177	*         |               v
	178	*         +---<<---- kmem_mb_alloc()
	179	*
	180	* b. Composite object:
	181	*
	182	*         { m_getpackets_internal(), m_allocpacket_internal() }
	183	*                         |       ^
	184	*                         |       |
	185	*                         |       +------ (done) ---------+
	186	*                         v                               |
	187	*          mcache_alloc/mcache_alloc_ext()        mbuf_cslab_audit()
	188	*                         |                               ^
	189	*                         v                               |
	190	*                    [CPU cache] -------> (found?) -------+
	191	*                         |                               |
	192	*                         v                               |
	193	*                 mbuf_cslab_alloc()                      |
	194	*                         |                               |
	195	*                         v                               |
	196	*                     [freelist] -------> (found?) -------+
	197	*                         |                               |
	198	*                         v                               |
	199	*               (rudimentary object)                      |
	200	*          mcache_alloc/mcache_alloc_ext()   ------>>-----+
	201	*
	202	* Auditing notes: If auditing is enabled, buffers will be subjected to
	203	* integrity checks by the audit routine. This is done by verifying their
	204	* contents against DEADBEEF (free) pattern before returning them to caller.
	205	* As part of this step, the routine will also record the transaction and
	206	* pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
	207	* also restore any constructed data structure fields if necessary.
	208	*
	209	* OBJECT DEALLOCATION:
	210	*
	211	* Freeing an object simply involves placing it into the CPU cache; this
	212	* pollutes the cache to benefit subsequent allocations. The slab layer
	213	* will only be entered if the object is to be purged out of the cache.
	214	* During normal operations, this happens only when the CPU layer resizes
	215	* its bucket while it's adjusting to the allocation load. Deallocation
	216	* paths are different depending on the class of objects:
	217	*
	218	* a. Rudimentary object:
	219	*
	220	*         { m_free(), m_freem_list(), composite object deallocation }
	221	*                         |       ^
	222	*                         |       |
	223	*                         |       +------ (done) ---------+
	224	*                         v                               |
	225	*           mcache_free/mcache_free_ext()                 |
	226	*                         |                               |
	227	*                         v                               |
	228	*                 mbuf_slab_audit()                       |
	229	*                         |                               |
	230	*                         v                               |
	231	*                    [CPU cache] ---> (not purging?) -----+
	232	*                         |                               |
	233	*                         v                               |
	234	*                 mbuf_slab_free()                        |
	235	*                         |                               |
	236	*                         v                               |
	237	*                     [freelist] ----------->>------------+
	238	*     (objects get purged to VM only on demand)
	239	*
	240	* b. Composite object:
	241	*
	242	*         { m_free(), m_freem_list() }
	243	*                         |       ^
	244	*                         |       |
	245	*                         |       +------ (done) ---------+
	246	*                         v                               |
	247	*           mcache_free/mcache_free_ext()                 |
	248	*                         |                               |
	249	*                         v                               |
	250	*                 mbuf_cslab_audit()                      |
	251	*                         |                               |
	252	*                         v                               |
	253	*                    [CPU cache] ---> (not purging?) -----+
	254	*                         |                               |
	255	*                         v                               |
	256	*                 mbuf_cslab_free()                       |
	257	*                         |                               |
	258	*                         v                               |
	259	*                     [freelist] ---> (not purging?) -----+
	260	*                         |                               |
	261	*                         v                               |
	262	*               (rudimentary object)                      |
	263	*           mcache_free/mcache_free_ext()  ------->>------+
	264	*
	265	* Auditing notes: If auditing is enabled, the audit routine will save
	266	* any constructed data structure fields (if necessary) before filling the
	267	* contents of the buffers with DEADBEEF (free) pattern and recording the
	268	* transaction. Buffers that are freed (whether at CPU or slab layer) are
	269	* expected to contain the free pattern.
	270	*
	271	* DEBUGGING:
	272	*
	273	* Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
	274	* translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
	275	* the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
	276	* i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
	277	* detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
	278	* "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
	279	*
	280	* Each object is associated with exactly one mcache_audit_t structure that
	281	* contains the information related to its last buffer transaction. Given
	282	* an address of an object, the audit structure can be retrieved by finding
	283	* the position of the object relevant to the base address of the cluster:
	284	*
	285	*      +------------+                  +=============+
	286	*      | mbuf addr  |                  | mclaudit[i] |
	287	*      +------------+                  +=============+
	288	*            |                         | cl_audit[0] |
	289	*      i = MTOBG(addr)                 +-------------+
	290	*            |                 +-----> | cl_audit[1] | -----> mcache_audit_t
	291	*      b = BGTOM(i)            |       +-------------+
	292	*            |                 |       |     ...     |
	293	*      x = MCLIDX(b, addr)     |       +-------------+
	294	*            |                 |       | cl_audit[7] |
	295	*            +-----------------+       +-------------+
	296	*               (e.g. x == 1)
	297	*
	298	* The mclaudit[] array is allocated at initialization time, but its contents
	299	* get populated when the corresponding cluster is created. Because a page
	300	* can be turned into NMBPG number of mbufs, we preserve enough space for the
	301	* mbufs so that there is a 1-to-1 mapping between them. A page that never
	302	* gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
	303	* remaining entries unused. For 16KB cluster, only one entry from the first
	304	* page is allocated and used for the entire object.
	305	*/
	306
	307	/* TODO: should be in header file */
	308	/* kernel translator */
	309	extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
	310	extern vm_map_t mb_map; /* special map */
	311
	312	static uint32_t mb_kmem_contig_failed;
	313	static uint32_t mb_kmem_failed;
	314	static uint32_t mb_kmem_one_failed;
	315	/* Timestamp of allocation failures. */
	316	static uint64_t mb_kmem_contig_failed_ts;
	317	static uint64_t mb_kmem_failed_ts;
	318	static uint64_t mb_kmem_one_failed_ts;
	319	static uint64_t mb_kmem_contig_failed_size;
	320	static uint64_t mb_kmem_failed_size;
	321	static uint32_t mb_kmem_stats[6];
	322	static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
	323	"INVALID_ADDRESS",
	324	"RESOURCE_SHORTAGE",
	325	"NO_SPACE",
	326	"KERN_FAILURE",
	327	"OTHERS" };
	328
	329	/* Global lock */
	330	static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
	331	static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
	332	static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
	333
	334	/* Back-end (common) layer */
	335	static uint64_t mb_expand_cnt;
	336	static uint64_t mb_expand_cl_cnt;
	337	static uint64_t mb_expand_cl_total;
	338	static uint64_t mb_expand_bigcl_cnt;
	339	static uint64_t mb_expand_bigcl_total;
	340	static uint64_t mb_expand_16kcl_cnt;
	341	static uint64_t mb_expand_16kcl_total;
	342	static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
	343	static uint32_t mbuf_worker_run_cnt;
	344	static uint64_t mbuf_worker_last_runtime;
	345	static uint64_t mbuf_drain_last_runtime;
	346	static int mbuf_worker_ready; /* worker thread is runnable */
	347	static unsigned int ncpu; /* number of CPUs */
	348	static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
	349	static ppnum_t mcl_pages; /* Size of array (# physical pages) */
	350	static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
	351	static mcache_t *ref_cache; /* Cache of cluster reference & flags */
	352	static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
	353	static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
	354	static unsigned int mb_normalized; /* number of packets "normalized" */
	355
	356	#define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
	357	#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
	358
	359	typedef enum { /* mbuf object classes; values index mbuf_table[] (see m_class() et al.) */
	360	MC_MBUF = 0, /* Regular mbuf */
	361	MC_CL, /* Cluster */
	362	MC_BIGCL, /* Large (4KB) cluster */
	363	MC_16KCL, /* Jumbo (16KB) cluster; last rudimentary class (MBUF_CLASS_LAST) */
	364	MC_MBUF_CL, /* mbuf + cluster */
	365	MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
	366	MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
	367	} mbuf_class_t; /* classes above MC_16KCL are composite (MBUF_CLASS_COMPOSITE) */
	368
	369	#define MBUF_CLASS_MIN MC_MBUF
	370	#define MBUF_CLASS_MAX MC_MBUF_16KCL
	371	#define MBUF_CLASS_LAST MC_16KCL
	372	#define MBUF_CLASS_VALID(c) \
	373	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
	374	#define MBUF_CLASS_COMPOSITE(c) \
	375	((int)(c) > MBUF_CLASS_LAST)
	376
	377
	378	/*
	379	* mbuf specific mcache allocation request flags.
	380	*/
	381	#define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
	382
	383	/*
	384	* Per-cluster slab structure.
	385	*
	386	* A slab is a cluster control structure that contains one or more object
	387	* chunks; the available chunks are chained in the slab's freelist (sl_head).
	388	* Each time a chunk is taken out of the slab, the slab's reference count
	389	* gets incremented. When all chunks have been taken out, the empty slab
	390	* gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
	391	* returned to a slab causes the slab's reference count to be decremented;
	392	* it also causes the slab to be reinserted back to class's slab list, if
	393	* it's not already done.
	394	*
	395	* Compartmentalizing of the object chunks into slabs allows us to easily
	396	* merge one or more slabs together when the adjacent slabs are idle, as
	397	* well as to convert or move a slab from one class to another; e.g. the
	398	* mbuf cluster slab can be converted to a regular cluster slab when all
	399	* mbufs in the slab have been freed.
	400	*
	401	* A slab may also span across multiple clusters for chunks larger than
	402	* a cluster's size. In this case, only the slab of the first cluster is
	403	* used. The rest of the slabs are marked with SLF_PARTIAL to indicate
	404	* that they are part of the larger slab.
	405	*
	406	* Each slab controls a page of memory.
	407	*/
	408	typedef struct mcl_slab {
	409	struct mcl_slab *sl_next; /* neighboring slab */
	410	u_int8_t sl_class; /* controlling mbuf class */
	411	int8_t sl_refcnt; /* outstanding allocations */
	412	int8_t sl_chunks; /* chunks (bufs) in this slab */
	413	u_int16_t sl_flags; /* slab flags (see below) */
	414	u_int16_t sl_len; /* slab length */
	415	void *sl_base; /* base of allocated memory */
	416	void *sl_head; /* first free buffer */
	417	TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
	418	} mcl_slab_t;
	419
	420	#define SLF_MAPPED 0x0001 /* backed by a mapped page */
	421	#define SLF_PARTIAL 0x0002 /* part of another slab */
	422	#define SLF_DETACHED 0x0004 /* not in slab freelist */
	423
	424	/*
	425	* The array of slabs are broken into groups of arrays per 1MB of kernel
	426	* memory to reduce the footprint. Each group is allocated on demand
	427	* whenever a new piece of memory mapped in from the VM crosses the 1MB
	428	* boundary.
	429	*/
	430	#define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
	431
	432	typedef struct mcl_slabg {
	433	mcl_slab_t *slg_slab; /* group of slabs */
	434	} mcl_slabg_t;
	435
	436	/*
	437	* Number of slabs needed to control a 16KB cluster object.
	438	*/
	439	#define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
	440
	441	/*
	442	* Per-cluster audit structure.
	443	*/
	444	typedef struct {
	445	mcache_audit_t **cl_audit; /* array of audits */
	446	} mcl_audit_t;
	447
	448	typedef struct {
	449	struct thread *msa_thread; /* thread doing transaction */
	450	struct thread *msa_pthread; /* previous transaction thread */
	451	uint32_t msa_tstamp; /* transaction timestamp (ms) */
	452	uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
	453	uint16_t msa_depth; /* pc stack depth */
	454	uint16_t msa_pdepth; /* previous transaction pc stack */
	455	void *msa_stack[MCACHE_STACK_DEPTH];
	456	void *msa_pstack[MCACHE_STACK_DEPTH];
	457	} mcl_scratch_audit_t;
	458
	459	typedef struct {
	460	/*
	461	* Size of data from the beginning of an mbuf that covers m_hdr,
	462	* pkthdr and m_ext structures. If auditing is enabled, we allocate
	463	* a shadow mbuf structure of this size inside each audit structure,
	464	* and the contents of the real mbuf gets copied into it when the mbuf
	465	* is freed. This allows us to pattern-fill the mbuf for integrity
	466	* check, and to preserve any constructed mbuf fields (e.g. mbuf +
	467	* cluster cache case). Note that we don't save the contents of
	468	* clusters when they are freed; we simply pattern-fill them.
	469	*/
	470	u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof(_m_ext_t)];
	471	mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
	472	} mcl_saved_contents_t;
	473
	474	#define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
	475
	476	#define MCA_SAVED_MBUF_PTR(_mca) \
	477	((struct mbuf *)(void *)((mcl_saved_contents_t *) \
	478	(_mca)->mca_contents)->sc_mbuf)
	479	#define MCA_SAVED_MBUF_SIZE \
	480	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
	481	#define MCA_SAVED_SCRATCH_PTR(_mca) \
	482	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
	483
	484	/*
	485	* mbuf specific mcache audit flags
	486	*/
	487	#define MB_INUSE 0x01 /* object has not been returned to slab */
	488	#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
	489	#define MB_SCVALID 0x04 /* object has valid saved contents */
	490
	491	/*
	492	* Each of the following two arrays hold up to nmbclusters elements.
	493	*/
	494	static mcl_audit_t *mclaudit; /* array of cluster audit information */
	495	static unsigned int maxclaudit; /* max # of entries in audit table */
	496	static mcl_slabg_t **slabstbl; /* cluster slabs table */
	497	static unsigned int maxslabgrp; /* max # of entries in slabs table */
	498	static unsigned int slabgrp; /* # of entries in slabs table */
	499
	500	/* Globals */
	501	int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
	502	int njcl; /* # of clusters for jumbo sizes */
	503	int njclbytes; /* size of a jumbo cluster */
	504	unsigned char *mbutl; /* first mapped cluster address */
	505	unsigned char *embutl; /* ending virtual address of mclusters */
	506	int _max_linkhdr; /* largest link-level header */
	507	int _max_protohdr; /* largest protocol header */
	508	int max_hdr; /* largest link+protocol header */
	509	int max_datalen; /* MHLEN - max_hdr */
	510
	511	static boolean_t mclverify; /* debug: pattern-checking */
	512	static boolean_t mcltrace; /* debug: stack tracing */
	513	static boolean_t mclfindleak; /* debug: leak detection */
	514	static boolean_t mclexpleak; /* debug: expose leak info to user space */
	515
	516	static struct timeval mb_start; /* beginning of time */
	517
	518	/* mbuf leak detection variables */
	519	static struct mleak_table mleak_table;
	520	static mleak_stat_t *mleak_stat;
	521
	522	#define MLEAK_STAT_SIZE(n) \
	523	__builtin_offsetof(mleak_stat_t, ml_trace[n])
	524
	525	struct mallocation {
	526	mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
	527	u_int32_t trace_index; /* mtrace index for corresponding backtrace */
	528	u_int32_t count; /* How many objects were requested */
	529	u_int64_t hitcount; /* for determining hash effectiveness */
	530	};
	531
	532	struct mtrace {
	533	u_int64_t collisions;
	534	u_int64_t hitcount;
	535	u_int64_t allocs;
	536	u_int64_t depth;
	537	uintptr_t addr[MLEAK_STACK_DEPTH];
	538	};
	539
	540	/* Size must be a power of two for the zhash to be able to just mask off bits */
	541	#define MLEAK_ALLOCATION_MAP_NUM 512
	542	#define MLEAK_TRACE_MAP_NUM 256
	543
	544	/*
	545	* Sample factor for how often to record a trace. This is overwritable
	546	* by the boot-arg mleak_sample_factor.
	547	*/
	548	#define MLEAK_SAMPLE_FACTOR 500
	549
	550	/*
	551	* Number of top leakers recorded.
	552	*/
	553	#define MLEAK_NUM_TRACES 5
	554
	555	#define MB_LEAK_SPACING_64 " "
	556	#define MB_LEAK_SPACING_32 " "
	557
	558
	559	#define MB_LEAK_HDR_32 "\n\
	560	trace [1] trace [2] trace [3] trace [4] trace [5] \n\
	561	---------- ---------- ---------- ---------- ---------- \n\
	562	"
	563
	564	#define MB_LEAK_HDR_64 "\n\
	565	trace [1] trace [2] trace [3] \
	566	trace [4] trace [5] \n\
	567	------------------ ------------------ ------------------ \
	568	------------------ ------------------ \n\
	569	"
	570
	571	static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
	572	static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
	573
	574	/* Hashmaps of allocations and their corresponding traces */
	575	static struct mallocation *mleak_allocations;
	576	static struct mtrace *mleak_traces;
	577	static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
	578
	579	/* Lock to protect mleak tables from concurrent modification */
	580	static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
	581	static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
	582	static lck_mtx_t *const mleak_lock = &mleak_lock_data;
	583
	584	/* Failed large allocations. */
	585	struct mtracelarge {
	586	uint64_t size;
	587	uint64_t depth;
	588	uintptr_t addr[MLEAK_STACK_DEPTH];
	589	};
	590
	591	#define MTRACELARGE_NUM_TRACES 5
	592	static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
	593
	594	static void mtracelarge_register(size_t size);
	595
	596	/* Lock to protect the completion callback table */
	597	static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
	598	LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
	599
	600	extern u_int32_t high_sb_max;
	601
	602	/* The minimum number of objects that are allocated, to start. */
	603	#define MINCL 32
	604	#define MINBIGCL (MINCL >> 1)
	605	#define MIN16KCL (MINCL >> 2)
	606
	607	/* Low watermarks (only map in pages once free counts go below) */
	608	#define MBIGCL_LOWAT MINBIGCL
	609	#define M16KCL_LOWAT MIN16KCL
	610
	611	typedef struct {
	612	mbuf_class_t mtbl_class; /* class type */
	613	mcache_t *mtbl_cache; /* mcache for this buffer class */
	614	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	615	mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
	616	mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
	617	u_int32_t mtbl_maxsize; /* maximum buffer size */
	618	int mtbl_minlimit; /* minimum allowed */
	619	int mtbl_maxlimit; /* maximum allowed */
	620	u_int32_t mtbl_wantpurge; /* purge during next reclaim */
	621	uint32_t mtbl_avgtotal; /* average total on iOS */
	622	u_int32_t mtbl_expand; /* worker should expand the class */
	623	} mbuf_table_t;
	624
	625	#define m_class(c) mbuf_table[c].mtbl_class
	626	#define m_cache(c) mbuf_table[c].mtbl_cache
	627	#define m_slablist(c) mbuf_table[c].mtbl_slablist
	628	#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
	629	#define m_maxsize(c) mbuf_table[c].mtbl_maxsize
	630	#define m_minlimit(c) mbuf_table[c].mtbl_minlimit
	631	#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
	632	#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
	633	#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
	634	#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
	635	#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
	636	#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
	637	#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
	638	#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
	639	#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
	640	#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
	641	#define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
	642	#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
	643	#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
	644	#define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
	645	#define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
	646	#define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
	647	#define m_region_expand(c) mbuf_table[c].mtbl_expand
	648
	649	static mbuf_table_t mbuf_table[] = {
	650	/*
	651	* The caches for mbufs, regular clusters and big clusters.
	652	* The average total values were based on data gathered by actual
	653	* usage patterns on iOS.
	654	*/
	655	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	656	NULL, NULL, 0, 0, 0, 0, 3000, 0 },
	657	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	658	NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	659	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	660	NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	661	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	662	NULL, NULL, 0, 0, 0, 0, 200, 0 },
	663	/*
	664	* The following are special caches; they serve as intermediate
	665	* caches backed by the above rudimentary caches. Each object
	666	* in the cache is an mbuf with a cluster attached to it. Unlike
	667	* the above caches, these intermediate caches do not directly
	668	* deal with the slab structures; instead, the constructed
	669	* cached elements are simply stored in the freelists.
	670	*/
	671	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	672	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	673	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
	674	};
	675
	676	#define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
	677
	678
	679	static uint32_t
	680	m_avgtotal(mbuf_class_t c)
	681	{ /* Fetch the per-class average total recorded in mbuf_table[] (iOS usage data). */
	682	return (mbuf_table + c)->mtbl_avgtotal;
	683	}
	684
	685	static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
	686	static int mb_waiters; /* number of waiters */
	687
	688	boolean_t mb_peak_newreport = FALSE;
	689	boolean_t mb_peak_firstreport = FALSE;
	690
	691	/* generate a report by default after 1 week of uptime */
	692	#define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
	693
	694	#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
	695	static struct timeval mb_wdtstart; /* watchdog start timestamp */
	696	static char *mbuf_dump_buf;
	697
	698	#define MBUF_DUMP_BUF_SIZE 4096
	699
	700	/*
	701	* mbuf watchdog is enabled by default, except on release (non-DEVELOPMENT,
	702	* non-DEBUG) macOS kernels. It is also toggleable via the kern.ipc.mb_watchdog sysctl.
	703	* Garbage collection is enabled by default on embedded platforms.
	704	* mb_drain_maxint controls the amount of time to wait (in seconds) before
	705	* consecutive calls to mbuf_drain().
	706	*/
	707	#if !XNU_TARGET_OS_OSX || DEVELOPMENT || DEBUG
	708	static unsigned int mb_watchdog = 1;
	709	#else /* XNU_TARGET_OS_OSX && !DEVELOPMENT && !DEBUG */
	710	static unsigned int mb_watchdog = 0;
	711	#endif /* XNU_TARGET_OS_OSX && !DEVELOPMENT && !DEBUG */
	712	#if !XNU_TARGET_OS_OSX
	713	static unsigned int mb_drain_maxint = 60;
	714	#else /* XNU_TARGET_OS_OSX */
	715	static unsigned int mb_drain_maxint = 0;
	716	#endif /* XNU_TARGET_OS_OSX */
	717	static unsigned int mb_memory_pressure_percentage = 80;
	718
	719	uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
	720	uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
	721
	722	/* Red zone */
	723	static u_int32_t mb_redzone_cookie;
	724	static void m_redzone_init(struct mbuf *);
	725	static void m_redzone_verify(struct mbuf *m);
	726
	727	/* The following are used to serialize m_clalloc() */
	728	static boolean_t mb_clalloc_busy;
	729	static void *mb_clalloc_waitchan = &mb_clalloc_busy;
	730	static int mb_clalloc_waiters;
	731
	732	static void mbuf_mtypes_sync(boolean_t);
	733	static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
	734	static void mbuf_stat_sync(void);
	735	static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
	736	static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
	737	static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
	738	static char *mbuf_dump(void);
	739	static void mbuf_table_init(void);
	740	static inline void m_incref(struct mbuf *);
	741	static inline u_int16_t m_decref(struct mbuf *);
	742	static int m_clalloc(const u_int32_t, const int, const u_int32_t);
	743	static void mbuf_worker_thread_init(void);
	744	static mcache_obj_t *slab_alloc(mbuf_class_t, int);
	745	static void slab_free(mbuf_class_t, mcache_obj_t *);
	746	static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
	747	unsigned int, int);
	748	static void mbuf_slab_free(void *, mcache_obj_t *, int);
	749	static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
	750	static void mbuf_slab_notify(void *, u_int32_t);
	751	static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
	752	unsigned int);
	753	static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
	754	static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
	755	unsigned int, int);
	756	static void mbuf_cslab_free(void *, mcache_obj_t *, int);
	757	static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
	758	static int freelist_populate(mbuf_class_t, unsigned int, int);
	759	static void freelist_init(mbuf_class_t);
	760	static boolean_t mbuf_cached_above(mbuf_class_t, int);
	761	static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
	762	static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
	763	static int m_howmany(int, size_t);
	764	static void mbuf_worker_thread(void);
	765	static void mbuf_watchdog(void);
	766	static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
	767
	768	static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
	769	size_t, unsigned int);
	770	static void mcl_audit_free(void *, unsigned int);
	771	static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
	772	static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
	773	static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
	774	boolean_t);
	775	static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
	776	static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
	777	static void mcl_audit_scratch(mcache_audit_t *);
	778	static void mcl_audit_mcheck_panic(struct mbuf *);
	779	static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
	780
	781	static void mleak_activate(void);
	782	static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
	783	static boolean_t mleak_log(uintptr_t , mcache_obj_t , uint32_t, int);
	784	static void mleak_free(mcache_obj_t *);
	785	static void mleak_sort_traces(void);
	786	static void mleak_update_stats(void);
	787
	788	static mcl_slab_t slab_get(void );
	789	static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
	790	void , void , unsigned int, int, int);
	791	static void slab_insert(mcl_slab_t *, mbuf_class_t);
	792	static void slab_remove(mcl_slab_t *, mbuf_class_t);
	793	static boolean_t slab_inrange(mcl_slab_t , void );
	794	static void slab_nextptr_panic(mcl_slab_t , void );
	795	static void slab_detach(mcl_slab_t *);
	796	static boolean_t slab_is_detached(mcl_slab_t *);
	797
	798	static int m_copyback0(struct mbuf *, int, int, const void , int, int);
	799	static struct mbuf m_split0(struct mbuf , int, int, int);
	800	__private_extern__ void mbuf_report_peak_usage(void);
	801	static boolean_t mbuf_report_usage(mbuf_class_t);
	/*
	 * Watchdog logging hook.  When DEBUG or DEVELOPMENT is set,
	 * mbwdog_logger() forwards the caller's function name and line number,
	 * followed by a printf-style format and its arguments, to
	 * _mbwdog_logger().  Otherwise it expands to an empty statement, so call
	 * sites need no conditional compilation of their own.
	 */
	#if DEBUG || DEVELOPMENT
	#define mbwdog_logger(fmt, ...)  _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
	static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
	static char *mbwdog_logging;
	const unsigned mbwdog_logging_size = 4096;
	static size_t mbwdog_logging_used;
	#else
	#define mbwdog_logger(fmt, ...)  do { } while (0)
	#endif
	811	static void mbuf_drain_locked(boolean_t);
	812
	813	/* flags for m_copyback0 */
	814	#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
	815	#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
	816	#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
	817	#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
	818
	819	/*
	820	* This flag is set for all mbufs that come out of and into the composite
	821	* mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
	822	* are marked with such a flag have clusters attached to them, and will be
	823	* treated differently when they are freed; instead of being placed back
	824	* into the mbuf and cluster freelists, the composite mbuf + cluster objects
	825	* are placed back into the appropriate composite cache's freelist, and the
	826	* actual freeing is deferred until the composite objects are purged. At
	827	* such a time, this flag will be cleared from the mbufs and the objects
	828	* will be freed into their own separate freelists.
	829	*/
	830	#define EXTF_COMPOSITE 0x1
	831
	832	/*
	833	* This flag indicates that the external cluster is read-only, i.e. it is
	834	* or was referred to by more than one mbuf. Once set, this flag is never
	835	* cleared.
	836	*/
	837	#define EXTF_READONLY 0x2
	838	/*
	839	* This flag indicates that the external cluster is paired with the mbuf.
	840	* Pairing implies an external free routine defined which will be invoked
	841	* when the reference count drops to the minimum at m_free time. This
	842	* flag is never cleared.
	843	*/
	844	#define EXTF_PAIRED 0x4
	845
	/* Union of the three EXTF_* flag bits: composite, read-only and paired. */
	#define EXTF_MASK \
		(EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
	848
	849	#define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
	850	#define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
	851	#define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
	852	#define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
	853	#define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
	854	#define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
	855	#define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
	856	#define MBUF_IS_COMPOSITE(m) \
	857	(MEXT_REF(m) == MEXT_MINREF(m) && \
	858	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
	859	/*
	860	* This macro can be used to test if the mbuf is paired to an external
	861	* cluster. The test for MEXT_PMBUF being equal to the mbuf in subject
	862	* is important, as EXTF_PAIRED alone is insufficient since it is immutable,
	863	* and thus survives calls to m_free_paired.
	864	*/
	865	#define MBUF_IS_PAIRED(m) \
	866	(((m)->m_flags & M_EXT) && \
	867	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
	868	MEXT_PMBUF(m) == (m))
	869
	870	/*
	871	* Macros used to verify the integrity of the mbuf.
	872	*/
	873	#define _MCHECK(m) { \
	874	if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
	875	if (mclaudit == NULL) \
	876	panic("MCHECK: m_type=%d m=%p", \
	877	(u_int16_t)(m)->m_type, m); \
	878	else \
	879	mcl_audit_mcheck_panic(m); \
	880	} \
	881	}
	882
	883	#define MBUF_IN_MAP(addr) \
	884	((unsigned char *)(addr) >= mbutl && \
	885	(unsigned char *)(addr) < embutl)
	886
	887	#define MRANGE(addr) { \
	888	if (!MBUF_IN_MAP(addr)) \
	889	panic("MRANGE: address out of range 0x%p", addr); \
	890	}
	891
	892	/*
	893	* Macro version of mtod.
	894	*/
	895	#define MTOD(m, t) ((t)((m)->m_data))
	896
	/*
	 * Macros to convert between an address and a page index, both relative
	 * to mbutl.
	 *
	 * MTOPG(x) yields the index of the page containing address x; PGTOM(x)
	 * yields the address at which page index x starts.  The argument is fully
	 * parenthesized so that a compound expression (e.g. "p + n" with a
	 * non-byte pointer, or "i | 1") is converted as a whole rather than having
	 * the cast or shift bind to only part of it.
	 */
	#define MTOPG(x)        (((unsigned char *)(x) - mbutl) >> PAGE_SHIFT)
	#define PGTOM(x)        (mbutl + ((x) << PAGE_SHIFT))
	902
	/*
	 * Macro to find the mbuf index relative to a base: the byte distance
	 * from c to m, divided by the mbuf size (1 << MSIZESHIFT).
	 */
	#define MBPAGEIDX(c, m) \
		(((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)

	/*
	 * Same thing for 2KB cluster index.
	 */
	#define CLPAGEIDX(c, m) \
		(((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)

	/*
	 * Macro to find 4KB cluster index relative to a base
	 */
	#define BCLPAGEIDX(c, m) \
		(((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
	920
	921	/*
	922	* Macros used during mbuf and cluster initialization.
	923	*/
	924	#define MBUF_INIT_PKTHDR(m) { \
	925	(m)->m_pkthdr.rcvif = NULL; \
	926	(m)->m_pkthdr.pkt_hdr = NULL; \
	927	(m)->m_pkthdr.len = 0; \
	928	(m)->m_pkthdr.csum_flags = 0; \
	929	(m)->m_pkthdr.csum_data = 0; \
	930	(m)->m_pkthdr.vlan_tag = 0; \
	931	(m)->m_pkthdr.comp_gencnt = 0; \
	932	m_classifier_init(m, 0); \
	933	m_tag_init(m, 1); \
	934	m_scratch_init(m); \
	935	m_redzone_init(m); \
	936	}
	937
	938	#define MBUF_INIT(m, pkthdr, type) { \
	939	_MCHECK(m); \
	940	(m)->m_next = (m)->m_nextpkt = NULL; \
	941	(m)->m_len = 0; \
	942	(m)->m_type = type; \
	943	if ((pkthdr) == 0) { \
	944	(m)->m_data = (m)->m_dat; \
	945	(m)->m_flags = 0; \
	946	} else { \
	947	(m)->m_data = (m)->m_pktdat; \
	948	(m)->m_flags = M_PKTHDR; \
	949	MBUF_INIT_PKTHDR(m); \
	950	} \
	951	}
	952
	/*
	 * Attach an external buffer to mbuf m.
	 *
	 * Points both m_data and m_ext.ext_buf at buf, sets M_EXT in m_flags
	 * (other flag bits are preserved), records the free routine, its argument
	 * and the ref structure through m_set_ext(), stores the buffer size, and
	 * then seeds the ref structure's minref/refcnt/prefcnt/flags/priv/paired
	 * fields from min/ref/pref/flag/priv/pm.  m_set_ext() must run before the
	 * MEXT_*() assignments because those go through m_get_rfa(m).
	 */
	#define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag, \
		    priv, pm) { \
		(m)->m_data = (m)->m_ext.ext_buf = (buf); \
		(m)->m_flags |= M_EXT; \
		m_set_ext((m), (rfa), (free), (arg)); \
		(m)->m_ext.ext_size = (size); \
		MEXT_MINREF(m) = (min); \
		MEXT_REF(m) = (ref); \
		MEXT_PREF(m) = (pref); \
		MEXT_FLAGS(m) = (flag); \
		MEXT_PRIV(m) = (priv); \
		MEXT_PMBUF(m) = (pm); \
	}
	966
	967	#define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
	968	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
	969	ref, 0, flag, 0, NULL)
	970
	971	#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
	972	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
	973	ref, 0, flag, 0, NULL)
	974
	975	#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
	976	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
	977	ref, 0, flag, 0, NULL)
	978
	979	/*
	980	* Macro to convert BSD malloc sleep flag to mcache's
	981	*/
	982	#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
	983
	984	/*
	985	* The structure that holds all mbuf class statistics exportable via sysctl.
	986	* Similar to mbstat structure, the mb_stat structure is protected by the
	987	* global mbuf lock. It contains additional information about the classes
	988	* that allows for a more accurate view of the state of the allocator.
	989	*/
	struct mb_stat *mb_stat;        /* per-class statistics exported via sysctl */
	struct omb_stat *omb_stat;      /* For backwards compatibility */
	992
	993	#define MB_STAT_SIZE(n) \
	994	__builtin_offsetof(mb_stat_t, mbs_class[n])
	995	#define OMB_STAT_SIZE(n) \
	996	__builtin_offsetof(struct omb_stat, mbs_class[n])
	997
	998	/*
	999	* The legacy structure holding all of the mbuf allocation statistics.
	1000	* The actual statistics used by the kernel are stored in the mbuf_table
	1001	* instead, and are updated atomically while the global mbuf lock is held.
	1002	* They are mirrored in mbstat to support legacy applications (e.g. netstat).
	1003	* Unlike before, the kernel no longer relies on the contents of mbstat for
	1004	* its operations (e.g. cluster expansion) because the structure is exposed
	1005	* to outside and could possibly be modified, therefore making it unsafe.
	1006	* With the exception of the mbstat.m_mtypes array (see below), all of the
	1007	* statistics are updated as they change.
	1008	*/
	1009	struct mbstat mbstat;
	1010
	1011	#define MBSTAT_MTYPES_MAX \
	1012	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
	1013
	1014	/*
	1015	* Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
	1016	* atomically and stored in a per-CPU structure which is lock-free; this is
	1017	* done in order to avoid writing to the global mbstat data structure which
	1018	* would cause false sharing. During sysctl request for kern.ipc.mbstat,
	1019	* the statistics across all CPUs will be converged into the mbstat.m_mtypes
	1020	* array and returned to the application. Any updates for types greater than
	1021	* or equal to MT_MAX are done atomically on mbstat itself; this slows down
	1022	* performance but is okay since the kernel uses only up to MT_MAX-1 while
	1023	* anything beyond that (up to type 255) is considered a corner case.
	1024	*/
	1025	typedef struct {
	1026	unsigned int cpu_mtypes[MT_MAX];
	1027	} mbuf_mtypes_t;
	1028
	1029	static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
	1030
	1031	#define mtype_stat_add(type, n) { \
	1032	if ((unsigned)(type) < MT_MAX) { \
	1033	mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \
	1034	atomic_add_32(&mbs->cpu_mtypes[type], n); \
	1035	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
	1036	atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
	1037	} \
	1038	}
	1039
	1040	#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
	1041	#define mtype_stat_inc(t) mtype_stat_add(t, 1)
	1042	#define mtype_stat_dec(t) mtype_stat_sub(t, 1)
	1043
	1044	static void
	1045	mbuf_mtypes_sync(boolean_t locked)
	1046	{
	1047	mbuf_mtypes_t mtc;
	1048
	1049	if (locked) {
	1050	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	1051	}
	1052
	1053	mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
	1054	percpu_foreach_secondary(mtype, mbuf_mtypes) {
	1055	for (int n = 0; n < MT_MAX; n++) {
	1056	mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
	1057	}
	1058	}
	1059
	1060	if (!locked) {
	1061	lck_mtx_lock(mbuf_mlock);
	1062	}
	1063	for (int n = 0; n < MT_MAX; n++) {
	1064	mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	1065	}
	1066	if (!locked) {
	1067	lck_mtx_unlock(mbuf_mlock);
	1068	}
	1069	}
	1070
	1071	static int
	1072	mbstat_sysctl SYSCTL_HANDLER_ARGS
	1073	{
	1074	#pragma unused(oidp, arg1, arg2)
	1075	mbuf_mtypes_sync(FALSE);
	1076
	1077	return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
	1078	}
	1079
	1080	static void
	1081	mbuf_stat_sync(void)
	1082	{
	1083	mb_class_stat_t *sp;
	1084	mcache_cpu_t *ccp;
	1085	mcache_t *cp;
	1086	int k, m, bktsize;
	1087
	1088	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	1089
	1090	for (k = 0; k < NELEM(mbuf_table); k++) {
	1091	cp = m_cache(k);
	1092	ccp = &cp->mc_cpu[0];
	1093	bktsize = ccp->cc_bktsize;
	1094	sp = mbuf_table[k].mtbl_stats;
	1095
	1096	if (cp->mc_flags & MCF_NOCPUCACHE) {
	1097	sp->mbcl_mc_state = MCS_DISABLED;
	1098	} else if (cp->mc_purge_cnt > 0) {
	1099	sp->mbcl_mc_state = MCS_PURGING;
	1100	} else if (bktsize == 0) {
	1101	sp->mbcl_mc_state = MCS_OFFLINE;
	1102	} else {
	1103	sp->mbcl_mc_state = MCS_ONLINE;
	1104	}
	1105
	1106	sp->mbcl_mc_cached = 0;
	1107	for (m = 0; m < ncpu; m++) {
	1108	ccp = &cp->mc_cpu[m];
	1109	if (ccp->cc_objs > 0) {
	1110	sp->mbcl_mc_cached += ccp->cc_objs;
	1111	}
	1112	if (ccp->cc_pobjs > 0) {
	1113	sp->mbcl_mc_cached += ccp->cc_pobjs;
	1114	}
	1115	}
	1116	sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
	1117	sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
	1118	sp->mbcl_infree;
	1119
	1120	sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
	1121	sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
	1122	sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
	1123
	1124	/* Calculate total count specific to each class */
	1125	sp->mbcl_ctotal = sp->mbcl_total;
	1126	switch (m_class(k)) {
	1127	case MC_MBUF:
	1128	/* Deduct mbufs used in composite caches */
	1129	sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
	1130	m_total(MC_MBUF_BIGCL));
	1131	break;
	1132
	1133	case MC_CL:
	1134	/* Deduct clusters used in composite cache */
	1135	sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
	1136	break;
	1137
	1138	case MC_BIGCL:
	1139	/* Deduct clusters used in composite cache */
	1140	sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
	1141	break;
	1142
	1143	case MC_16KCL:
	1144	/* Deduct clusters used in composite cache */
	1145	sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
	1146	break;
	1147
	1148	default:
	1149	break;
	1150	}
	1151	}
	1152	}
	1153
	1154	static int
	1155	mb_stat_sysctl SYSCTL_HANDLER_ARGS
	1156	{
	1157	#pragma unused(oidp, arg1, arg2)
	1158	void *statp;
	1159	int k, statsz, proc64 = proc_is64bit(req->p);
	1160
	1161	lck_mtx_lock(mbuf_mlock);
	1162	mbuf_stat_sync();
	1163
	1164	if (!proc64) {
	1165	struct omb_class_stat *oc;
	1166	struct mb_class_stat *c;
	1167
	1168	omb_stat->mbs_cnt = mb_stat->mbs_cnt;
	1169	oc = &omb_stat->mbs_class[0];
	1170	c = &mb_stat->mbs_class[0];
	1171	for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
	1172	(void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
	1173	"%s", c->mbcl_cname);
	1174	oc->mbcl_size = c->mbcl_size;
	1175	oc->mbcl_total = c->mbcl_total;
	1176	oc->mbcl_active = c->mbcl_active;
	1177	oc->mbcl_infree = c->mbcl_infree;
	1178	oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
	1179	oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
	1180	oc->mbcl_free_cnt = c->mbcl_free_cnt;
	1181	oc->mbcl_notified = c->mbcl_notified;
	1182	oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
	1183	oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
	1184	oc->mbcl_ctotal = c->mbcl_ctotal;
	1185	oc->mbcl_release_cnt = c->mbcl_release_cnt;
	1186	oc->mbcl_mc_state = c->mbcl_mc_state;
	1187	oc->mbcl_mc_cached = c->mbcl_mc_cached;
	1188	oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
	1189	oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
	1190	oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
	1191	}
	1192	statp = omb_stat;
	1193	statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	1194	} else {
	1195	statp = mb_stat;
	1196	statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	1197	}
	1198
	1199	lck_mtx_unlock(mbuf_mlock);
	1200
	1201	return SYSCTL_OUT(req, statp, statsz);
	1202	}
	1203
	1204	static int
	1205	mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
	1206	{
	1207	#pragma unused(oidp, arg1, arg2)
	1208	int i;
	1209
	1210	/* Ensure leak tracing turned on */
	1211	if (!mclfindleak \|\| !mclexpleak) {
	1212	return ENXIO;
	1213	}
	1214
	1215	lck_mtx_lock(mleak_lock);
	1216	mleak_update_stats();
	1217	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	1218	lck_mtx_unlock(mleak_lock);
	1219
	1220	return i;
	1221	}
	1222
	1223	static int
	1224	mleak_table_sysctl SYSCTL_HANDLER_ARGS
	1225	{
	1226	#pragma unused(oidp, arg1, arg2)
	1227	int i = 0;
	1228
	1229	/* Ensure leak tracing turned on */
	1230	if (!mclfindleak \|\| !mclexpleak) {
	1231	return ENXIO;
	1232	}
	1233
	1234	lck_mtx_lock(mleak_lock);
	1235	i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
	1236	lck_mtx_unlock(mleak_lock);
	1237
	1238	return i;
	1239	}
	1240
	1241	static inline void
	1242	m_incref(struct mbuf *m)
	1243	{
	1244	UInt16 old, new;
	1245	volatile UInt16 addr = (volatile UInt16 )&MEXT_REF(m);
	1246
	1247	do {
	1248	old = *addr;
	1249	new = old + 1;
	1250	VERIFY(new != 0);
	1251	} while (!OSCompareAndSwap16(old, new, addr));
	1252
	1253	/*
	1254	* If cluster is shared, mark it with (sticky) EXTF_READONLY;
	1255	* we don't clear the flag when the refcount goes back to the
	1256	* minimum, to simplify code calling m_mclhasreference().
	1257	*/
	1258	if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
	1259	(void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
	1260	}
	1261	}
	1262
	1263	static inline u_int16_t
	1264	m_decref(struct mbuf *m)
	1265	{
	1266	UInt16 old, new;
	1267	volatile UInt16 addr = (volatile UInt16 )&MEXT_REF(m);
	1268
	1269	do {
	1270	old = *addr;
	1271	new = old - 1;
	1272	VERIFY(old != 0);
	1273	} while (!OSCompareAndSwap16(old, new, addr));
	1274
	1275	return new;
	1276	}
	1277
	1278	static void
	1279	mbuf_table_init(void)
	1280	{
	1281	unsigned int b, c, s;
	1282	int m, config_mbuf_jumbo = 0;
	1283
	1284	omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
	1285	ZALIGN(struct omb_stat));
	1286
	1287	mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
	1288	ZALIGN(mb_stat_t));
	1289
	1290	mb_stat->mbs_cnt = NELEM(mbuf_table);
	1291	for (m = 0; m < NELEM(mbuf_table); m++) {
	1292	mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	1293	}
	1294
	1295	#if CONFIG_MBUF_JUMBO
	1296	config_mbuf_jumbo = 1;
	1297	#endif /* CONFIG_MBUF_JUMBO */
	1298
	1299	if (config_mbuf_jumbo == 1 \|\| PAGE_SIZE == M16KCLBYTES) {
	1300	/*
	1301	* Set aside 1/3 of the mbuf cluster map for jumbo
	1302	* clusters; we do this only on platforms where jumbo
	1303	* cluster pool is enabled.
	1304	*/
	1305	njcl = nmbclusters / 3;
	1306	njclbytes = M16KCLBYTES;
	1307	}
	1308
	1309	/*
	1310	* nclusters holds both the 2KB and 4KB pools, so ensure it's
	1311	* a multiple of 4KB clusters.
	1312	*/
	1313	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	1314	if (njcl > 0) {
	1315	/*
	1316	* Each jumbo cluster takes 8 2KB clusters, so make
	1317	* sure that the pool size is evenly divisible by 8;
	1318	* njcl is in 2KB unit, hence treated as such.
	1319	*/
	1320	njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
	1321
	1322	/* Update nclusters with rounded down value of njcl */
	1323	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	1324	}
	1325
	1326	/*
	1327	* njcl is valid only on platforms with 16KB jumbo clusters or
	1328	* with 16KB pages, where it is configured to 1/3 of the pool
	1329	* size. On these platforms, the remaining is used for 2KB
	1330	* and 4KB clusters. On platforms without 16KB jumbo clusters,
	1331	* the entire pool is used for both 2KB and 4KB clusters. A 4KB
	1332	* cluster can either be splitted into 16 mbufs, or into 2 2KB
	1333	* clusters.
	1334	*
	1335	* +---+---+------------ ... -----------+------- ... -------+
	1336	* \| c \| b \| s \| njcl \|
	1337	* +---+---+------------ ... -----------+------- ... -------+
	1338	*
	1339	* 1/32th of the shared region is reserved for pure 2KB and 4KB
	1340	* clusters (1/64th each.)
	1341	*/
	1342	c = P2ROUNDDOWN((nclusters >> 6), NCLPG); /* in 2KB unit */
	1343	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
	1344	s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
	1345
	1346	/*
	1347	* 1/64th (c) is reserved for 2KB clusters.
	1348	*/
	1349	m_minlimit(MC_CL) = c;
	1350	m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
	1351	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	1352	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
	1353
	1354	/*
	1355	* Another 1/64th (b) of the map is reserved for 4KB clusters.
	1356	* It cannot be turned into 2KB clusters or mbufs.
	1357	*/
	1358	m_minlimit(MC_BIGCL) = b;
	1359	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
	1360	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	1361	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
	1362
	1363	/*
	1364	* The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	1365	*/
	1366	m_minlimit(MC_MBUF) = 0;
	1367	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
	1368	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	1369	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
	1370
	1371	/*
	1372	* Set limits for the composite classes.
	1373	*/
	1374	m_minlimit(MC_MBUF_CL) = 0;
	1375	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	1376	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	1377	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	1378	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
	1379
	1380	m_minlimit(MC_MBUF_BIGCL) = 0;
	1381	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	1382	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	1383	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	1384	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
	1385
	1386	/*
	1387	* And for jumbo classes.
	1388	*/
	1389	m_minlimit(MC_16KCL) = 0;
	1390	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
	1391	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	1392	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
	1393
	1394	m_minlimit(MC_MBUF_16KCL) = 0;
	1395	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	1396	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	1397	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	1398	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
	1399
	1400	/*
	1401	* Initialize the legacy mbstat structure.
	1402	*/
	1403	bzero(&mbstat, sizeof(mbstat));
	1404	mbstat.m_msize = m_maxsize(MC_MBUF);
	1405	mbstat.m_mclbytes = m_maxsize(MC_CL);
	1406	mbstat.m_minclsize = MINCLSIZE;
	1407	mbstat.m_mlen = MLEN;
	1408	mbstat.m_mhlen = MHLEN;
	1409	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
	1410	}
	1411
	1412	int
	1413	mbuf_get_class(struct mbuf *m)
	1414	{
	1415	if (m->m_flags & M_EXT) {
	1416	uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
	1417	m_ext_free_func_t m_free_func = m_get_ext_free(m);
	1418
	1419	if (m_free_func == NULL) {
	1420	if (composite) {
	1421	return MC_MBUF_CL;
	1422	} else {
	1423	return MC_CL;
	1424	}
	1425	} else if (m_free_func == m_bigfree) {
	1426	if (composite) {
	1427	return MC_MBUF_BIGCL;
	1428	} else {
	1429	return MC_BIGCL;
	1430	}
	1431	} else if (m_free_func == m_16kfree) {
	1432	if (composite) {
	1433	return MC_MBUF_16KCL;
	1434	} else {
	1435	return MC_16KCL;
	1436	}
	1437	}
	1438	}
	1439
	1440	return MC_MBUF;
	1441	}
	1442
	1443	bool
	1444	mbuf_class_under_pressure(struct mbuf *m)
	1445	{
	1446	int mclass = mbuf_get_class(m);
	1447
	1448	if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
	1449	/*
	1450	* The above computation does not include the per-CPU cached objects.
	1451	* As a fast-path check this is good-enough. But now we do
	1452	* the "slower" count of the cached objects to know exactly the
	1453	* number of active mbufs in use.
	1454	*
	1455	* We do not take the mbuf_lock here to avoid lock-contention. Numbers
	1456	* might be slightly off but we don't try to be 100% accurate.
	1457	* At worst, we drop a packet that we shouldn't have dropped or
	1458	* we might go slightly above our memory-pressure threshold.
	1459	*/
	1460	mcache_t *cp = m_cache(mclass);
	1461	mcache_cpu_t *ccp = &cp->mc_cpu[0];
	1462
	1463	int bktsize = os_access_once(ccp->cc_bktsize);
	1464	uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
	1465	uint32_t cached = 0;
	1466	int i;
	1467
	1468	for (i = 0; i < ncpu; i++) {
	1469	ccp = &cp->mc_cpu[i];
	1470
	1471	int cc_objs = os_access_once(ccp->cc_objs);
	1472	if (cc_objs > 0) {
	1473	cached += cc_objs;
	1474	}
	1475
	1476	int cc_pobjs = os_access_once(ccp->cc_pobjs);
	1477	if (cc_pobjs > 0) {
	1478	cached += cc_pobjs;
	1479	}
	1480	}
	1481	cached += (bl_total * bktsize);
	1482
	1483	if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
	1484	os_log(OS_LOG_DEFAULT,
	1485	"%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
	1486	__func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
	1487	return true;
	1488	}
	1489	}
	1490
	1491	return false;
	1492	}
	1493
	1494	#if defined(__LP64__)
	1495	typedef struct ncl_tbl {
	1496	uint64_t nt_maxmem; /* memory (sane) size */
	1497	uint32_t nt_mbpool; /* mbuf pool size */
	1498	} ncl_tbl_t;
	1499
	1500	static const ncl_tbl_t ncl_table[] = {
	1501	{ (1ULL << GBSHIFT) /* 1 GB /, (64 << MBSHIFT) / 64 MB */ },
	1502	{ (1ULL << (GBSHIFT + 2)) /* 4 GB /, (96 << MBSHIFT) / 96 MB */ },
	1503	{ (1ULL << (GBSHIFT + 3)) /* 8 GB /, (128 << MBSHIFT) / 128 MB */ },
	1504	{ (1ULL << (GBSHIFT + 4)) /* 16 GB /, (256 << MBSHIFT) / 256 MB */ },
	1505	{ (1ULL << (GBSHIFT + 5)) /* 32 GB /, (512 << MBSHIFT) / 512 MB */ },
	1506	{ 0, 0 }
	1507	};
	1508	#endif /* __LP64__ */
	1509
	1510	__private_extern__ unsigned int
	1511	mbuf_default_ncl(uint64_t mem)
	1512	{
	1513	#if !defined(__LP64__)
	1514	unsigned int n;
	1515	/*
	1516	* 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	1517	*/
	1518	if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
	1519	n = 32768;
	1520	}
	1521	#else
	1522	unsigned int n, i;
	1523	/*
	1524	* 64-bit kernel (mbuf pool size based on table).
	1525	*/
	1526	n = ncl_table[0].nt_mbpool;
	1527	for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
	1528	if (mem < ncl_table[i].nt_maxmem) {
	1529	break;
	1530	}
	1531	n = ncl_table[i].nt_mbpool;
	1532	}
	1533	n >>= MCLSHIFT;
	1534	#endif /* !__LP64__ */
	1535	return n;
	1536	}
	1537
	1538	__private_extern__ void
	1539	mbinit(void)
	1540	{
	1541	unsigned int m;
	1542	unsigned int initmcl = 0;
	1543	thread_t thread = THREAD_NULL;
	1544
	1545	microuptime(&mb_start);
	1546
	1547	/*
	1548	* These MBUF_ values must be equal to their private counterparts.
	1549	*/
	1550	_CASSERT(MBUF_EXT == M_EXT);
	1551	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	1552	_CASSERT(MBUF_EOR == M_EOR);
	1553	_CASSERT(MBUF_LOOP == M_LOOP);
	1554	_CASSERT(MBUF_BCAST == M_BCAST);
	1555	_CASSERT(MBUF_MCAST == M_MCAST);
	1556	_CASSERT(MBUF_FRAG == M_FRAG);
	1557	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	1558	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	1559	_CASSERT(MBUF_PROMISC == M_PROMISC);
	1560	_CASSERT(MBUF_HASFCS == M_HASFCS);
	1561
	1562	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	1563	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	1564	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	1565	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	1566	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	1567	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	1568	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	1569	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	1570	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	1571	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	1572	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	1573	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	1574	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	1575	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	1576	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
	1577
	1578	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	1579	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	1580	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	1581	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	1582	_CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
	1583	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	1584	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	1585	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	1586	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	1587	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	1588	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	1589	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	1590	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	1591	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
	1592
	1593	_CASSERT(MBUF_WAITOK == M_WAIT);
	1594	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	1595	_CASSERT(MBUF_COPYALL == M_COPYALL);
	1596
	1597	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	1598	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	1599	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	1600	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	1601	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	1602	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	1603	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	1604	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	1605	_CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
	1606	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	1607	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
	1608
	1609	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	1610	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	1611	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	1612	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
	1613
	1614	/* Module specific scratch space (32-bit alignment requirement) */
	1615	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
	1616	sizeof(uint32_t)));
	1617
	1618	/* pktdata needs to start at 128-bit offset! */
	1619	_CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);
	1620
	1621	/* Initialize random red zone cookie value */
	1622	_CASSERT(sizeof(mb_redzone_cookie) ==
	1623	sizeof(((struct pkthdr *)0)->redzone));
	1624	read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
	1625	read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
	1626	read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
	1627	mb_obscure_extref \|= 0x3;
	1628	mb_obscure_extfree \|= 0x3;
	1629
	1630	/* Make sure we don't save more than we should */
	1631	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
	1632
	1633	if (nmbclusters == 0) {
	1634	nmbclusters = NMBCLUSTERS;
	1635	}
	1636
	1637	/* This should be a sane (at least even) value by now */
	1638	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
	1639
	1640	/* Setup the mbuf table */
	1641	mbuf_table_init();
	1642
	1643	/*
	1644	* Allocate cluster slabs table:
	1645	*
	1646	* maxslabgrp = (N * 2048) / (1024 * 1024)
	1647	*
	1648	* Where N is nmbclusters rounded up to the nearest 512. This yields
	1649	* mcl_slab_g_t units, each one representing a MB of memory.
	1650	*/
	1651	maxslabgrp =
	1652	(P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
	1653	slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
	1654	ZALIGN(mcl_slabg_t));
	1655
	1656	/*
	1657	* Allocate audit structures, if needed:
	1658	*
	1659	* maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
	1660	*
	1661	* This yields mcl_audit_t units, each one representing a page.
	1662	*/
	1663	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
	1664	mbuf_debug \|= mcache_getflags();
	1665	if (mbuf_debug & MCF_DEBUG) {
	1666	int l;
	1667	mcl_audit_t *mclad;
	1668	maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
	1669	mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
	1670	ZALIGN(mcl_audit_t));
	1671	for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
	1672	mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
	1673	ZALIGN_PTR);
	1674	}
	1675
	1676	mcl_audit_con_cache = mcache_create("mcl_audit_contents",
	1677	AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
	1678	VERIFY(mcl_audit_con_cache != NULL);
	1679	}
	1680	mclverify = (mbuf_debug & MCF_VERIFY);
	1681	mcltrace = (mbuf_debug & MCF_TRACE);
	1682	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	1683	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
	1684
	1685	/* Enable mbuf leak logging, with a lock to protect the tables */
	1686
	1687	mleak_activate();
	1688
	1689	/*
	1690	* Allocate structure for per-CPU statistics that's aligned
	1691	* on the CPU cache boundary; this code assumes that we never
	1692	* uninitialize this framework, since the original address
	1693	* before alignment is not saved.
	1694	*/
	1695	ncpu = ml_wait_max_cpus();
	1696
	1697	/* Calculate the number of pages assigned to the cluster pool */
	1698	mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
	1699	mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
	1700	ZALIGN(ppnum_t));
	1701
	1702	/* Register with the I/O Bus mapper */
	1703	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	1704
	1705	embutl = (mbutl + (nmbclusters * MCLBYTES));
	1706	VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
	1707
	1708	/* Prime up the freelist */
	1709	PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
	1710	if (initmcl != 0) {
	1711	initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
	1712	if (initmcl > m_maxlimit(MC_BIGCL)) {
	1713	initmcl = m_maxlimit(MC_BIGCL);
	1714	}
	1715	}
	1716	if (initmcl < m_minlimit(MC_BIGCL)) {
	1717	initmcl = m_minlimit(MC_BIGCL);
	1718	}
	1719
	1720	lck_mtx_lock(mbuf_mlock);
	1721
	1722	/*
	1723	* For classes with non-zero minimum limits, populate their freelists
	1724	* so that m_total(class) is at least m_minlimit(class).
	1725	*/
	1726	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	1727	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	1728	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	1729	freelist_init(m_class(MC_CL));
	1730
	1731	for (m = 0; m < NELEM(mbuf_table); m++) {
	1732	/* Make sure we didn't miss any */
	1733	VERIFY(m_minlimit(m_class(m)) == 0 \|\|
	1734	m_total(m_class(m)) >= m_minlimit(m_class(m)));
	1735
	1736	/* populate the initial sizes and report from there on */
	1737	m_peak(m_class(m)) = m_total(m_class(m));
	1738	}
	1739	mb_peak_newreport = FALSE;
	1740
	1741	lck_mtx_unlock(mbuf_mlock);
	1742
	1743	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	1744	NULL, &thread);
	1745	thread_deallocate(thread);
	1746
	1747	ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
	1748	0, 0, MCR_SLEEP);
	1749
	1750	/* Create the cache for each class */
	1751	for (m = 0; m < NELEM(mbuf_table); m++) {
	1752	void allocfunc, freefunc, auditfunc, logfunc;
	1753	u_int32_t flags;
	1754
	1755	flags = mbuf_debug;
	1756	if (m_class(m) == MC_MBUF_CL \|\| m_class(m) == MC_MBUF_BIGCL \|\|
	1757	m_class(m) == MC_MBUF_16KCL) {
	1758	allocfunc = mbuf_cslab_alloc;
	1759	freefunc = mbuf_cslab_free;
	1760	auditfunc = mbuf_cslab_audit;
	1761	logfunc = mleak_logger;
	1762	} else {
	1763	allocfunc = mbuf_slab_alloc;
	1764	freefunc = mbuf_slab_free;
	1765	auditfunc = mbuf_slab_audit;
	1766	logfunc = mleak_logger;
	1767	}
	1768
	1769	/*
	1770	* Disable per-CPU caches for jumbo classes if there
	1771	* is no jumbo cluster pool available in the system.
	1772	* The cache itself is still created (but will never
	1773	* be populated) since it simplifies the code.
	1774	*/
	1775	if ((m_class(m) == MC_MBUF_16KCL \|\| m_class(m) == MC_16KCL) &&
	1776	njcl == 0) {
	1777	flags \|= MCF_NOCPUCACHE;
	1778	}
	1779
	1780	if (!mclfindleak) {
	1781	flags \|= MCF_NOLEAKLOG;
	1782	}
	1783
	1784	m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
	1785	allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
	1786	(void *)(uintptr_t)m, flags, MCR_SLEEP);
	1787	}
	1788
	1789	/*
	1790	* Set the max limit on sb_max to be 1/16 th of the size of
	1791	* memory allocated for mbuf clusters.
	1792	*/
	1793	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	1794	if (high_sb_max < sb_max) {
	1795	/* sb_max is too large for this configuration, scale it down */
	1796	if (high_sb_max > (1 << MBSHIFT)) {
	1797	/* We have at least 16 MB of mbuf pool */
	1798	sb_max = high_sb_max;
	1799	} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
	1800	/*
	1801	* If we have more than 1M of mbufpool, cap the size of
	1802	* max sock buf at 1M
	1803	*/
	1804	sb_max = high_sb_max = (1 << MBSHIFT);
	1805	} else {
	1806	sb_max = high_sb_max;
	1807	}
	1808	}
	1809
	1810	/* allocate space for mbuf_dump_buf */
	1811	mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);
	1812
	1813	if (mbuf_debug & MCF_DEBUG) {
	1814	printf("%s: MLEN %d, MHLEN %d\n", __func__,
	1815	(int)_MLEN, (int)_MHLEN);
	1816	}
	1817
	1818	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	1819	(nmbclusters << MCLSHIFT) >> MBSHIFT,
	1820	(nclusters << MCLSHIFT) >> MBSHIFT,
	1821	(njcl << MCLSHIFT) >> MBSHIFT);
	1822	}
	1823
	1824	/*
	1825	* Obtain a slab of object(s) from the class's freelist.
		*
		* One chunk is popped off sl_head of a slab taken from
		* m_slablist(class) and the reference count of that slab is
		* incremented. Only MC_MBUF, MC_CL, MC_BIGCL and MC_16KCL are
		* handled here; any other class trips the VERIFY in the final
		* else branch.
		*
		* class: rudimentary mbuf class to allocate from.
		* wait: MCR_* flags; only MCR_COMP is examined in this routine.
		*
		* Returns the buffer, or NULL if the slab list of the class is
		* empty. A slab whose last free chunk was just handed out is taken
		* off the list with slab_remove(). The caller must hold mbuf_mlock
		* (asserted on entry).
	1826	*/
	1827	static mcache_obj_t *
	1828	slab_alloc(mbuf_class_t class, int wait)
	1829	{
	1830	mcl_slab_t *sp;
	1831	mcache_obj_t *buf;
	1832
	1833	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	1834
	1835	/* This should always be NULL for us */
	1836	VERIFY(m_cobjlist(class) == NULL);
	1837
	1838	/*
	1839	* Treat composite objects as having longer lifespan by using
	1840	* a slab from the reverse direction, in hoping that this could
	1841	* reduce the probability of fragmentation for slabs that hold
	1842	* more than one buffer chunks (e.g. mbuf slabs). For other
	1843	* slabs, this probably doesn't make much of a difference.
	1844	*/
	1845	if ((class == MC_MBUF \|\| class == MC_CL \|\| class == MC_BIGCL)
	1846	&& (wait & MCR_COMP)) {
	1847	sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	1848	} else {
	1849	sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
	1850	}
	1851
	1852	if (sp == NULL) {
	1853	VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
	1854	/* The slab list for this class is empty */
	1855	return NULL;
	1856	}
	1857
	1858	VERIFY(m_infree(class) > 0);
	1859	VERIFY(!slab_is_detached(sp));
	1860	VERIFY(sp->sl_class == class &&
	1861	(sp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) == SLF_MAPPED);
		/* Pop the first free chunk off the slab */
	1862	buf = sp->sl_head;
	1863	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
	1864	sp->sl_head = buf->obj_next;
	1865	/* Increment slab reference */
	1866	sp->sl_refcnt++;
	1867
		/* An exhausted free chain means every chunk is now referenced */
	1868	VERIFY(sp->sl_head != NULL \|\| sp->sl_refcnt == sp->sl_chunks);
	1869
	1870	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
	1871	slab_nextptr_panic(sp, sp->sl_head);
	1872	/* In case sl_head is in the map but not in the slab */
	1873	VERIFY(slab_inrange(sp, sp->sl_head));
	1874	/* NOTREACHED */
	1875	}
	1876
	1877	if (mclaudit != NULL) {
	1878	mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
	1879	mca->mca_uflags = 0;
	1880	/* Save contents on mbuf objects only */
	1881	if (class == MC_MBUF) {
	1882	mca->mca_uflags \|= MB_SCVALID;
	1883	}
	1884	}
	1885
		/* Per-class free counters, statistics and slab invariants */
	1886	if (class == MC_CL) {
	1887	mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	1888	/*
	1889	* A 2K cluster slab can have at most NCLPG references.
	1890	*/
	1891	VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
	1892	sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
	1893	VERIFY(sp->sl_refcnt < NCLPG \|\| sp->sl_head == NULL);
	1894	} else if (class == MC_BIGCL) {
	1895	mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
	1896	m_infree(MC_MBUF_BIGCL);
	1897	/*
	1898	* A 4K cluster slab can have NBCLPG references.
	1899	*/
	1900	VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
	1901	sp->sl_len == PAGE_SIZE &&
	1902	(sp->sl_refcnt < NBCLPG \|\| sp->sl_head == NULL));
	1903	} else if (class == MC_16KCL) {
	1904	mcl_slab_t *nsp;
	1905	int k;
	1906
	1907	--m_infree(MC_16KCL);
	1908	VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
	1909	sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
	1910	/*
	1911	* Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
	1912	* A 16KB big cluster takes NSLABSP16KB slabs, each having at
	1913	* most 1 reference.
	1914	*/
	1915	for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
	1916	nsp = nsp->sl_next;
	1917	/* Next slab must already be present */
	1918	VERIFY(nsp != NULL);
	1919	nsp->sl_refcnt++;
	1920	VERIFY(!slab_is_detached(nsp));
	1921	VERIFY(nsp->sl_class == MC_16KCL &&
	1922	nsp->sl_flags == (SLF_MAPPED \| SLF_PARTIAL) &&
	1923	nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
	1924	nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
	1925	nsp->sl_head == NULL);
	1926	}
	1927	} else {
	1928	VERIFY(class == MC_MBUF);
	1929	--m_infree(MC_MBUF);
	1930	/*
	1931	* If auditing is turned on, this check is
	1932	* deferred until later in mbuf_slab_audit().
	1933	*/
	1934	if (mclaudit == NULL) {
	1935	_MCHECK((struct mbuf *)buf);
	1936	}
	1937	/*
	1938	* Since we have incremented the reference count above,
	1939	* an mbuf slab (formerly a 4KB cluster slab that was cut
	1940	* up into mbufs) must have a reference count between 1
	1941	* and NMBPG at this point.
	1942	*/
	1943	VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
	1944	sp->sl_chunks == NMBPG &&
	1945	sp->sl_len == PAGE_SIZE);
	1946	VERIFY(sp->sl_refcnt < NMBPG \|\| sp->sl_head == NULL);
	1947	}
	1948
	1949	/* If empty, remove this slab from the class's freelist */
	1950	if (sp->sl_head == NULL) {
	1951	VERIFY(class != MC_MBUF \|\| sp->sl_refcnt == NMBPG);
	1952	VERIFY(class != MC_CL \|\| sp->sl_refcnt == NCLPG);
	1953	VERIFY(class != MC_BIGCL \|\| sp->sl_refcnt == NBCLPG);
	1954	slab_remove(sp, class);
	1955	}
	1956
	1957	return buf;
	1958	}
	1959
	1960	/*
	1961	* Place a slab of object(s) back into a class's slab list.
		*
		* Returns one buffer of a rudimentary class (MC_MBUF, MC_CL, MC_BIGCL
		* or MC_16KCL) to the slab found by slab_get(buf): the reference
		* count of the slab is decremented and buf becomes the new sl_head.
		*
		* The caller must hold mbuf_mlock and buf->obj_next must be NULL
		* (both asserted). While mb_clalloc_busy is set this routine sleeps
		* in msleep() on mb_clalloc_waitchan, passing mbuf_mlock; it then
		* sets mb_clalloc_busy for its own duration and wakes any recorded
		* waiters before returning.
		*
		* If the slab ends up with no references, the class stays at or
		* above its minimum limit, and the super class is below its maximum
		* limit, a page-sized slab of mbufs, 2KB clusters or (only when the
		* super class is MC_16KCL) 4KB clusters is reinitialized as a single
		* object of the super class.
	1962	*/
	1963	static void
	1964	slab_free(mbuf_class_t class, mcache_obj_t *buf)
	1965	{
	1966	mcl_slab_t *sp;
	1967	boolean_t reinit_supercl = false;
	1968	mbuf_class_t super_class;
	1969
	1970	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	1971
	1972	VERIFY(class != MC_16KCL \|\| njcl > 0);
	1973	VERIFY(buf->obj_next == NULL);
	1974
	1975	/*
	1976	* Synchronizing with m_clalloc, as it reads m_total, while we here
	1977	* are modifying m_total.
	1978	*/
	1979	while (mb_clalloc_busy) {
	1980	mb_clalloc_waiters++;
	1981	(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
	1982	(PZERO - 1), "m_clalloc", NULL);
	1983	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	1984	}
	1985
	1986	/* We are busy now; tell everyone else to go away */
	1987	mb_clalloc_busy = TRUE;
	1988
	1989	sp = slab_get(buf);
	1990	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	1991	(sp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) == SLF_MAPPED);
	1992
	1993	/* Decrement slab reference */
	1994	sp->sl_refcnt--;
	1995
	1996	if (class == MC_CL) {
	1997	VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
	1998	/*
	1999	* A slab that has been split for 2KB clusters can have
	2000	* at most NCLPG - 1 outstanding references at this point.
	2001	*/
	2002	VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
	2003	sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
	2004	VERIFY(sp->sl_refcnt < (NCLPG - 1) \|\|
	2005	(slab_is_detached(sp) && sp->sl_head == NULL));
	2006	} else if (class == MC_BIGCL) {
	2007	VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
	2008
	2009	/* A 4KB cluster slab can have NBCLPG references at most */
	2010	VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
	2011	VERIFY(sp->sl_refcnt < (NBCLPG - 1) \|\|
	2012	(slab_is_detached(sp) && sp->sl_head == NULL));
	2013	} else if (class == MC_16KCL) {
	2014	mcl_slab_t *nsp;
	2015	int k;
	2016	/*
	2017	* A 16KB cluster takes NSLABSP16KB slabs, all must
	2018	* now have 0 reference.
	2019	*/
	2020	VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
	2021	VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
	2022	sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
	2023	VERIFY(slab_is_detached(sp));
	2024	for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
	2025	nsp = nsp->sl_next;
	2026	/* Next slab must already be present */
	2027	VERIFY(nsp != NULL);
	2028	nsp->sl_refcnt--;
	2029	VERIFY(slab_is_detached(nsp));
	2030	VERIFY(nsp->sl_class == MC_16KCL &&
	2031	(nsp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) &&
	2032	nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
	2033	nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
	2034	nsp->sl_head == NULL);
	2035	}
	2036	} else {
	2037	/*
	2038	* A slab that has been split for mbufs has at most
	2039	* NMBPG reference counts. Since we have decremented
	2040	* one reference above, it must now be between 0 and
	2041	* NMBPG-1.
	2042	*/
	2043	VERIFY(class == MC_MBUF);
	2044	VERIFY(sp->sl_refcnt >= 0 &&
	2045	sp->sl_refcnt <= (NMBPG - 1) &&
	2046	sp->sl_chunks == NMBPG &&
	2047	sp->sl_len == PAGE_SIZE);
	2048	VERIFY(sp->sl_refcnt < (NMBPG - 1) \|\|
	2049	(slab_is_detached(sp) && sp->sl_head == NULL));
	2050	}
	2051
	2052	/*
	2053	* When auditing is enabled, ensure that the buffer still
	2054	* contains the free pattern. Otherwise it got corrupted
	2055	* while at the CPU cache layer.
	2056	*/
	2057	if (mclaudit != NULL) {
	2058	mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
	2059	if (mclverify) {
	2060	mcache_audit_free_verify(mca, buf, 0,
	2061	m_maxsize(class));
	2062	}
	2063	mca->mca_uflags &= ~MB_SCVALID;
	2064	}
	2065
		/* Bump the free counters and push buf onto the slab free chain */
	2066	if (class == MC_CL) {
	2067	mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	2068	buf->obj_next = sp->sl_head;
	2069	} else if (class == MC_BIGCL) {
	2070	mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
	2071	m_infree(MC_MBUF_BIGCL);
	2072	buf->obj_next = sp->sl_head;
	2073	} else if (class == MC_16KCL) {
		/* sl_head and buf->obj_next were both verified NULL above */
	2074	++m_infree(MC_16KCL);
	2075	} else {
	2076	++m_infree(MC_MBUF);
	2077	buf->obj_next = sp->sl_head;
	2078	}
	2079	sp->sl_head = buf;
	2080
	2081	/*
	2082	* If a slab has been split to either one which holds 2KB clusters,
	2083	* or one which holds mbufs, turn it back to one which holds a
	2084	* 4 or 16 KB cluster depending on the page size.
	2085	*/
	2086	if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
	2087	super_class = MC_BIGCL;
	2088	} else {
	2089	VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
	2090	super_class = MC_16KCL;
	2091	}
	2092	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
	2093	m_total(class) >= (m_minlimit(class) + NMBPG) &&
	2094	m_total(super_class) < m_maxlimit(super_class)) {
	2095	int i = NMBPG;
	2096
	2097	m_total(MC_MBUF) -= NMBPG;
	2098	mbstat.m_mbufs = m_total(MC_MBUF);
	2099	m_infree(MC_MBUF) -= NMBPG;
	2100	mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
	2101
		/* Unlink all NMBPG chunks from the free chain of the slab */
	2102	while (i--) {
	2103	struct mbuf *m = sp->sl_head;
	2104	VERIFY(m != NULL);
	2105	sp->sl_head = m->m_next;
	2106	m->m_next = NULL;
	2107	}
	2108	reinit_supercl = true;
	2109	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
	2110	m_total(class) >= (m_minlimit(class) + NCLPG) &&
	2111	m_total(super_class) < m_maxlimit(super_class)) {
	2112	int i = NCLPG;
	2113
	2114	m_total(MC_CL) -= NCLPG;
	2115	mbstat.m_clusters = m_total(MC_CL);
	2116	m_infree(MC_CL) -= NCLPG;
	2117
		/* Unlink all NCLPG chunks from the free chain of the slab */
	2118	while (i--) {
	2119	union mcluster *c = sp->sl_head;
	2120	VERIFY(c != NULL);
	2121	sp->sl_head = c->mcl_next;
	2122	c->mcl_next = NULL;
	2123	}
	2124	reinit_supercl = true;
	2125	} else if (class == MC_BIGCL && super_class != MC_BIGCL &&
	2126	sp->sl_refcnt == 0 &&
	2127	m_total(class) >= (m_minlimit(class) + NBCLPG) &&
	2128	m_total(super_class) < m_maxlimit(super_class)) {
	2129	int i = NBCLPG;
	2130
	2131	VERIFY(super_class == MC_16KCL);
	2132	m_total(MC_BIGCL) -= NBCLPG;
	2133	mbstat.m_bigclusters = m_total(MC_BIGCL);
	2134	m_infree(MC_BIGCL) -= NBCLPG;
	2135
		/* Unlink all NBCLPG chunks from the free chain of the slab */
	2136	while (i--) {
	2137	union mbigcluster *bc = sp->sl_head;
	2138	VERIFY(bc != NULL);
	2139	sp->sl_head = bc->mbc_next;
	2140	bc->mbc_next = NULL;
	2141	}
	2142	reinit_supercl = true;
	2143	}
	2144
	2145	if (reinit_supercl) {
	2146	VERIFY(sp->sl_head == NULL);
	2147	VERIFY(m_total(class) >= m_minlimit(class));
	2148	slab_remove(sp, class);
	2149
	2150	/* Reinitialize it as a cluster for the super class */
	2151	m_total(super_class)++;
	2152	m_infree(super_class)++;
	2153	VERIFY(sp->sl_flags == (SLF_MAPPED \| SLF_DETACHED) &&
	2154	sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
	2155
	2156	slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
	2157	sp->sl_base, PAGE_SIZE, 0, 1);
	2158	if (mclverify) {
	2159	mcache_set_pattern(MCACHE_FREE_PATTERN,
	2160	(caddr_t)sp->sl_base, sp->sl_len);
	2161	}
	2162	((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
	2163
	2164	if (super_class == MC_BIGCL) {
	2165	mbstat.m_bigclusters = m_total(MC_BIGCL);
	2166	mbstat.m_bigclfree = m_infree(MC_BIGCL) +
	2167	m_infree(MC_MBUF_BIGCL);
	2168	}
	2169
	2170	VERIFY(slab_is_detached(sp));
	2171	VERIFY(m_total(super_class) <= m_maxlimit(super_class));
	2172
	2173	/* And finally switch class */
	2174	class = super_class;
	2175	}
	2176
	2177	/* Reinsert the slab to the class's slab list */
	2178	if (slab_is_detached(sp)) {
	2179	slab_insert(sp, class);
	2180	}
	2181
	2182	/* We're done; let others enter */
	2183	mb_clalloc_busy = FALSE;
	2184	if (mb_clalloc_waiters > 0) {
	2185	mb_clalloc_waiters = 0;
	2186	wakeup(mb_clalloc_waitchan);
	2187	}
	2188	}
	2189
	2190	/*
	2191	* Common allocator for rudimentary objects called by the CPU cache layer
	2192	* during an allocation request whenever there is no available element in the
	2193	* bucket layer. It returns one or more elements from the appropriate global
	2194	* freelist. If the freelist is empty, it will attempt to populate it and
	2195	* retry the allocation.
		*
		* arg: the mbuf class, cast to mbuf_class_t below.
		* plist: receives the allocated objects, chained through obj_next.
		* num: number of objects requested; asserted to be nonzero.
		* wait: MCR_* flags; MCR_NOSLEEP and MCR_TRYHARD steer the give-up
		* and sleep decisions below, and the value is also handed to
		* slab_alloc(), mbuf_cached_above() and mbuf_sleep().
		*
		* Returns the number of objects obtained (num - need), which can be
		* less than num. mbuf_mlock is taken and released inside this routine.
		*
		* NOTE(review): some pointer declarators in this listing look
		* truncated (e.g. the declared types of arg and list) - confirm
		* against the upstream source before relying on the exact types.
	2196	*/
	2197	static unsigned int
	2198	mbuf_slab_alloc(void arg, mcache_obj_t **plist, unsigned int num, int wait)
	2199	{
	2200	mbuf_class_t class = (mbuf_class_t)arg;
	2201	unsigned int need = num;
	2202	mcache_obj_t *list = plist;
	2203
	2204	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
	2205	ASSERT(need > 0);
	2206
	2207	lck_mtx_lock(mbuf_mlock);
	2208
	2209	for (;;) {
	2210	if ((*list = slab_alloc(class, wait)) != NULL) {
		/* Terminate the new element and advance to its link field */
	2211	(*list)->obj_next = NULL;
	2212	list = plist = &(list)->obj_next;
	2213
	2214	if (--need == 0) {
	2215	/*
	2216	* If the number of elements in freelist has
	2217	* dropped below low watermark, asynchronously
	2218	* populate the freelist now rather than doing
	2219	* it later when we run out of elements.
	2220	*/
	2221	if (!mbuf_cached_above(class, wait) &&
	2222	m_infree(class) < (m_total(class) >> 5)) {
	2223	(void) freelist_populate(class, 1,
	2224	M_DONTWAIT);
	2225	}
	2226	break;
	2227	}
	2228	} else {
	2229	VERIFY(m_infree(class) == 0 \|\| class == MC_CL);
	2230
		/* Nothing on the slab list; try to grow the freelist by one */
	2231	(void) freelist_populate(class, 1,
	2232	(wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
	2233
	2234	if (m_infree(class) > 0) {
	2235	continue;
	2236	}
	2237
	2238	/* Check if there's anything at the cache layer */
	2239	if (mbuf_cached_above(class, wait)) {
	2240	break;
	2241	}
	2242
	2243	/* watchdog checkpoint */
	2244	mbuf_watchdog();
	2245
	2246	/* We have nothing and cannot block; give up */
	2247	if (wait & MCR_NOSLEEP) {
	2248	if (!(wait & MCR_TRYHARD)) {
	2249	m_fail_cnt(class)++;
	2250	mbstat.m_drops++;
	2251	break;
	2252	}
	2253	}
	2254
	2255	/*
	2256	* If the freelist is still empty and the caller is
	2257	* willing to be blocked, sleep on the wait channel
	2258	* until an element is available. Otherwise, if
	2259	* MCR_TRYHARD is set, do our best to satisfy the
	2260	* request without having to go to sleep.
	2261	*/
	2262	if (mbuf_worker_ready &&
	2263	mbuf_sleep(class, need, wait)) {
	2264	break;
	2265	}
	2266
	2267	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	2268	}
	2269	}
	2270
		/* Account only for what was actually handed out */
	2271	m_alloc_cnt(class) += num - need;
	2272	lck_mtx_unlock(mbuf_mlock);
	2273
	2274	return num - need;
	2275	}
	2276
	2277	/*
	2278	* Common de-allocator for rudimentary objects called by the CPU cache
	2279	* layer when one or more elements need to be returned to the appropriate
	2280	* global freelist.
		*
		* arg: the mbuf class, cast to mbuf_class_t below.
		* list: chain of objects linked through obj_next; the first element
		* is dereferenced unconditionally, so the chain must not be empty.
		* purged: unused here.
		*
		* Every element is unlinked and passed to slab_free() under
		* mbuf_mlock, and m_free_cnt(class) grows by the number freed.
		* mb_waiters is sampled (and reset to zero when positive) under the
		* lock; if the sampled value was nonzero, mb_waitchan is woken after
		* the lock has been dropped.
		*
		* NOTE(review): the pointer declarators of arg and list look
		* truncated in this listing - confirm against the upstream source.
	2281	*/
	2282	static void
	2283	mbuf_slab_free(void arg, mcache_obj_t list, __unused int purged)
	2284	{
	2285	mbuf_class_t class = (mbuf_class_t)arg;
	2286	mcache_obj_t *nlist;
	2287	unsigned int num = 0;
	2288	int w;
	2289
	2290	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
	2291
	2292	lck_mtx_lock(mbuf_mlock);
	2293
	2294	for (;;) {
		/* slab_free() verifies obj_next is NULL, so detach first */
	2295	nlist = list->obj_next;
	2296	list->obj_next = NULL;
	2297	slab_free(class, list);
	2298	++num;
	2299	if ((list = nlist) == NULL) {
	2300	break;
	2301	}
	2302	}
	2303	m_free_cnt(class) += num;
	2304
	2305	if ((w = mb_waiters) > 0) {
	2306	mb_waiters = 0;
	2307	}
	2308	if (w) {
	2309	mbwdog_logger("waking up all threads");
	2310	}
	2311	lck_mtx_unlock(mbuf_mlock);
	2312
		/* The wakeup is issued after mbuf_mlock has been released */
	2313	if (w != 0) {
	2314	wakeup(mb_waitchan);
	2315	}
	2316	}
	2317
	2318	/*
	2319	* Common auditor for rudimentary objects called by the CPU cache layer
	2320	* during an allocation or free request. For the former, this is called
	2321	* after the objects are obtained from either the bucket or slab layer
	2322	* and before they are returned to the caller. For the latter, this is
	2323	* called immediately during free and before placing the objects into
	2324	* the bucket or slab layer.
		*
		* arg: the mbuf class, cast to mbuf_class_t below.
		* list: chain of objects linked through obj_next; may be empty.
		* alloc: true when auditing an allocation, false for a free.
		*
		* For each object the audit record is looked up with
		* mcl_audit_buf2mca(), checked with mcl_audit_mbuf() for MC_MBUF or
		* mcl_audit_cluster() otherwise, logged when mcltrace is set, has
		* MB_INUSE set or cleared in mca_uflags according to alloc, and has
		* mca_uptr cleared. mbuf_mlock is taken and dropped once per object.
		*
		* NOTE(review): the pointer declarators of arg and list look
		* truncated in this listing - confirm against the upstream source.
	2325	*/
	2326	static void
	2327	mbuf_slab_audit(void arg, mcache_obj_t list, boolean_t alloc)
	2328	{
	2329	mbuf_class_t class = (mbuf_class_t)arg;
	2330	mcache_audit_t *mca;
	2331
	2332	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
	2333
	2334	while (list != NULL) {
	2335	lck_mtx_lock(mbuf_mlock);
	2336	mca = mcl_audit_buf2mca(class, list);
	2337
	2338	/* Do the sanity checks */
	2339	if (class == MC_MBUF) {
	2340	mcl_audit_mbuf(mca, list, FALSE, alloc);
	2341	ASSERT(mca->mca_uflags & MB_SCVALID);
	2342	} else {
	2343	mcl_audit_cluster(mca, list, m_maxsize(class),
	2344	alloc, TRUE);
	2345	ASSERT(!(mca->mca_uflags & MB_SCVALID));
	2346	}
	2347	/* Record this transaction */
	2348	if (mcltrace) {
	2349	mcache_buffer_log(mca, list, m_cache(class), &mb_start);
	2350	}
	2351
		/* Track whether the object is currently handed out */
	2352	if (alloc) {
	2353	mca->mca_uflags \|= MB_INUSE;
	2354	} else {
	2355	mca->mca_uflags &= ~MB_INUSE;
	2356	}
	2357	/* Unpair the object (unconditionally) */
	2358	mca->mca_uptr = NULL;
	2359	lck_mtx_unlock(mbuf_mlock);
	2360
	2361	list = list->obj_next;
	2362	}
	2363	}
	2364
	2365	/*
	2366	* Common notify routine for all caches. It is called by mcache when
	2367	* one or more objects get freed. We use this indication to trigger
	2368	* the wakeup of any sleeping threads so that they can retry their
	2369	* allocation requests.
		*
		* arg: the mbuf class, cast to mbuf_class_t below.
		* reason: notification code; anything other than MCN_RETRYALLOC is
		* ignored and the routine returns without taking the lock.
		*
		* Under mbuf_mlock, mb_waiters is sampled; when it is positive it is
		* reset to zero and m_notified(class) is incremented. If the sampled
		* value was nonzero, mb_waitchan is woken after the lock is dropped.
	2370	*/
	2371	static void
	2372	mbuf_slab_notify(void *arg, u_int32_t reason)
	2373	{
	2374	mbuf_class_t class = (mbuf_class_t)arg;
	2375	int w;
	2376
	2377	ASSERT(MBUF_CLASS_VALID(class));
	2378
	2379	if (reason != MCN_RETRYALLOC) {
	2380	return;
	2381	}
	2382
	2383	lck_mtx_lock(mbuf_mlock);
	2384	if ((w = mb_waiters) > 0) {
	2385	m_notified(class)++;
	2386	mb_waiters = 0;
	2387	}
	2388	if (w) {
	2389	mbwdog_logger("waking up all threads");
	2390	}
	2391	lck_mtx_unlock(mbuf_mlock);
	2392
		/* The wakeup is issued after mbuf_mlock has been released */
	2393	if (w != 0) {
	2394	wakeup(mb_waitchan);
	2395	}
	2396	}
	2397
	2398	/*
	2399	* Obtain object(s) from the composite class's freelist.
		*
		* Pops up to num composite objects off m_cobjlist(class), appends
		* them to the caller-supplied list through obj_next, and subtracts
		* the number taken from m_infree(class). Each object is verified to
		* be an M_EXT composite mbuf with a non-NULL cluster whose slab
		* reference count is within the range allowed for the class.
		*
		* Returns the number of objects obtained, which is less than num
		* (possibly zero) when the freelist runs out. The caller must hold
		* mbuf_mlock (asserted below).
		*
		* NOTE(review): several pointer declarators and casts in this
		* listing look truncated (e.g. sp, clsp, list, the cast assigned to
		* m) - confirm against the upstream source.
	2400	*/
	2401	static unsigned int
	2402	cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
	2403	{
	2404	unsigned int need = num;
	2405	mcl_slab_t sp, clsp, *nsp;
	2406	struct mbuf *m;
	2407	mcache_obj_t *list = plist;
	2408	void *cl;
	2409
	2410	VERIFY(need > 0);
	2411	VERIFY(class != MC_MBUF_16KCL \|\| njcl > 0);
	2412	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	2413
	2414	/* Get what we can from the freelist */
	2415	while ((*list = m_cobjlist(class)) != NULL) {
	2416	MRANGE(*list);
	2417
		/* Locate the slabs backing the mbuf and its attached cluster */
	2418	m = (struct mbuf )list;
	2419	sp = slab_get(m);
	2420	cl = m->m_ext.ext_buf;
	2421	clsp = slab_get(cl);
	2422	VERIFY(m->m_flags == M_EXT && cl != NULL);
	2423	VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
	2424
	2425	if (class == MC_MBUF_CL) {
	2426	VERIFY(clsp->sl_refcnt >= 1 &&
	2427	clsp->sl_refcnt <= NCLPG);
	2428	} else {
	2429	VERIFY(clsp->sl_refcnt >= 1 &&
	2430	clsp->sl_refcnt <= NBCLPG);
	2431	}
	2432
	2433	if (class == MC_MBUF_16KCL) {
	2434	int k;
	2435	for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
	2436	nsp = nsp->sl_next;
	2437	/* Next slab must already be present */
	2438	VERIFY(nsp != NULL);
	2439	VERIFY(nsp->sl_refcnt == 1);
	2440	}
	2441	}
	2442
		/* Advance the freelist head; the new head must pass MBUF_IN_MAP */
	2443	if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
	2444	!MBUF_IN_MAP(m_cobjlist(class))) {
	2445	slab_nextptr_panic(sp, m_cobjlist(class));
	2446	/* NOTREACHED */
	2447	}
	2448	(*list)->obj_next = NULL;
	2449	list = plist = &(list)->obj_next;
	2450
	2451	if (--need == 0) {
	2452	break;
	2453	}
	2454	}
	2455	m_infree(class) -= (num - need);
	2456
	2457	return num - need;
	2458	}
	2459
	2460	/*
	2461	* Place object(s) back into a composite class's freelist.
		*
		* Walks the obj_next chain starting at list and sanity-checks every
		* composite mbuf and its attached cluster.
		*
		* When purged is zero the chain is left intact, linked in front of
		* m_cobjlist(class), and m_infree(class) is raised by the count.
		*
		* When purged is nonzero each object is taken apart instead: its
		* external reference fields are cleared, the ext_ref object is queued
		* on a local list (released to ref_cache at the end), m_total(class)
		* is decremented, and the mbuf and the cluster are handed back
		* separately through slab_free().
		*
		* Returns the number of objects processed. The caller must hold
		* mbuf_mlock (asserted below).
		*
		* NOTE(review): several pointer declarators and casts in this
		* listing look truncated (e.g. o, tail, m, ms, clsp, nsp, rfa) -
		* confirm against the upstream source.
	2462	*/
	2463	static unsigned int
	2464	cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
	2465	{
	2466	mcache_obj_t o, tail;
	2467	unsigned int num = 0;
	2468	struct mbuf m, ms;
	2469	mcache_audit_t *mca = NULL;
	2470	mcache_obj_t *ref_list = NULL;
	2471	mcl_slab_t clsp, nsp;
	2472	void *cl;
	2473	mbuf_class_t cl_class;
	2474
	2475	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	2476	VERIFY(class != MC_MBUF_16KCL \|\| njcl > 0);
	2477	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	2478
		/* Map the composite class to the class of its cluster part */
	2479	if (class == MC_MBUF_CL) {
	2480	cl_class = MC_CL;
	2481	} else if (class == MC_MBUF_BIGCL) {
	2482	cl_class = MC_BIGCL;
	2483	} else {
	2484	VERIFY(class == MC_MBUF_16KCL);
	2485	cl_class = MC_16KCL;
	2486	}
	2487
	2488	o = tail = list;
	2489
	2490	while ((m = ms = (struct mbuf *)o) != NULL) {
	2491	mcache_obj_t rfa, nexto = o->obj_next;
	2492
	2493	/* Do the mbuf sanity checks */
	2494	if (mclaudit != NULL) {
	2495	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
	2496	if (mclverify) {
	2497	mcache_audit_free_verify(mca, m, 0,
	2498	m_maxsize(MC_MBUF));
	2499	}
		/* From here on ms is the saved mbuf of the audit record */
	2500	ms = MCA_SAVED_MBUF_PTR(mca);
	2501	}
	2502
	2503	/* Do the cluster sanity checks */
	2504	cl = ms->m_ext.ext_buf;
	2505	clsp = slab_get(cl);
		/*
		* NOTE(review): mcl_audit_buf2mca() is called here without a
		* mclaudit check; presumably mclverify is only set when
		* auditing is enabled - confirm.
		*/
	2506	if (mclverify) {
	2507	size_t size = m_maxsize(cl_class);
	2508	mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
	2509	(mcache_obj_t *)cl), cl, 0, size);
	2510	}
	2511	VERIFY(ms->m_type == MT_FREE);
	2512	VERIFY(ms->m_flags == M_EXT);
	2513	VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
	2514	if (cl_class == MC_CL) {
	2515	VERIFY(clsp->sl_refcnt >= 1 &&
	2516	clsp->sl_refcnt <= NCLPG);
	2517	} else {
	2518	VERIFY(clsp->sl_refcnt >= 1 &&
	2519	clsp->sl_refcnt <= NBCLPG);
	2520	}
	2521	if (cl_class == MC_16KCL) {
	2522	int k;
	2523	for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
	2524	nsp = nsp->sl_next;
	2525	/* Next slab must already be present */
	2526	VERIFY(nsp != NULL);
	2527	VERIFY(nsp->sl_refcnt == 1);
	2528	}
	2529	}
	2530
	2531	/*
	2532	* If we're asked to purge, restore the actual mbuf using
	2533	* contents of the shadow structure (if auditing is enabled)
	2534	* and clear EXTF_COMPOSITE flag from the mbuf, as we are
	2535	* about to free it and the attached cluster into their caches.
	2536	*/
	2537	if (purged) {
	2538	/* Restore constructed mbuf fields */
	2539	if (mclaudit != NULL) {
	2540	mcl_audit_restore_mbuf(m, mca, TRUE);
	2541	}
	2542
	2543	MEXT_MINREF(m) = 0;
	2544	MEXT_REF(m) = 0;
	2545	MEXT_PREF(m) = 0;
	2546	MEXT_FLAGS(m) = 0;
	2547	MEXT_PRIV(m) = 0;
	2548	MEXT_PMBUF(m) = NULL;
	2549	MEXT_TOKEN(m) = 0;
	2550
		/* Detach the ext_ref and queue it for release to ref_cache */
	2551	rfa = (mcache_obj_t )(void )m_get_rfa(m);
	2552	m_set_ext(m, NULL, NULL, NULL);
	2553	rfa->obj_next = ref_list;
	2554	ref_list = rfa;
	2555
	2556	m->m_type = MT_FREE;
	2557	m->m_flags = m->m_len = 0;
	2558	m->m_next = m->m_nextpkt = NULL;
	2559
	2560	/* Save mbuf fields and make auditing happy */
	2561	if (mclaudit != NULL) {
	2562	mcl_audit_mbuf(mca, o, FALSE, FALSE);
	2563	}
	2564
	2565	VERIFY(m_total(class) > 0);
	2566	m_total(class)--;
	2567
	2568	/* Free the mbuf */
	2569	o->obj_next = NULL;
	2570	slab_free(MC_MBUF, o);
	2571
	2572	/* And free the cluster */
	2573	((mcache_obj_t *)cl)->obj_next = NULL;
	2574	if (class == MC_MBUF_CL) {
	2575	slab_free(MC_CL, cl);
	2576	} else if (class == MC_MBUF_BIGCL) {
	2577	slab_free(MC_BIGCL, cl);
	2578	} else {
	2579	slab_free(MC_16KCL, cl);
	2580	}
	2581	}
	2582
	2583	++num;
	2584	tail = o;
	2585	o = nexto;
	2586	}
	2587
		/* Not purging: splice the whole chain onto the composite freelist */
	2588	if (!purged) {
	2589	tail->obj_next = m_cobjlist(class);
	2590	m_cobjlist(class) = list;
	2591	m_infree(class) += num;
	2592	} else if (ref_list != NULL) {
	2593	mcache_free_ext(ref_cache, ref_list);
	2594	}
	2595
	2596	return num;
	2597	}
	2598
	2599	/*
	2600	* Common allocator for composite objects called by the CPU cache layer
	2601	* during an allocation request whenever there is no available element in
	2602	* the bucket layer. It returns one or more composite elements from the
	2603	* appropriate global freelist. If the freelist is empty, it will attempt
	2604	* to obtain the rudimentary objects from their caches and construct them
	2605	* into composite mbuf + cluster objects.
	2606	*/
	2607	static unsigned int
	2608	mbuf_cslab_alloc(void arg, mcache_obj_t **plist, unsigned int needed,
	2609	int wait)
	2610	{
	2611	mbuf_class_t class = (mbuf_class_t)arg;
	2612	mbuf_class_t cl_class = 0;
	2613	unsigned int num = 0, cnum = 0, want = needed;
	2614	mcache_obj_t *ref_list = NULL;
	2615	mcache_obj_t *mp_list = NULL;
	2616	mcache_obj_t *clp_list = NULL;
	2617	mcache_obj_t **list;
	2618	struct ext_ref *rfa;
	2619	struct mbuf *m;
	2620	void *cl;
	2621
	2622	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	2623	ASSERT(needed > 0);
	2624
	2625	VERIFY(class != MC_MBUF_16KCL \|\| njcl > 0);
	2626
	2627	/* There should not be any slab for this class */
	2628	VERIFY(m_slab_cnt(class) == 0 &&
	2629	m_slablist(class).tqh_first == NULL &&
	2630	m_slablist(class).tqh_last == NULL);
	2631
	2632	lck_mtx_lock(mbuf_mlock);
	2633
	2634	/* Try using the freelist first */
	2635	num = cslab_alloc(class, plist, needed);
	2636	list = *plist;
	2637	if (num == needed) {
	2638	m_alloc_cnt(class) += num;
	2639	lck_mtx_unlock(mbuf_mlock);
	2640	return needed;
	2641	}
	2642
	2643	lck_mtx_unlock(mbuf_mlock);
	2644
	2645	/*
	2646	* We could not satisfy the request using the freelist alone;
	2647	* allocate from the appropriate rudimentary caches and use
	2648	* whatever we can get to construct the composite objects.
	2649	*/
	2650	needed -= num;
	2651
	2652	/*
	2653	* Mark these allocation requests as coming from a composite cache.
	2654	* Also, if the caller is willing to be blocked, mark the request
	2655	* with MCR_FAILOK such that we don't end up sleeping at the mbuf
	2656	* slab layer waiting for the individual object when one or more
	2657	* of the already-constructed composite objects are available.
	2658	*/
	2659	wait \|= MCR_COMP;
	2660	if (!(wait & MCR_NOSLEEP)) {
	2661	wait \|= MCR_FAILOK;
	2662	}
	2663
	2664	/* allocate mbufs */
	2665	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
	2666	if (needed == 0) {
	2667	ASSERT(mp_list == NULL);
	2668	goto fail;
	2669	}
	2670
	2671	/* allocate clusters */
	2672	if (class == MC_MBUF_CL) {
	2673	cl_class = MC_CL;
	2674	} else if (class == MC_MBUF_BIGCL) {
	2675	cl_class = MC_BIGCL;
	2676	} else {
	2677	VERIFY(class == MC_MBUF_16KCL);
	2678	cl_class = MC_16KCL;
	2679	}
	2680	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
	2681	if (needed == 0) {
	2682	ASSERT(clp_list == NULL);
	2683	goto fail;
	2684	}
	2685
	2686	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
	2687	if (needed == 0) {
	2688	ASSERT(ref_list == NULL);
	2689	goto fail;
	2690	}
	2691
	2692	/*
	2693	* By this time "needed" is MIN(mbuf, cluster, ref). Any left
	2694	* overs will get freed accordingly before we return to caller.
	2695	*/
	2696	for (cnum = 0; cnum < needed; cnum++) {
	2697	struct mbuf *ms;
	2698
	2699	m = ms = (struct mbuf *)mp_list;
	2700	mp_list = mp_list->obj_next;
	2701
	2702	cl = clp_list;
	2703	clp_list = clp_list->obj_next;
	2704	((mcache_obj_t *)cl)->obj_next = NULL;
	2705
	2706	rfa = (struct ext_ref *)ref_list;
	2707	ref_list = ref_list->obj_next;
	2708	((mcache_obj_t )(void )rfa)->obj_next = NULL;
	2709
	2710	/*
	2711	* If auditing is enabled, construct the shadow mbuf
	2712	* in the audit structure instead of in the actual one.
	2713	* mbuf_cslab_audit() will take care of restoring the
	2714	* contents after the integrity check.
	2715	*/
	2716	if (mclaudit != NULL) {
	2717	mcache_audit_t mca, cl_mca;
	2718
	2719	lck_mtx_lock(mbuf_mlock);
	2720	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
	2721	ms = MCA_SAVED_MBUF_PTR(mca);
	2722	cl_mca = mcl_audit_buf2mca(cl_class,
	2723	(mcache_obj_t *)cl);
	2724
	2725	/*
	2726	* Pair them up. Note that this is done at the time
	2727	* the mbuf+cluster objects are constructed. This
	2728	* information should be treated as "best effort"
	2729	* debugging hint since more than one mbufs can refer
	2730	* to a cluster. In that case, the cluster might not
	2731	* be freed along with the mbuf it was paired with.
	2732	*/
	2733	mca->mca_uptr = cl_mca;
	2734	cl_mca->mca_uptr = mca;
	2735
	2736	ASSERT(mca->mca_uflags & MB_SCVALID);
	2737	ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
	2738	lck_mtx_unlock(mbuf_mlock);
	2739
	2740	/* Technically, they are in the freelist */
	2741	if (mclverify) {
	2742	size_t size;
	2743
	2744	mcache_set_pattern(MCACHE_FREE_PATTERN, m,
	2745	m_maxsize(MC_MBUF));
	2746
	2747	if (class == MC_MBUF_CL) {
	2748	size = m_maxsize(MC_CL);
	2749	} else if (class == MC_MBUF_BIGCL) {
	2750	size = m_maxsize(MC_BIGCL);
	2751	} else {
	2752	size = m_maxsize(MC_16KCL);
	2753	}
	2754
	2755	mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
	2756	size);
	2757	}
	2758	}
	2759
	2760	MBUF_INIT(ms, 0, MT_FREE);
	2761	if (class == MC_MBUF_16KCL) {
	2762	MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
	2763	} else if (class == MC_MBUF_BIGCL) {
	2764	MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
	2765	} else {
	2766	MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
	2767	}
	2768	VERIFY(ms->m_flags == M_EXT);
	2769	VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
	2770
	2771	list = (mcache_obj_t )m;
	2772	(*list)->obj_next = NULL;
	2773	list = plist = &(list)->obj_next;
	2774	}
	2775
	2776	fail:
	2777	/*
	2778	* Free up what's left of the above.
	2779	*/
	2780	if (mp_list != NULL) {
	2781	mcache_free_ext(m_cache(MC_MBUF), mp_list);
	2782	}
	2783	if (clp_list != NULL) {
	2784	mcache_free_ext(m_cache(cl_class), clp_list);
	2785	}
	2786	if (ref_list != NULL) {
	2787	mcache_free_ext(ref_cache, ref_list);
	2788	}
	2789
	2790	lck_mtx_lock(mbuf_mlock);
	2791	if (num > 0 \|\| cnum > 0) {
	2792	m_total(class) += cnum;
	2793	VERIFY(m_total(class) <= m_maxlimit(class));
	2794	m_alloc_cnt(class) += num + cnum;
	2795	}
	2796	if ((num + cnum) < want) {
	2797	m_fail_cnt(class) += (want - (num + cnum));
	2798	}
	2799	lck_mtx_unlock(mbuf_mlock);
	2800
	2801	return num + cnum;
	2802	}
	2803
	2804	/*
	2805	* Common de-allocator for composite objects called by the CPU cache
	2806	* layer when one or more elements need to be returned to the appropriate
	2807	* global freelist.
	2808	*/
	2809	static void
	2810	mbuf_cslab_free(void arg, mcache_obj_t list, int purged)
	2811	{
	2812	mbuf_class_t class = (mbuf_class_t)arg;
	2813	unsigned int num;
	2814	int w;
	2815
	2816	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	2817
	2818	lck_mtx_lock(mbuf_mlock);
	2819
	2820	num = cslab_free(class, list, purged);
	2821	m_free_cnt(class) += num;
	2822
	2823	if ((w = mb_waiters) > 0) {
	2824	mb_waiters = 0;
	2825	}
	2826	if (w) {
	2827	mbwdog_logger("waking up all threads");
	2828	}
	2829
	2830	lck_mtx_unlock(mbuf_mlock);
	2831
	2832	if (w != 0) {
	2833	wakeup(mb_waitchan);
	2834	}
	2835	}
	2836
	2837	/*
	2838	* Common auditor for composite objects called by the CPU cache layer
	2839	* during an allocation or free request. For the former, this is called
	2840	* after the objects are obtained from either the bucket or slab layer
	2841	* and before they are returned to the caller. For the latter, this is
	2842	* called immediately during free and before placing the objects into
	2843	* the bucket or slab layer.
	2844	*/
	2845	static void
	2846	mbuf_cslab_audit(void arg, mcache_obj_t list, boolean_t alloc)
	2847	{
	2848	mbuf_class_t class = (mbuf_class_t)arg, cl_class;
	2849	mcache_audit_t *mca;
	2850	struct mbuf m, ms;
	2851	mcl_slab_t clsp, nsp;
	2852	size_t cl_size;
	2853	void *cl;
	2854
	2855	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	2856	if (class == MC_MBUF_CL) {
	2857	cl_class = MC_CL;
	2858	} else if (class == MC_MBUF_BIGCL) {
	2859	cl_class = MC_BIGCL;
	2860	} else {
	2861	cl_class = MC_16KCL;
	2862	}
	2863	cl_size = m_maxsize(cl_class);
	2864
	2865	while ((m = ms = (struct mbuf *)list) != NULL) {
	2866	lck_mtx_lock(mbuf_mlock);
	2867	/* Do the mbuf sanity checks and record its transaction */
	2868	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
	2869	mcl_audit_mbuf(mca, m, TRUE, alloc);
	2870	if (mcltrace) {
	2871	mcache_buffer_log(mca, m, m_cache(class), &mb_start);
	2872	}
	2873
	2874	if (alloc) {
	2875	mca->mca_uflags \|= MB_COMP_INUSE;
	2876	} else {
	2877	mca->mca_uflags &= ~MB_COMP_INUSE;
	2878	}
	2879
	2880	/*
	2881	* Use the shadow mbuf in the audit structure if we are
	2882	* freeing, since the contents of the actual mbuf has been
	2883	* pattern-filled by the above call to mcl_audit_mbuf().
	2884	*/
	2885	if (!alloc && mclverify) {
	2886	ms = MCA_SAVED_MBUF_PTR(mca);
	2887	}
	2888
	2889	/* Do the cluster sanity checks and record its transaction */
	2890	cl = ms->m_ext.ext_buf;
	2891	clsp = slab_get(cl);
	2892	VERIFY(ms->m_flags == M_EXT && cl != NULL);
	2893	VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
	2894	if (class == MC_MBUF_CL) {
	2895	VERIFY(clsp->sl_refcnt >= 1 &&
	2896	clsp->sl_refcnt <= NCLPG);
	2897	} else {
	2898	VERIFY(clsp->sl_refcnt >= 1 &&
	2899	clsp->sl_refcnt <= NBCLPG);
	2900	}
	2901
	2902	if (class == MC_MBUF_16KCL) {
	2903	int k;
	2904	for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
	2905	nsp = nsp->sl_next;
	2906	/* Next slab must already be present */
	2907	VERIFY(nsp != NULL);
	2908	VERIFY(nsp->sl_refcnt == 1);
	2909	}
	2910	}
	2911
	2912
	2913	mca = mcl_audit_buf2mca(cl_class, cl);
	2914	mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
	2915	if (mcltrace) {
	2916	mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
	2917	}
	2918
	2919	if (alloc) {
	2920	mca->mca_uflags \|= MB_COMP_INUSE;
	2921	} else {
	2922	mca->mca_uflags &= ~MB_COMP_INUSE;
	2923	}
	2924	lck_mtx_unlock(mbuf_mlock);
	2925
	2926	list = list->obj_next;
	2927	}
	2928	}
	2929
	2930	static void
	2931	m_vm_error_stats(uint32_t cnt, uint64_t ts, uint64_t *size,
	2932	uint64_t alloc_size, kern_return_t error)
	2933	{
	2934	cnt = cnt + 1;
	2935	*ts = net_uptime();
	2936	if (size) {
	2937	*size = alloc_size;
	2938	}
	2939	_CASSERT(sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]) ==
	2940	sizeof(mb_kmem_stats_labels) / sizeof(mb_kmem_stats_labels[0]));
	2941	switch (error) {
	2942	case KERN_SUCCESS:
	2943	break;
	2944	case KERN_INVALID_ARGUMENT:
	2945	mb_kmem_stats[0]++;
	2946	break;
	2947	case KERN_INVALID_ADDRESS:
	2948	mb_kmem_stats[1]++;
	2949	break;
	2950	case KERN_RESOURCE_SHORTAGE:
	2951	mb_kmem_stats[2]++;
	2952	break;
	2953	case KERN_NO_SPACE:
	2954	mb_kmem_stats[3]++;
	2955	break;
	2956	case KERN_FAILURE:
	2957	mb_kmem_stats[4]++;
	2958	break;
	2959	default:
	2960	mb_kmem_stats[5]++;
	2961	break;
	2962	}
	2963	}
	2964
	2965	static vm_offset_t
	2966	kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
	2967	{
	2968	vm_offset_t addr = 0;
	2969	kern_return_t kr = KERN_SUCCESS;
	2970
	2971	if (!physContig) {
	2972	kr = kernel_memory_allocate(mbmap, &addr, size, 0,
	2973	KMA_KOBJECT \| KMA_LOMEM, VM_KERN_MEMORY_MBUF);
	2974	} else {
	2975	kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
	2976	0, KMA_KOBJECT \| KMA_LOMEM, VM_KERN_MEMORY_MBUF);
	2977	}
	2978
	2979	if (kr != KERN_SUCCESS) {
	2980	addr = 0;
	2981	}
	2982	if (err) {
	2983	*err = kr;
	2984	}
	2985
	2986	return addr;
	2987	}
	2988
	2989	/*
	2990	* Allocate some number of mbuf clusters and place on cluster freelist.
	2991	*/
	2992	static int
	2993	m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
	2994	{
	2995	int i, count = 0;
	2996	vm_size_t size = 0;
	2997	int numpages = 0, large_buffer;
	2998	vm_offset_t page = 0;
	2999	mcache_audit_t *mca_list = NULL;
	3000	mcache_obj_t *con_list = NULL;
	3001	mcl_slab_t *sp;
	3002	mbuf_class_t class;
	3003	kern_return_t error;
	3004
	3005	/* Set if a buffer allocation needs allocation of multiple pages */
	3006	large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
	3007	PAGE_SIZE < M16KCLBYTES);
	3008	VERIFY(bufsize == m_maxsize(MC_BIGCL) \|\|
	3009	bufsize == m_maxsize(MC_16KCL));
	3010
	3011	VERIFY((bufsize == PAGE_SIZE) \|\|
	3012	(bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
	3013
	3014	if (bufsize == m_size(MC_BIGCL)) {
	3015	class = MC_BIGCL;
	3016	} else {
	3017	class = MC_16KCL;
	3018	}
	3019
	3020	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	3021
	3022	/*
	3023	* Multiple threads may attempt to populate the cluster map one
	3024	* after another. Since we drop the lock below prior to acquiring
	3025	* the physical page(s), our view of the cluster map may no longer
	3026	* be accurate, and we could end up over-committing the pages beyond
	3027	* the maximum allowed for each class. To prevent it, this entire
	3028	* operation (including the page mapping) is serialized.
	3029	*/
	3030	while (mb_clalloc_busy) {
	3031	mb_clalloc_waiters++;
	3032	(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
	3033	(PZERO - 1), "m_clalloc", NULL);
	3034	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	3035	}
	3036
	3037	/* We are busy now; tell everyone else to go away */
	3038	mb_clalloc_busy = TRUE;
	3039
	3040	/*
	3041	* Honor the caller's wish to block or not block. We have a way
	3042	* to grow the pool asynchronously using the mbuf worker thread.
	3043	*/
	3044	i = m_howmany(num, bufsize);
	3045	if (i <= 0 \|\| (wait & M_DONTWAIT)) {
	3046	goto out;
	3047	}
	3048
	3049	lck_mtx_unlock(mbuf_mlock);
	3050
	3051	size = round_page(i * bufsize);
	3052	page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
	3053
	3054	/*
	3055	* If we did ask for "n" 16KB physically contiguous chunks
	3056	* and didn't get them, then please try again without this
	3057	* restriction.
	3058	*/
	3059	net_update_uptime();
	3060	if (large_buffer && page == 0) {
	3061	m_vm_error_stats(&mb_kmem_contig_failed,
	3062	&mb_kmem_contig_failed_ts,
	3063	&mb_kmem_contig_failed_size,
	3064	size, error);
	3065	page = kmem_mb_alloc(mb_map, size, 0, &error);
	3066	}
	3067
	3068	if (page == 0) {
	3069	m_vm_error_stats(&mb_kmem_failed,
	3070	&mb_kmem_failed_ts,
	3071	&mb_kmem_failed_size,
	3072	size, error);
	3073	#if PAGE_SIZE == 4096
	3074	if (bufsize == m_maxsize(MC_BIGCL)) {
	3075	#else
	3076	if (bufsize >= m_maxsize(MC_BIGCL)) {
	3077	#endif
	3078	/* Try for 1 page if failed */
	3079	size = PAGE_SIZE;
	3080	page = kmem_mb_alloc(mb_map, size, 0, &error);
	3081	if (page == 0) {
	3082	m_vm_error_stats(&mb_kmem_one_failed,
	3083	&mb_kmem_one_failed_ts,
	3084	NULL, size, error);
	3085	}
	3086	}
	3087
	3088	if (page == 0) {
	3089	lck_mtx_lock(mbuf_mlock);
	3090	goto out;
	3091	}
	3092	}
	3093
	3094	VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
	3095	numpages = size / PAGE_SIZE;
	3096
	3097	/* If auditing is enabled, allocate the audit structures now */
	3098	if (mclaudit != NULL) {
	3099	int needed;
	3100
	3101	/*
	3102	* Yes, I realize this is a waste of memory for clusters
	3103	* that never get transformed into mbufs, as we may end
	3104	* up with NMBPG-1 unused audit structures per cluster.
	3105	* But doing so tremendously simplifies the allocation
	3106	* strategy, since at this point we are not holding the
	3107	* mbuf lock and the caller is okay to be blocked.
	3108	*/
	3109	if (bufsize == PAGE_SIZE) {
	3110	needed = numpages * NMBPG;
	3111
	3112	i = mcache_alloc_ext(mcl_audit_con_cache,
	3113	&con_list, needed, MCR_SLEEP);
	3114
	3115	VERIFY(con_list != NULL && i == needed);
	3116	} else {
	3117	/*
	3118	* if multiple 4K pages are being used for a
	3119	* 16K cluster
	3120	*/
	3121	needed = numpages / NSLABSP16KB;
	3122	}
	3123
	3124	i = mcache_alloc_ext(mcache_audit_cache,
	3125	(mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
	3126
	3127	VERIFY(mca_list != NULL && i == needed);
	3128	}
	3129
	3130	lck_mtx_lock(mbuf_mlock);
	3131
	3132	for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
	3133	ppnum_t offset =
	3134	((unsigned char *)page - mbutl) >> PAGE_SHIFT;
	3135	ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
	3136
	3137	/*
	3138	* If there is a mapper the appropriate I/O page is
	3139	* returned; zero out the page to discard its past
	3140	* contents to prevent exposing leftover kernel memory.
	3141	*/
	3142	VERIFY(offset < mcl_pages);
	3143	if (mcl_paddr_base != 0) {
	3144	bzero((void *)(uintptr_t) page, PAGE_SIZE);
	3145	new_page = IOMapperInsertPage(mcl_paddr_base,
	3146	offset, new_page);
	3147	}
	3148	mcl_paddr[offset] = new_page;
	3149
	3150	/* Pattern-fill this fresh page */
	3151	if (mclverify) {
	3152	mcache_set_pattern(MCACHE_FREE_PATTERN,
	3153	(caddr_t)page, PAGE_SIZE);
	3154	}
	3155	if (bufsize == PAGE_SIZE) {
	3156	mcache_obj_t *buf;
	3157	/* One for the entire page */
	3158	sp = slab_get((void *)page);
	3159	if (mclaudit != NULL) {
	3160	mcl_audit_init((void *)page,
	3161	&mca_list, &con_list,
	3162	AUDIT_CONTENTS_SIZE, NMBPG);
	3163	}
	3164	VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
	3165	slab_init(sp, class, SLF_MAPPED, (void *)page,
	3166	(void *)page, PAGE_SIZE, 0, 1);
	3167	buf = (mcache_obj_t *)page;
	3168	buf->obj_next = NULL;
	3169
	3170	/* Insert this slab */
	3171	slab_insert(sp, class);
	3172
	3173	/* Update stats now since slab_get drops the lock */
	3174	++m_infree(class);
	3175	++m_total(class);
	3176	VERIFY(m_total(class) <= m_maxlimit(class));
	3177	if (class == MC_BIGCL) {
	3178	mbstat.m_bigclfree = m_infree(MC_BIGCL) +
	3179	m_infree(MC_MBUF_BIGCL);
	3180	mbstat.m_bigclusters = m_total(MC_BIGCL);
	3181	}
	3182	++count;
	3183	} else if ((bufsize > PAGE_SIZE) &&
	3184	(i % NSLABSP16KB) == 0) {
	3185	union m16kcluster m16kcl = (union m16kcluster )page;
	3186	mcl_slab_t *nsp;
	3187	int k;
	3188
	3189	/* One for the entire 16KB */
	3190	sp = slab_get(m16kcl);
	3191	if (mclaudit != NULL) {
	3192	mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
	3193	}
	3194
	3195	VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
	3196	slab_init(sp, MC_16KCL, SLF_MAPPED,
	3197	m16kcl, m16kcl, bufsize, 0, 1);
	3198	m16kcl->m16kcl_next = NULL;
	3199
	3200	/*
	3201	* 2nd-Nth page's slab is part of the first one,
	3202	* where N is NSLABSP16KB.
	3203	*/
	3204	for (k = 1; k < NSLABSP16KB; k++) {
	3205	nsp = slab_get(((union mbigcluster *)page) + k);
	3206	VERIFY(nsp->sl_refcnt == 0 &&
	3207	nsp->sl_flags == 0);
	3208	slab_init(nsp, MC_16KCL,
	3209	SLF_MAPPED \| SLF_PARTIAL,
	3210	m16kcl, NULL, 0, 0, 0);
	3211	}
	3212	/* Insert this slab */
	3213	slab_insert(sp, MC_16KCL);
	3214
	3215	/* Update stats now since slab_get drops the lock */
	3216	++m_infree(MC_16KCL);
	3217	++m_total(MC_16KCL);
	3218	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
	3219	++count;
	3220	}
	3221	}
	3222	VERIFY(mca_list == NULL && con_list == NULL);
	3223
	3224	if (!mb_peak_newreport && mbuf_report_usage(class)) {
	3225	mb_peak_newreport = TRUE;
	3226	}
	3227
	3228	/* We're done; let others enter */
	3229	mb_clalloc_busy = FALSE;
	3230	if (mb_clalloc_waiters > 0) {
	3231	mb_clalloc_waiters = 0;
	3232	wakeup(mb_clalloc_waitchan);
	3233	}
	3234
	3235	return count;
	3236	out:
	3237	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	3238
	3239	mtracelarge_register(size);
	3240
	3241	/* We're done; let others enter */
	3242	mb_clalloc_busy = FALSE;
	3243	if (mb_clalloc_waiters > 0) {
	3244	mb_clalloc_waiters = 0;
	3245	wakeup(mb_clalloc_waitchan);
	3246	}
	3247
	3248	/*
	3249	* When non-blocking we kick a thread if we have to grow the
	3250	* pool or if the number of free clusters is less than requested.
	3251	*/
	3252	if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
	3253	mbwdog_logger("waking up the worker thread to to grow %s by %d",
	3254	m_cname(class), i);
	3255	wakeup((caddr_t)&mbuf_worker_needs_wakeup);
	3256	mbuf_worker_needs_wakeup = FALSE;
	3257	}
	3258	if (class == MC_BIGCL) {
	3259	if (i > 0) {
	3260	/*
	3261	* Remember total number of 4KB clusters needed
	3262	* at this time.
	3263	*/
	3264	i += m_total(MC_BIGCL);
	3265	if (i > m_region_expand(MC_BIGCL)) {
	3266	m_region_expand(MC_BIGCL) = i;
	3267	}
	3268	}
	3269	if (m_infree(MC_BIGCL) >= num) {
	3270	return 1;
	3271	}
	3272	} else {
	3273	if (i > 0) {
	3274	/*
	3275	* Remember total number of 16KB clusters needed
	3276	* at this time.
	3277	*/
	3278	i += m_total(MC_16KCL);
	3279	if (i > m_region_expand(MC_16KCL)) {
	3280	m_region_expand(MC_16KCL) = i;
	3281	}
	3282	}
	3283	if (m_infree(MC_16KCL) >= num) {
	3284	return 1;
	3285	}
	3286	}
	3287	return 0;
	3288	}
	3289
	3290	/*
	3291	* Populate the global freelist of the corresponding buffer class.
	3292	*/
	3293	static int
	3294	freelist_populate(mbuf_class_t class, unsigned int num, int wait)
	3295	{
	3296	mcache_obj_t *o = NULL;
	3297	int i, numpages = 0, count;
	3298	mbuf_class_t super_class;
	3299
	3300	VERIFY(class == MC_MBUF \|\| class == MC_CL \|\| class == MC_BIGCL \|\|
	3301	class == MC_16KCL);
	3302
	3303	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	3304
	3305	VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) \|\|
	3306	PAGE_SIZE == m_maxsize(MC_16KCL));
	3307
	3308	if (m_maxsize(class) >= PAGE_SIZE) {
	3309	return m_clalloc(num, wait, m_maxsize(class)) != 0;
	3310	}
	3311
	3312	/*
	3313	* The rest of the function will allocate pages and will slice
	3314	* them up into the right size
	3315	*/
	3316
	3317	numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
	3318
	3319	/* Currently assume that pages are 4K or 16K */
	3320	if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
	3321	super_class = MC_BIGCL;
	3322	} else {
	3323	super_class = MC_16KCL;
	3324	}
	3325
	3326	i = m_clalloc(numpages, wait, m_maxsize(super_class));
	3327
	3328	/* how many objects will we cut the page into? */
	3329	int numobj = PAGE_SIZE / m_maxsize(class);
	3330
	3331	for (count = 0; count < numpages; count++) {
	3332	/* respect totals, minlimit, maxlimit */
	3333	if (m_total(super_class) <= m_minlimit(super_class) \|\|
	3334	m_total(class) >= m_maxlimit(class)) {
	3335	break;
	3336	}
	3337
	3338	if ((o = slab_alloc(super_class, wait)) == NULL) {
	3339	break;
	3340	}
	3341
	3342	struct mbuf m = (struct mbuf )o;
	3343	union mcluster c = (union mcluster )o;
	3344	union mbigcluster mbc = (union mbigcluster )o;
	3345	mcl_slab_t *sp = slab_get(o);
	3346	mcache_audit_t *mca = NULL;
	3347
	3348	/*
	3349	* since one full page will be converted to MC_MBUF or
	3350	* MC_CL, verify that the reference count will match that
	3351	* assumption
	3352	*/
	3353	VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
	3354	VERIFY((sp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) == SLF_MAPPED);
	3355	/*
	3356	* Make sure that the cluster is unmolested
	3357	* while in freelist
	3358	*/
	3359	if (mclverify) {
	3360	mca = mcl_audit_buf2mca(super_class,
	3361	(mcache_obj_t *)o);
	3362	mcache_audit_free_verify(mca,
	3363	(mcache_obj_t *)o, 0, m_maxsize(super_class));
	3364	}
	3365
	3366	/* Reinitialize it as an mbuf or 2K or 4K slab */
	3367	slab_init(sp, class, sp->sl_flags,
	3368	sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
	3369
	3370	VERIFY(sp->sl_head == NULL);
	3371
	3372	VERIFY(m_total(super_class) >= 1);
	3373	m_total(super_class)--;
	3374
	3375	if (super_class == MC_BIGCL) {
	3376	mbstat.m_bigclusters = m_total(MC_BIGCL);
	3377	}
	3378
	3379	m_total(class) += numobj;
	3380	VERIFY(m_total(class) <= m_maxlimit(class));
	3381	m_infree(class) += numobj;
	3382
	3383	if (!mb_peak_newreport && mbuf_report_usage(class)) {
	3384	mb_peak_newreport = TRUE;
	3385	}
	3386
	3387	i = numobj;
	3388	if (class == MC_MBUF) {
	3389	mbstat.m_mbufs = m_total(MC_MBUF);
	3390	mtype_stat_add(MT_FREE, NMBPG);
	3391	while (i--) {
	3392	/*
	3393	* If auditing is enabled, construct the
	3394	* shadow mbuf in the audit structure
	3395	* instead of the actual one.
	3396	* mbuf_slab_audit() will take care of
	3397	* restoring the contents after the
	3398	* integrity check.
	3399	*/
	3400	if (mclaudit != NULL) {
	3401	struct mbuf *ms;
	3402	mca = mcl_audit_buf2mca(MC_MBUF,
	3403	(mcache_obj_t *)m);
	3404	ms = MCA_SAVED_MBUF_PTR(mca);
	3405	ms->m_type = MT_FREE;
	3406	} else {
	3407	m->m_type = MT_FREE;
	3408	}
	3409	m->m_next = sp->sl_head;
	3410	sp->sl_head = (void *)m++;
	3411	}
	3412	} else if (class == MC_CL) { /* MC_CL */
	3413	mbstat.m_clfree =
	3414	m_infree(MC_CL) + m_infree(MC_MBUF_CL);
	3415	mbstat.m_clusters = m_total(MC_CL);
	3416	while (i--) {
	3417	c->mcl_next = sp->sl_head;
	3418	sp->sl_head = (void *)c++;
	3419	}
	3420	} else {
	3421	VERIFY(class == MC_BIGCL);
	3422	mbstat.m_bigclusters = m_total(MC_BIGCL);
	3423	mbstat.m_bigclfree = m_infree(MC_BIGCL) +
	3424	m_infree(MC_MBUF_BIGCL);
	3425	while (i--) {
	3426	mbc->mbc_next = sp->sl_head;
	3427	sp->sl_head = (void *)mbc++;
	3428	}
	3429	}
	3430
	3431	/* Insert into the mbuf or 2k or 4k slab list */
	3432	slab_insert(sp, class);
	3433
	3434	if ((i = mb_waiters) > 0) {
	3435	mb_waiters = 0;
	3436	}
	3437	if (i != 0) {
	3438	mbwdog_logger("waking up all threads");
	3439	wakeup(mb_waitchan);
	3440	}
	3441	}
	3442	return count != 0;
	3443	}
	3444
	3445	/*
	3446	* For each class, initialize the freelist to hold m_minlimit() objects.
	3447	*/
	3448	static void
	3449	freelist_init(mbuf_class_t class)
	3450	{
	3451	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	3452
	3453	VERIFY(class == MC_CL \|\| class == MC_BIGCL);
	3454	VERIFY(m_total(class) == 0);
	3455	VERIFY(m_minlimit(class) > 0);
	3456
	3457	while (m_total(class) < m_minlimit(class)) {
	3458	(void) freelist_populate(class, m_minlimit(class), M_WAIT);
	3459	}
	3460
	3461	VERIFY(m_total(class) >= m_minlimit(class));
	3462	}
	3463
	3464	/*
	3465	* (Inaccurately) check if it might be worth a trip back to the
	3466	* mcache layer due the availability of objects there. We'll
	3467	* end up back here if there's nothing up there.
	3468	*/
	3469	static boolean_t
	3470	mbuf_cached_above(mbuf_class_t class, int wait)
	3471	{
	3472	switch (class) {
	3473	case MC_MBUF:
	3474	if (wait & MCR_COMP) {
	3475	return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) \|\|
	3476	!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
	3477	}
	3478	break;
	3479
	3480	case MC_CL:
	3481	if (wait & MCR_COMP) {
	3482	return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
	3483	}
	3484	break;
	3485
	3486	case MC_BIGCL:
	3487	if (wait & MCR_COMP) {
	3488	return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
	3489	}
	3490	break;
	3491
	3492	case MC_16KCL:
	3493	if (wait & MCR_COMP) {
	3494	return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
	3495	}
	3496	break;
	3497
	3498	case MC_MBUF_CL:
	3499	case MC_MBUF_BIGCL:
	3500	case MC_MBUF_16KCL:
	3501	break;
	3502
	3503	default:
	3504	VERIFY(0);
	3505	/* NOTREACHED */
	3506	}
	3507
	3508	return !mcache_bkt_isempty(m_cache(class));
	3509	}
	3510
	3511	/*
	3512	* If possible, convert constructed objects to raw ones.
	3513	*/
	3514	static boolean_t
	3515	mbuf_steal(mbuf_class_t class, unsigned int num)
	3516	{
	3517	mcache_obj_t *top = NULL;
	3518	mcache_obj_t **list = &top;
	3519	unsigned int tot = 0;
	3520
	3521	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	3522
	3523	switch (class) {
	3524	case MC_MBUF:
	3525	case MC_CL:
	3526	case MC_BIGCL:
	3527	case MC_16KCL:
	3528	return FALSE;
	3529
	3530	case MC_MBUF_CL:
	3531	case MC_MBUF_BIGCL:
	3532	case MC_MBUF_16KCL:
	3533	/* Get the required number of constructed objects if possible */
	3534	if (m_infree(class) > m_minlimit(class)) {
	3535	tot = cslab_alloc(class, &list,
	3536	MIN(num, m_infree(class)));
	3537	}
	3538
	3539	/* And destroy them to get back the raw objects */
	3540	if (top != NULL) {
	3541	(void) cslab_free(class, top, 1);
	3542	}
	3543	break;
	3544
	3545	default:
	3546	VERIFY(0);
	3547	/* NOTREACHED */
	3548	}
	3549
	3550	return tot == num;
	3551	}
	3552
	3553	static void
	3554	m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
	3555	{
	3556	int m, bmap = 0;
	3557
	3558	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	3559
	3560	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	3561	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	3562	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
	3563
	3564	/*
	3565	* This logic can be made smarter; for now, simply mark
	3566	* all other related classes as potential victims.
	3567	*/
	3568	switch (class) {
	3569	case MC_MBUF:
	3570	m_wantpurge(MC_CL)++;
	3571	m_wantpurge(MC_BIGCL)++;
	3572	m_wantpurge(MC_MBUF_CL)++;
	3573	m_wantpurge(MC_MBUF_BIGCL)++;
	3574	break;
	3575
	3576	case MC_CL:
	3577	m_wantpurge(MC_MBUF)++;
	3578	m_wantpurge(MC_BIGCL)++;
	3579	m_wantpurge(MC_MBUF_BIGCL)++;
	3580	if (!comp) {
	3581	m_wantpurge(MC_MBUF_CL)++;
	3582	}
	3583	break;
	3584
	3585	case MC_BIGCL:
	3586	m_wantpurge(MC_MBUF)++;
	3587	m_wantpurge(MC_CL)++;
	3588	m_wantpurge(MC_MBUF_CL)++;
	3589	if (!comp) {
	3590	m_wantpurge(MC_MBUF_BIGCL)++;
	3591	}
	3592	break;
	3593
	3594	case MC_16KCL:
	3595	if (!comp) {
	3596	m_wantpurge(MC_MBUF_16KCL)++;
	3597	}
	3598	break;
	3599
	3600	default:
	3601	VERIFY(0);
	3602	/* NOTREACHED */
	3603	}
	3604
	3605	/*
	3606	* Run through each marked class and check if we really need to
	3607	* purge (and therefore temporarily disable) the per-CPU caches
	3608	* layer used by the class. If so, remember the classes since
	3609	* we are going to drop the lock below prior to purging.
	3610	*/
	3611	for (m = 0; m < NELEM(mbuf_table); m++) {
	3612	if (m_wantpurge(m) > 0) {
	3613	m_wantpurge(m) = 0;
	3614	/*
	3615	* Try hard to steal the required number of objects
	3616	* from the freelist of other mbuf classes. Only
	3617	* purge and disable the per-CPU caches layer when
	3618	* we don't have enough; it's the last resort.
	3619	*/
	3620	if (!mbuf_steal(m, num)) {
	3621	bmap \|= (1 << m);
	3622	}
	3623	}
	3624	}
	3625
	3626	lck_mtx_unlock(mbuf_mlock);
	3627
	3628	if (bmap != 0) {
	3629	/* signal the domains to drain */
	3630	net_drain_domains();
	3631
	3632	/* Sigh; we have no other choices but to ask mcache to purge */
	3633	for (m = 0; m < NELEM(mbuf_table); m++) {
	3634	if ((bmap & (1 << m)) &&
	3635	mcache_purge_cache(m_cache(m), TRUE)) {
	3636	lck_mtx_lock(mbuf_mlock);
	3637	m_purge_cnt(m)++;
	3638	mbstat.m_drain++;
	3639	lck_mtx_unlock(mbuf_mlock);
	3640	}
	3641	}
	3642	} else {
	3643	/*
	3644	* Request mcache to reap extra elements from all of its caches;
	3645	* note that all reaps are serialized and happen only at a fixed
	3646	* interval.
	3647	*/
	3648	mcache_reap();
	3649	}
	3650	lck_mtx_lock(mbuf_mlock);
	3651	}
	3652
	3653	static inline struct mbuf *
	3654	m_get_common(int wait, short type, int hdr)
	3655	{
	3656	struct mbuf *m;
	3657	int mcflags = MSLEEPF(wait);
	3658
	3659	/* Is this due to a non-blocking retry? If so, then try harder */
	3660	if (mcflags & MCR_NOSLEEP) {
	3661	mcflags \|= MCR_TRYHARD;
	3662	}
	3663
	3664	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
	3665	if (m != NULL) {
	3666	MBUF_INIT(m, hdr, type);
	3667	mtype_stat_inc(type);
	3668	mtype_stat_dec(MT_FREE);
	3669	}
	3670	return m;
	3671	}
	3672
	3673	/*
	3674	* Space allocation routines; these are also available as macros
	3675	* for critical paths.
	3676	*/
	/*
	 * The last argument given to m_get_common() is its hdr parameter, which
	 * that function forwards to MBUF_INIT(): 0 for the plain getters and 1
	 * for the *HDR forms.  The _M_RETRY* macros expand to the corresponding
	 * plain getters, and the _MGET* macros assign the result to (m).
	 */
	3677	#define _M_GET(wait, type) m_get_common(wait, type, 0)
	3678	#define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
	3679	#define _M_RETRY(wait, type) _M_GET(wait, type)
	3680	#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
	3681	#define _MGET(m, how, type) ((m) = _M_GET(how, type))
	3682	#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
	3683
	/*
	 * Function form of _M_GET(): obtain one mbuf of the given type.
	 * Returns whatever _M_GET() yields, which may be NULL.
	 */
	struct mbuf *
	m_get(int wait, int type)
	{
		struct mbuf *result = _M_GET(wait, type);

		return result;
	}
	3689
	/*
	 * Function form of _M_GETHDR(): obtain one mbuf of the given type with
	 * the hdr argument set.  Returns whatever _M_GETHDR() yields, which may
	 * be NULL.
	 */
	struct mbuf *
	m_gethdr(int wait, int type)
	{
		struct mbuf *result = _M_GETHDR(wait, type);

		return result;
	}
	3695
	/*
	 * Function form of _M_RETRY(), which expands to the same allocation
	 * as _M_GET().  Returns the resulting mbuf, which may be NULL.
	 */
	struct mbuf *
	m_retry(int wait, int type)
	{
		struct mbuf *result = _M_RETRY(wait, type);

		return result;
	}
	3701
	/*
	 * Function form of _M_RETRYHDR(), which expands to the same allocation
	 * as _M_GETHDR().  Returns the resulting mbuf, which may be NULL.
	 */
	struct mbuf *
	m_retryhdr(int wait, int type)
	{
		struct mbuf *result = _M_RETRYHDR(wait, type);

		return result;
	}
	3707
	3708	struct mbuf *
	3709	m_getclr(int wait, int type)
	3710	{
	3711	struct mbuf *m;
	3712
	3713	_MGET(m, wait, type);
	3714	if (m != NULL) {
	3715	bzero(MTOD(m, caddr_t), MLEN);
	3716	}
	3717	return m;
	3718	}
	3719
	3720	static int
	3721	m_free_paired(struct mbuf *m)
	3722	{
	3723	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
	3724
	3725	membar_sync();
	3726	if (MEXT_PMBUF(m) == m) {
	3727	volatile UInt16 addr = (volatile UInt16 )&MEXT_PREF(m);
	3728	int16_t oprefcnt, prefcnt;
	3729
	3730	/*
	3731	* Paired ref count might be negative in case we lose
	3732	* against another thread clearing MEXT_PMBUF, in the
	3733	* event it occurs after the above memory barrier sync.
	3734	* In that case just ignore as things have been unpaired.
	3735	*/
	3736	do {
	3737	oprefcnt = *addr;
	3738	prefcnt = oprefcnt - 1;
	3739	} while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));
	3740
	3741	if (prefcnt > 1) {
	3742	return 1;
	3743	} else if (prefcnt == 1) {
	3744	(*(m_get_ext_free(m)))(m->m_ext.ext_buf,
	3745	m->m_ext.ext_size, m_get_ext_arg(m));
	3746	return 1;
	3747	} else if (prefcnt == 0) {
	3748	VERIFY(MBUF_IS_PAIRED(m));
	3749
	3750	/*
	3751	* Restore minref to its natural value, so that
	3752	* the caller will be able to free the cluster
	3753	* as appropriate.
	3754	*/
	3755	MEXT_MINREF(m) = 0;
	3756
	3757	/*
	3758	* Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
	3759	* as it is immutable. atomic_set_ptr also causes
	3760	* memory barrier sync.
	3761	*/
	3762	atomic_set_ptr(&MEXT_PMBUF(m), NULL);
	3763
	3764	switch (m->m_ext.ext_size) {
	3765	case MCLBYTES:
	3766	m_set_ext(m, m_get_rfa(m), NULL, NULL);
	3767	break;
	3768
	3769	case MBIGCLBYTES:
	3770	m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
	3771	break;
	3772
	3773	case M16KCLBYTES:
	3774	m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
	3775	break;
	3776
	3777	default:
	3778	VERIFY(0);
	3779	/* NOTREACHED */
	3780	}
	3781	}
	3782	}
	3783
	3784	/*
	3785	* Tell caller the unpair has occurred, and that the reference
	3786	* count on the external cluster held for the paired mbuf should
	3787	* now be dropped.
	3788	*/
	3789	return 0;
	3790	}
	3791
	3792	struct mbuf *
	3793	m_free(struct mbuf *m)
	3794	{
	3795	struct mbuf *n = m->m_next;
	3796
	3797	if (m->m_type == MT_FREE) {
	3798	panic("m_free: freeing an already freed mbuf");
	3799	}
	3800
	3801	if (m->m_flags & M_PKTHDR) {
	3802	/* Check for scratch area overflow */
	3803	m_redzone_verify(m);
	3804	/* Free the aux data and tags if there is any */
	3805	m_tag_delete_chain(m, NULL);
	3806
	3807	m_do_tx_compl_callback(m, NULL);
	3808	}
	3809
	3810	if (m->m_flags & M_EXT) {
	3811	uint16_t refcnt;
	3812	uint32_t composite;
	3813	m_ext_free_func_t m_free_func;
	3814
	3815	if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
	3816	return n;
	3817	}
	3818
	3819	refcnt = m_decref(m);
	3820	composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
	3821	m_free_func = m_get_ext_free(m);
	3822
	3823	if (refcnt == MEXT_MINREF(m) && !composite) {
	3824	if (m_free_func == NULL) {
	3825	mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
	3826	} else if (m_free_func == m_bigfree) {
	3827	mcache_free(m_cache(MC_BIGCL),
	3828	m->m_ext.ext_buf);
	3829	} else if (m_free_func == m_16kfree) {
	3830	mcache_free(m_cache(MC_16KCL),
	3831	m->m_ext.ext_buf);
	3832	} else {
	3833	(*m_free_func)(m->m_ext.ext_buf,
	3834	m->m_ext.ext_size, m_get_ext_arg(m));
	3835	}
	3836	mcache_free(ref_cache, m_get_rfa(m));
	3837	m_set_ext(m, NULL, NULL, NULL);
	3838	} else if (refcnt == MEXT_MINREF(m) && composite) {
	3839	VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
	3840	VERIFY(m->m_type != MT_FREE);
	3841
	3842	mtype_stat_dec(m->m_type);
	3843	mtype_stat_inc(MT_FREE);
	3844
	3845	m->m_type = MT_FREE;
	3846	m->m_flags = M_EXT;
	3847	m->m_len = 0;
	3848	m->m_next = m->m_nextpkt = NULL;
	3849
	3850	MEXT_FLAGS(m) &= ~EXTF_READONLY;
	3851
	3852	/* "Free" into the intermediate cache */
	3853	if (m_free_func == NULL) {
	3854	mcache_free(m_cache(MC_MBUF_CL), m);
	3855	} else if (m_free_func == m_bigfree) {
	3856	mcache_free(m_cache(MC_MBUF_BIGCL), m);
	3857	} else {
	3858	VERIFY(m_free_func == m_16kfree);
	3859	mcache_free(m_cache(MC_MBUF_16KCL), m);
	3860	}
	3861	return n;
	3862	}
	3863	}
	3864
	3865	if (m->m_type != MT_FREE) {
	3866	mtype_stat_dec(m->m_type);
	3867	mtype_stat_inc(MT_FREE);
	3868	}
	3869
	3870	m->m_type = MT_FREE;
	3871	m->m_flags = m->m_len = 0;
	3872	m->m_next = m->m_nextpkt = NULL;
	3873
	3874	mcache_free(m_cache(MC_MBUF), m);
	3875
	3876	return n;
	3877	}
	3878
	3879	__private_extern__ struct mbuf *
	3880	m_clattach(struct mbuf *m, int type, caddr_t extbuf,
	3881	void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
	3882	int wait, int pair)
	3883	{
	3884	struct ext_ref *rfa = NULL;
	3885
	3886	/*
	3887	* If pairing is requested and an existing mbuf is provided, reject
	3888	* it if it's already been paired to another cluster. Otherwise,
	3889	* allocate a new one or free any existing below.
	3890	*/
	3891	if ((m != NULL && MBUF_IS_PAIRED(m)) \|\|
	3892	(m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
	3893	return NULL;
	3894	}
	3895
	3896	if (m->m_flags & M_EXT) {
	3897	u_int16_t refcnt;
	3898	u_int32_t composite;
	3899	m_ext_free_func_t m_free_func;
	3900
	3901	refcnt = m_decref(m);
	3902	composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
	3903	VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
	3904	m_free_func = m_get_ext_free(m);
	3905	if (refcnt == MEXT_MINREF(m) && !composite) {
	3906	if (m_free_func == NULL) {
	3907	mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
	3908	} else if (m_free_func == m_bigfree) {
	3909	mcache_free(m_cache(MC_BIGCL),
	3910	m->m_ext.ext_buf);
	3911	} else if (m_free_func == m_16kfree) {
	3912	mcache_free(m_cache(MC_16KCL),
	3913	m->m_ext.ext_buf);
	3914	} else {
	3915	(*m_free_func)(m->m_ext.ext_buf,
	3916	m->m_ext.ext_size, m_get_ext_arg(m));
	3917	}
	3918	/* Re-use the reference structure */
	3919	rfa = m_get_rfa(m);
	3920	} else if (refcnt == MEXT_MINREF(m) && composite) {
	3921	VERIFY(m->m_type != MT_FREE);
	3922
	3923	mtype_stat_dec(m->m_type);
	3924	mtype_stat_inc(MT_FREE);
	3925
	3926	m->m_type = MT_FREE;
	3927	m->m_flags = M_EXT;
	3928	m->m_len = 0;
	3929	m->m_next = m->m_nextpkt = NULL;
	3930
	3931	MEXT_FLAGS(m) &= ~EXTF_READONLY;
	3932
	3933	/* "Free" into the intermediate cache */
	3934	if (m_free_func == NULL) {
	3935	mcache_free(m_cache(MC_MBUF_CL), m);
	3936	} else if (m_free_func == m_bigfree) {
	3937	mcache_free(m_cache(MC_MBUF_BIGCL), m);
	3938	} else {
	3939	VERIFY(m_free_func == m_16kfree);
	3940	mcache_free(m_cache(MC_MBUF_16KCL), m);
	3941	}
	3942	/*
	3943	* Allocate a new mbuf, since we didn't divorce
	3944	* the composite mbuf + cluster pair above.
	3945	*/
	3946	if ((m = _M_GETHDR(wait, type)) == NULL) {
	3947	return NULL;
	3948	}
	3949	}
	3950	}
	3951
	3952	if (rfa == NULL &&
	3953	(rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
	3954	m_free(m);
	3955	return NULL;
	3956	}
	3957
	3958	if (!pair) {
	3959	MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
	3960	0, 1, 0, 0, 0, NULL);
	3961	} else {
	3962	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	3963	1, 1, 1, EXTF_PAIRED, 0, m);
	3964	}
	3965
	3966	return m;
	3967	}
	3968
	3969	/*
	3970	* Perform `fast' allocation mbuf clusters from a cache of recently-freed
	3971	* clusters. (If the cache is empty, new clusters are allocated en-masse.)
	3972	*/
	3973	struct mbuf *
	3974	m_getcl(int wait, int type, int flags)
	3975	{
	3976	struct mbuf *m;
	3977	int mcflags = MSLEEPF(wait);
	3978	int hdr = (flags & M_PKTHDR);
	3979
	3980	/* Is this due to a non-blocking retry? If so, then try harder */
	3981	if (mcflags & MCR_NOSLEEP) {
	3982	mcflags \|= MCR_TRYHARD;
	3983	}
	3984
	3985	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
	3986	if (m != NULL) {
	3987	u_int16_t flag;
	3988	struct ext_ref *rfa;
	3989	void *cl;
	3990
	3991	VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
	3992	cl = m->m_ext.ext_buf;
	3993	rfa = m_get_rfa(m);
	3994
	3995	ASSERT(cl != NULL && rfa != NULL);
	3996	VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
	3997
	3998	flag = MEXT_FLAGS(m);
	3999
	4000	MBUF_INIT(m, hdr, type);
	4001	MBUF_CL_INIT(m, cl, rfa, 1, flag);
	4002
	4003	mtype_stat_inc(type);
	4004	mtype_stat_dec(MT_FREE);
	4005	}
	4006	return m;
	4007	}
	4008
	4009	/* m_mclget() add an mbuf cluster to a normal mbuf */
	4010	struct mbuf *
	4011	m_mclget(struct mbuf *m, int wait)
	4012	{
	4013	struct ext_ref *rfa;
	4014
	4015	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
	4016	return m;
	4017	}
	4018
	4019	m->m_ext.ext_buf = m_mclalloc(wait);
	4020	if (m->m_ext.ext_buf != NULL) {
	4021	MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	4022	} else {
	4023	mcache_free(ref_cache, rfa);
	4024	}
	4025	return m;
	4026	}
	4027
	4028	/* Allocate an mbuf cluster */
	4029	caddr_t
	4030	m_mclalloc(int wait)
	4031	{
	4032	int mcflags = MSLEEPF(wait);
	4033
	4034	/* Is this due to a non-blocking retry? If so, then try harder */
	4035	if (mcflags & MCR_NOSLEEP) {
	4036	mcflags \|= MCR_TRYHARD;
	4037	}
	4038
	4039	return mcache_alloc(m_cache(MC_CL), mcflags);
	4040	}
	4041
	4042	/* Free an mbuf cluster */
	4043	void
	4044	m_mclfree(caddr_t p)
	4045	{
	4046	mcache_free(m_cache(MC_CL), p);
	4047	}
	4048
	4049	/*
	4050	* mcl_hasreference() checks if a cluster of an mbuf is referenced by
	4051	* another mbuf; see comments in m_incref() regarding EXTF_READONLY.
	4052	*/
	4053	int
	4054	m_mclhasreference(struct mbuf *m)
	4055	{
	4056	if (!(m->m_flags & M_EXT)) {
	4057	return 0;
	4058	}
	4059
	4060	ASSERT(m_get_rfa(m) != NULL);
	4061
	4062	return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
	4063	}
	4064
	4065	__private_extern__ caddr_t
	4066	m_bigalloc(int wait)
	4067	{
	4068	int mcflags = MSLEEPF(wait);
	4069
	4070	/* Is this due to a non-blocking retry? If so, then try harder */
	4071	if (mcflags & MCR_NOSLEEP) {
	4072	mcflags \|= MCR_TRYHARD;
	4073	}
	4074
	4075	return mcache_alloc(m_cache(MC_BIGCL), mcflags);
	4076	}
	4077
	4078	__private_extern__ void
	4079	m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
	4080	{
	4081	mcache_free(m_cache(MC_BIGCL), p);
	4082	}
	4083
	4084	/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
	4085	__private_extern__ struct mbuf *
	4086	m_mbigget(struct mbuf *m, int wait)
	4087	{
	4088	struct ext_ref *rfa;
	4089
	4090	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
	4091	return m;
	4092	}
	4093
	4094	m->m_ext.ext_buf = m_bigalloc(wait);
	4095	if (m->m_ext.ext_buf != NULL) {
	4096	MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	4097	} else {
	4098	mcache_free(ref_cache, rfa);
	4099	}
	4100	return m;
	4101	}
	4102
	4103	__private_extern__ caddr_t
	4104	m_16kalloc(int wait)
	4105	{
	4106	int mcflags = MSLEEPF(wait);
	4107
	4108	/* Is this due to a non-blocking retry? If so, then try harder */
	4109	if (mcflags & MCR_NOSLEEP) {
	4110	mcflags \|= MCR_TRYHARD;
	4111	}
	4112
	4113	return mcache_alloc(m_cache(MC_16KCL), mcflags);
	4114	}
	4115
	4116	__private_extern__ void
	4117	m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
	4118	{
	4119	mcache_free(m_cache(MC_16KCL), p);
	4120	}
	4121
	4122	/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
	4123	__private_extern__ struct mbuf *
	4124	m_m16kget(struct mbuf *m, int wait)
	4125	{
	4126	struct ext_ref *rfa;
	4127
	4128	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
	4129	return m;
	4130	}
	4131
	4132	m->m_ext.ext_buf = m_16kalloc(wait);
	4133	if (m->m_ext.ext_buf != NULL) {
	4134	MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	4135	} else {
	4136	mcache_free(ref_cache, rfa);
	4137	}
	4138	return m;
	4139	}
	4140
	4141	/*
	4142	* "Move" mbuf pkthdr from "from" to "to".
	4143	* "from" must have M_PKTHDR set, and "to" must be empty.
	4144	*/
	4145	void
	4146	m_copy_pkthdr(struct mbuf to, struct mbuf from)
	4147	{
	4148	VERIFY(from->m_flags & M_PKTHDR);
	4149
	4150	/* Check for scratch area overflow */
	4151	m_redzone_verify(from);
	4152
	4153	if (to->m_flags & M_PKTHDR) {
	4154	/* Check for scratch area overflow */
	4155	m_redzone_verify(to);
	4156	/* We will be taking over the tags of 'to' */
	4157	m_tag_delete_chain(to, NULL);
	4158	}
	4159	to->m_pkthdr = from->m_pkthdr; /* especially tags */
	4160	m_classifier_init(from, 0); /* purge classifier info */
	4161	m_tag_init(from, 1); /* purge all tags from src */
	4162	m_scratch_init(from); /* clear src scratch area */
	4163	to->m_flags = (from->m_flags & M_COPYFLAGS) \| (to->m_flags & M_EXT);
	4164	if ((to->m_flags & M_EXT) == 0) {
	4165	to->m_data = to->m_pktdat;
	4166	}
	4167	m_redzone_init(to); /* setup red zone on dst */
	4168	}
	4169
	4170	/*
	4171	* Duplicate "from"'s mbuf pkthdr in "to".
	4172	* "from" must have M_PKTHDR set, and "to" must be empty.
	4173	* In particular, this does a deep copy of the packet tags.
	4174	*/
	4175	static int
	4176	m_dup_pkthdr(struct mbuf to, struct mbuf from, int how)
	4177	{
	4178	VERIFY(from->m_flags & M_PKTHDR);
	4179
	4180	/* Check for scratch area overflow */
	4181	m_redzone_verify(from);
	4182
	4183	if (to->m_flags & M_PKTHDR) {
	4184	/* Check for scratch area overflow */
	4185	m_redzone_verify(to);
	4186	/* We will be taking over the tags of 'to' */
	4187	m_tag_delete_chain(to, NULL);
	4188	}
	4189	to->m_flags = (from->m_flags & M_COPYFLAGS) \| (to->m_flags & M_EXT);
	4190	if ((to->m_flags & M_EXT) == 0) {
	4191	to->m_data = to->m_pktdat;
	4192	}
	4193	to->m_pkthdr = from->m_pkthdr;
	4194	m_redzone_init(to); /* setup red zone on dst */
	4195	m_tag_init(to, 0); /* preserve dst static tags */
	4196	return m_tag_copy_chain(to, from, how);
	4197	}
	4198
	4199	void
	4200	m_copy_pftag(struct mbuf to, struct mbuf from)
	4201	{
	4202	memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
	4203	#if PF_ECN
	4204	m_pftag(to)->pftag_hdr = NULL;
	4205	m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET \| PF_TAG_HDR_INET6);
	4206	#endif /* PF_ECN */
	4207	}
	4208
	4209	void
	4210	m_copy_necptag(struct mbuf to, struct mbuf from)
	4211	{
	4212	memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
	4213	}
	4214
	4215	void
	4216	m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
	4217	{
	4218	VERIFY(m->m_flags & M_PKTHDR);
	4219
	4220	m->m_pkthdr.pkt_proto = 0;
	4221	m->m_pkthdr.pkt_flowsrc = 0;
	4222	m->m_pkthdr.pkt_flowid = 0;
	4223	m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
	4224	/* preserve service class and interface info for loopback packets */
	4225	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
	4226	(void) m_set_service_class(m, MBUF_SC_BE);
	4227	}
	4228	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
	4229	m->m_pkthdr.pkt_ifainfo = 0;
	4230	}
	4231	/*
	4232	* Preserve timestamp if requested
	4233	*/
	4234	if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
	4235	m->m_pkthdr.pkt_timestamp = 0;
	4236	}
	4237	}
	4238
	4239	void
	4240	m_copy_classifier(struct mbuf to, struct mbuf from)
	4241	{
	4242	VERIFY(to->m_flags & M_PKTHDR);
	4243	VERIFY(from->m_flags & M_PKTHDR);
	4244
	4245	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
	4246	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
	4247	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
	4248	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
	4249	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
	4250	to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
	4251	}
	4252
	4253	/*
	4254	* Return a list of mbuf hdrs that point to clusters. Try for num_needed;
	4255	* if wantall is not set, return whatever number were available. Set up the
	4256	* first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
	4257	* are chained on the m_nextpkt field. Any packets requested beyond this
	4258	* are chained onto the last packet header's m_next field. The size of
	4259	* the cluster is controlled by the parameter bufsize.
	4260	*/
	4261	__private_extern__ struct mbuf *
	4262	m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
	4263	int wait, int wantall, size_t bufsize)
	4264	{
	4265	struct mbuf *m;
	4266	struct mbuf *np, top;
	4267	unsigned int pnum, needed = *num_needed;
	4268	mcache_obj_t *mp_list = NULL;
	4269	int mcflags = MSLEEPF(wait);
	4270	u_int16_t flag;
	4271	struct ext_ref *rfa;
	4272	mcache_t *cp;
	4273	void *cl;
	4274
	4275	ASSERT(bufsize == m_maxsize(MC_CL) \|\|
	4276	bufsize == m_maxsize(MC_BIGCL) \|\|
	4277	bufsize == m_maxsize(MC_16KCL));
	4278
	4279	/*
	4280	* Caller must first check for njcl because this
	4281	* routine is internal and not exposed/used via KPI.
	4282	*/
	4283	VERIFY(bufsize != m_maxsize(MC_16KCL) \|\| njcl > 0);
	4284
	4285	top = NULL;
	4286	np = &top;
	4287	pnum = 0;
	4288
	4289	/*
	4290	* The caller doesn't want all the requested buffers; only some.
	4291	* Try hard to get what we can, but don't block. This effectively
	4292	* overrides MCR_SLEEP, since this thread will not go to sleep
	4293	* if we can't get all the buffers.
	4294	*/
	4295	if (!wantall \|\| (mcflags & MCR_NOSLEEP)) {
	4296	mcflags \|= MCR_TRYHARD;
	4297	}
	4298
	4299	/* Allocate the composite mbuf + cluster elements from the cache */
	4300	if (bufsize == m_maxsize(MC_CL)) {
	4301	cp = m_cache(MC_MBUF_CL);
	4302	} else if (bufsize == m_maxsize(MC_BIGCL)) {
	4303	cp = m_cache(MC_MBUF_BIGCL);
	4304	} else {
	4305	cp = m_cache(MC_MBUF_16KCL);
	4306	}
	4307	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
	4308
	4309	for (pnum = 0; pnum < needed; pnum++) {
	4310	m = (struct mbuf *)mp_list;
	4311	mp_list = mp_list->obj_next;
	4312
	4313	VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
	4314	cl = m->m_ext.ext_buf;
	4315	rfa = m_get_rfa(m);
	4316
	4317	ASSERT(cl != NULL && rfa != NULL);
	4318	VERIFY(MBUF_IS_COMPOSITE(m));
	4319
	4320	flag = MEXT_FLAGS(m);
	4321
	4322	MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
	4323	if (bufsize == m_maxsize(MC_16KCL)) {
	4324	MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
	4325	} else if (bufsize == m_maxsize(MC_BIGCL)) {
	4326	MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
	4327	} else {
	4328	MBUF_CL_INIT(m, cl, rfa, 1, flag);
	4329	}
	4330
	4331	if (num_with_pkthdrs > 0) {
	4332	--num_with_pkthdrs;
	4333	}
	4334
	4335	*np = m;
	4336	if (num_with_pkthdrs > 0) {
	4337	np = &m->m_nextpkt;
	4338	} else {
	4339	np = &m->m_next;
	4340	}
	4341	}
	4342	ASSERT(pnum != *num_needed \|\| mp_list == NULL);
	4343	if (mp_list != NULL) {
	4344	mcache_free_ext(cp, mp_list);
	4345	}
	4346
	4347	if (pnum > 0) {
	4348	mtype_stat_add(MT_DATA, pnum);
	4349	mtype_stat_sub(MT_FREE, pnum);
	4350	}
	4351
	4352	if (wantall && (pnum != *num_needed)) {
	4353	if (top != NULL) {
	4354	m_freem_list(top);
	4355	}
	4356	return NULL;
	4357	}
	4358
	4359	if (pnum > *num_needed) {
	4360	printf("%s: File a radar related to <rdar://10146739>. \
	4361	needed = %u, pnum = %u, num_needed = %u \n",
	4362	__func__, needed, pnum, *num_needed);
	4363	}
	4364
	4365	*num_needed = pnum;
	4366	return top;
	4367	}
	4368
	4369	/*
	4370	* Return list of mbuf linked by m_nextpkt. Try for numlist, and if
	4371	* wantall is not set, return whatever number were available. The size of
	4372	* each mbuf in the list is controlled by the parameter packetlen. Each
	4373	* mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
	4374	* in the chain is called a segment. If maxsegments is not null and the
	4375	* value pointed to is not null, this specify the maximum number of segments
	4376	* for a chain of mbufs. If maxsegments is zero or the value pointed to
	4377	* is zero the caller does not have any restriction on the number of segments.
	4378	* The actual number of segments of a mbuf chain is return in the value
	4379	* pointed to by maxsegments.
	4380	*/
	4381	__private_extern__ struct mbuf *
	4382	m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
	4383	unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
	4384	{
	4385	struct mbuf *np, top, *first = NULL;
	4386	size_t bufsize, r_bufsize;
	4387	unsigned int num = 0;
	4388	unsigned int nsegs = 0;
	4389	unsigned int needed, resid;
	4390	int mcflags = MSLEEPF(wait);
	4391	mcache_obj_t mp_list = NULL, rmp_list = NULL;
	4392	mcache_t cp = NULL, rcp = NULL;
	4393
	4394	if (*numlist == 0) {
	4395	return NULL;
	4396	}
	4397
	4398	top = NULL;
	4399	np = &top;
	4400
	4401	if (wantsize == 0) {
	4402	if (packetlen <= MINCLSIZE) {
	4403	bufsize = packetlen;
	4404	} else if (packetlen > m_maxsize(MC_CL)) {
	4405	/* Use 4KB if jumbo cluster pool isn't available */
	4406	if (packetlen <= m_maxsize(MC_BIGCL) \|\| njcl == 0) {
	4407	bufsize = m_maxsize(MC_BIGCL);
	4408	} else {
	4409	bufsize = m_maxsize(MC_16KCL);
	4410	}
	4411	} else {
	4412	bufsize = m_maxsize(MC_CL);
	4413	}
	4414	} else if (wantsize == m_maxsize(MC_CL) \|\|
	4415	wantsize == m_maxsize(MC_BIGCL) \|\|
	4416	(wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
	4417	bufsize = wantsize;
	4418	} else {
	4419	*numlist = 0;
	4420	return NULL;
	4421	}
	4422
	4423	if (bufsize <= MHLEN) {
	4424	nsegs = 1;
	4425	} else if (bufsize <= MINCLSIZE) {
	4426	if (maxsegments != NULL && *maxsegments == 1) {
	4427	bufsize = m_maxsize(MC_CL);
	4428	nsegs = 1;
	4429	} else {
	4430	nsegs = 2;
	4431	}
	4432	} else if (bufsize == m_maxsize(MC_16KCL)) {
	4433	VERIFY(njcl > 0);
	4434	nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
	4435	} else if (bufsize == m_maxsize(MC_BIGCL)) {
	4436	nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
	4437	} else {
	4438	nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
	4439	}
	4440	if (maxsegments != NULL) {
	4441	if (maxsegments && nsegs > maxsegments) {
	4442	*maxsegments = nsegs;
	4443	*numlist = 0;
	4444	return NULL;
	4445	}
	4446	*maxsegments = nsegs;
	4447	}
	4448
	4449	/*
	4450	* The caller doesn't want all the requested buffers; only some.
	4451	* Try hard to get what we can, but don't block. This effectively
	4452	* overrides MCR_SLEEP, since this thread will not go to sleep
	4453	* if we can't get all the buffers.
	4454	*/
	4455	if (!wantall \|\| (mcflags & MCR_NOSLEEP)) {
	4456	mcflags \|= MCR_TRYHARD;
	4457	}
	4458
	4459	/*
	4460	* Simple case where all elements in the lists/chains are mbufs.
	4461	* Unless bufsize is greater than MHLEN, each segment chain is made
	4462	* up of exactly 1 mbuf. Otherwise, each segment chain is made up
	4463	* of 2 mbufs; the second one is used for the residual data, i.e.
	4464	* the remaining data that cannot fit into the first mbuf.
	4465	*/
	4466	if (bufsize <= MINCLSIZE) {
	4467	/* Allocate the elements in one shot from the mbuf cache */
	4468	ASSERT(bufsize <= MHLEN \|\| nsegs == 2);
	4469	cp = m_cache(MC_MBUF);
	4470	needed = mcache_alloc_ext(cp, &mp_list,
	4471	(numlist) nsegs, mcflags);
	4472
	4473	/*
	4474	* The number of elements must be even if we are to use an
	4475	* mbuf (instead of a cluster) to store the residual data.
	4476	* If we couldn't allocate the requested number of mbufs,
	4477	* trim the number down (if it's odd) in order to avoid
	4478	* creating a partial segment chain.
	4479	*/
	4480	if (bufsize > MHLEN && (needed & 0x1)) {
	4481	needed--;
	4482	}
	4483
	4484	while (num < needed) {
	4485	struct mbuf *m;
	4486
	4487	m = (struct mbuf *)mp_list;
	4488	mp_list = mp_list->obj_next;
	4489	ASSERT(m != NULL);
	4490
	4491	MBUF_INIT(m, 1, MT_DATA);
	4492	num++;
	4493	if (bufsize > MHLEN) {
	4494	/* A second mbuf for this segment chain */
	4495	m->m_next = (struct mbuf *)mp_list;
	4496	mp_list = mp_list->obj_next;
	4497	ASSERT(m->m_next != NULL);
	4498
	4499	MBUF_INIT(m->m_next, 0, MT_DATA);
	4500	num++;
	4501	}
	4502	*np = m;
	4503	np = &m->m_nextpkt;
	4504	}
	4505	ASSERT(num != *numlist \|\| mp_list == NULL);
	4506
	4507	if (num > 0) {
	4508	mtype_stat_add(MT_DATA, num);
	4509	mtype_stat_sub(MT_FREE, num);
	4510	}
	4511	num /= nsegs;
	4512
	4513	/* We've got them all; return to caller */
	4514	if (num == *numlist) {
	4515	return top;
	4516	}
	4517
	4518	goto fail;
	4519	}
	4520
	4521	/*
	4522	* Complex cases where elements are made up of one or more composite
	4523	* mbufs + cluster, depending on packetlen. Each N-segment chain can
	4524	* be illustrated as follows:
	4525	*
	4526	* [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
	4527	*
	4528	* Every composite mbuf + cluster element comes from the intermediate
	4529	* cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
	4530	* the last composite element will come from the MC_MBUF_CL cache,
	4531	* unless the residual data is larger than 2KB where we use the
	4532	* big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
	4533	* data is defined as extra data beyond the first element that cannot
	4534	* fit into the previous element, i.e. there is no residual data if
	4535	* the chain only has 1 segment.
	4536	*/
	4537	r_bufsize = bufsize;
	4538	resid = packetlen > bufsize ? packetlen % bufsize : 0;
	4539	if (resid > 0) {
	4540	/* There is residual data; figure out the cluster size */
	4541	if (wantsize == 0 && packetlen > MINCLSIZE) {
	4542	/*
	4543	* Caller didn't request that all of the segments
	4544	* in the chain use the same cluster size; use the
	4545	* smaller of the cluster sizes.
	4546	*/
	4547	if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
	4548	r_bufsize = m_maxsize(MC_16KCL);
	4549	} else if (resid > m_maxsize(MC_CL)) {
	4550	r_bufsize = m_maxsize(MC_BIGCL);
	4551	} else {
	4552	r_bufsize = m_maxsize(MC_CL);
	4553	}
	4554	} else {
	4555	/* Use the same cluster size as the other segments */
	4556	resid = 0;
	4557	}
	4558	}
	4559
	4560	needed = *numlist;
	4561	if (resid > 0) {
	4562	/*
	4563	* Attempt to allocate composite mbuf + cluster elements for
	4564	* the residual data in each chain; record the number of such
	4565	* elements that can be allocated so that we know how many
	4566	* segment chains we can afford to create.
	4567	*/
	4568	if (r_bufsize <= m_maxsize(MC_CL)) {
	4569	rcp = m_cache(MC_MBUF_CL);
	4570	} else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
	4571	rcp = m_cache(MC_MBUF_BIGCL);
	4572	} else {
	4573	rcp = m_cache(MC_MBUF_16KCL);
	4574	}
	4575	needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
	4576
	4577	if (needed == 0) {
	4578	goto fail;
	4579	}
	4580
	4581	/* This is temporarily reduced for calculation */
	4582	ASSERT(nsegs > 1);
	4583	nsegs--;
	4584	}
	4585
	4586	/*
	4587	* Attempt to allocate the rest of the composite mbuf + cluster
	4588	* elements for the number of segment chains that we need.
	4589	*/
	4590	if (bufsize <= m_maxsize(MC_CL)) {
	4591	cp = m_cache(MC_MBUF_CL);
	4592	} else if (bufsize <= m_maxsize(MC_BIGCL)) {
	4593	cp = m_cache(MC_MBUF_BIGCL);
	4594	} else {
	4595	cp = m_cache(MC_MBUF_16KCL);
	4596	}
	4597	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
	4598
	4599	/* Round it down to avoid creating a partial segment chain */
	4600	needed = (needed / nsegs) * nsegs;
	4601	if (needed == 0) {
	4602	goto fail;
	4603	}
	4604
	4605	if (resid > 0) {
	4606	/*
	4607	* We're about to construct the chain(s); take into account
	4608	* the number of segments we have created above to hold the
	4609	* residual data for each chain, as well as restore the
	4610	* original count of segments per chain.
	4611	*/
	4612	ASSERT(nsegs > 0);
	4613	needed += needed / nsegs;
	4614	nsegs++;
	4615	}
	4616
	4617	for (;;) {
	4618	struct mbuf *m;
	4619	u_int16_t flag;
	4620	struct ext_ref *rfa;
	4621	void *cl;
	4622	int pkthdr;
	4623	m_ext_free_func_t m_free_func;
	4624
	4625	++num;
	4626	if (nsegs == 1 \|\| (num % nsegs) != 0 \|\| resid == 0) {
	4627	m = (struct mbuf *)mp_list;
	4628	mp_list = mp_list->obj_next;
	4629	} else {
	4630	m = (struct mbuf *)rmp_list;
	4631	rmp_list = rmp_list->obj_next;
	4632	}
	4633	m_free_func = m_get_ext_free(m);
	4634	ASSERT(m != NULL);
	4635	VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
	4636	VERIFY(m_free_func == NULL \|\| m_free_func == m_bigfree \|\|
	4637	m_free_func == m_16kfree);
	4638
	4639	cl = m->m_ext.ext_buf;
	4640	rfa = m_get_rfa(m);
	4641
	4642	ASSERT(cl != NULL && rfa != NULL);
	4643	VERIFY(MBUF_IS_COMPOSITE(m));
	4644
	4645	flag = MEXT_FLAGS(m);
	4646
	4647	pkthdr = (nsegs == 1 \|\| (num % nsegs) == 1);
	4648	if (pkthdr) {
	4649	first = m;
	4650	}
	4651	MBUF_INIT(m, pkthdr, MT_DATA);
	4652	if (m_free_func == m_16kfree) {
	4653	MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
	4654	} else if (m_free_func == m_bigfree) {
	4655	MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
	4656	} else {
	4657	MBUF_CL_INIT(m, cl, rfa, 1, flag);
	4658	}
	4659
	4660	*np = m;
	4661	if ((num % nsegs) == 0) {
	4662	np = &first->m_nextpkt;
	4663	} else {
	4664	np = &m->m_next;
	4665	}
	4666
	4667	if (num == needed) {
	4668	break;
	4669	}
	4670	}
	4671
	4672	if (num > 0) {
	4673	mtype_stat_add(MT_DATA, num);
	4674	mtype_stat_sub(MT_FREE, num);
	4675	}
	4676
	4677	num /= nsegs;
	4678
	4679	/* We've got them all; return to caller */
	4680	if (num == *numlist) {
	4681	ASSERT(mp_list == NULL && rmp_list == NULL);
	4682	return top;
	4683	}
	4684
	4685	fail:
	4686	/* Free up what's left of the above */
	4687	if (mp_list != NULL) {
	4688	mcache_free_ext(cp, mp_list);
	4689	}
	4690	if (rmp_list != NULL) {
	4691	mcache_free_ext(rcp, rmp_list);
	4692	}
	4693	if (wantall && top != NULL) {
	4694	m_freem_list(top);
	4695	*numlist = 0;
	4696	return NULL;
	4697	}
	4698	*numlist = num;
	4699	return top;
	4700	}
	4701
	4702	/*
	4703	* Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
	4704	* packets on receive ring.
	4705	*/
	4706	__private_extern__ struct mbuf *
	4707	m_getpacket_how(int wait)
	4708	{
	4709	unsigned int num_needed = 1;
	4710
	4711	return m_getpackets_internal(&num_needed, 1, wait, 1,
	4712	m_maxsize(MC_CL));
	4713	}
	4714
	4715	/*
	4716	* Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
	4717	* packets on receive ring.
	4718	*/
	4719	struct mbuf *
	4720	m_getpacket(void)
	4721	{
	4722	unsigned int num_needed = 1;
	4723
	4724	return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
	4725	m_maxsize(MC_CL));
	4726	}
	4727
	4728	/*
	4729	* Return a list of mbuf hdrs that point to clusters. Try for num_needed;
	4730	* if this can't be met, return whatever number were available. Set up the
	4731	* first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
	4732	* are chained on the m_nextpkt field. Any packets requested beyond this are
	4733	* chained onto the last packet header's m_next field.
	4734	*/
	4735	struct mbuf *
	4736	m_getpackets(int num_needed, int num_with_pkthdrs, int how)
	4737	{
	4738	unsigned int n = num_needed;
	4739
	4740	return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
	4741	m_maxsize(MC_CL));
	4742	}
	4743
	4744	/*
	4745	* Return a list of mbuf hdrs set up as packet hdrs chained together
	4746	* on the m_nextpkt field
	4747	*/
	4748	struct mbuf *
	4749	m_getpackethdrs(int num_needed, int how)
	4750	{
	4751	struct mbuf *m;
	4752	struct mbuf *np, top;
	4753
	4754	top = NULL;
	4755	np = &top;
	4756
	4757	while (num_needed--) {
	4758	m = _M_RETRYHDR(how, MT_DATA);
	4759	if (m == NULL) {
	4760	break;
	4761	}
	4762
	4763	*np = m;
	4764	np = &m->m_nextpkt;
	4765	}
	4766
	4767	return top;
	4768	}
	4769
	4770	/*
	4771	* Free an mbuf list (m_nextpkt) while following m_next. Returns the count
	4772	* for mbufs packets freed. Used by the drivers.
	4773	*/
	4774	int
	4775	m_freem_list(struct mbuf *m)
	4776	{
	4777	struct mbuf *nextpkt;
	4778	mcache_obj_t *mp_list = NULL;
	4779	mcache_obj_t *mcl_list = NULL;
	4780	mcache_obj_t *mbc_list = NULL;
	4781	mcache_obj_t *m16k_list = NULL;
	4782	mcache_obj_t *m_mcl_list = NULL;
	4783	mcache_obj_t *m_mbc_list = NULL;
	4784	mcache_obj_t *m_m16k_list = NULL;
	4785	mcache_obj_t *ref_list = NULL;
	4786	int pktcount = 0;
	4787	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
	4788
	4789	while (m != NULL) {
	4790	pktcount++;
	4791
	4792	nextpkt = m->m_nextpkt;
	4793	m->m_nextpkt = NULL;
	4794
	4795	while (m != NULL) {
	4796	struct mbuf *next = m->m_next;
	4797	mcache_obj_t o, rfa;
	4798	u_int32_t composite;
	4799	u_int16_t refcnt;
	4800	m_ext_free_func_t m_free_func;
	4801
	4802	if (m->m_type == MT_FREE) {
	4803	panic("m_free: freeing an already freed mbuf");
	4804	}
	4805
	4806	if (m->m_flags & M_PKTHDR) {
	4807	/* Check for scratch area overflow */
	4808	m_redzone_verify(m);
	4809	/* Free the aux data and tags if there is any */
	4810	m_tag_delete_chain(m, NULL);
	4811	}
	4812
	4813	if (!(m->m_flags & M_EXT)) {
	4814	mt_free++;
	4815	goto simple_free;
	4816	}
	4817
	4818	if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
	4819	m = next;
	4820	continue;
	4821	}
	4822
	4823	mt_free++;
	4824
	4825	o = (mcache_obj_t )(void )m->m_ext.ext_buf;
	4826	refcnt = m_decref(m);
	4827	composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
	4828	m_free_func = m_get_ext_free(m);
	4829	if (refcnt == MEXT_MINREF(m) && !composite) {
	4830	if (m_free_func == NULL) {
	4831	o->obj_next = mcl_list;
	4832	mcl_list = o;
	4833	} else if (m_free_func == m_bigfree) {
	4834	o->obj_next = mbc_list;
	4835	mbc_list = o;
	4836	} else if (m_free_func == m_16kfree) {
	4837	o->obj_next = m16k_list;
	4838	m16k_list = o;
	4839	} else {
	4840	(*(m_free_func))((caddr_t)o,
	4841	m->m_ext.ext_size,
	4842	m_get_ext_arg(m));
	4843	}
	4844	rfa = (mcache_obj_t )(void )m_get_rfa(m);
	4845	rfa->obj_next = ref_list;
	4846	ref_list = rfa;
	4847	m_set_ext(m, NULL, NULL, NULL);
	4848	} else if (refcnt == MEXT_MINREF(m) && composite) {
	4849	VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
	4850	VERIFY(m->m_type != MT_FREE);
	4851	/*
	4852	* Amortize the costs of atomic operations
	4853	* by doing them at the end, if possible.
	4854	*/
	4855	if (m->m_type == MT_DATA) {
	4856	mt_data++;
	4857	} else if (m->m_type == MT_HEADER) {
	4858	mt_header++;
	4859	} else if (m->m_type == MT_SONAME) {
	4860	mt_soname++;
	4861	} else if (m->m_type == MT_TAG) {
	4862	mt_tag++;
	4863	} else {
	4864	mtype_stat_dec(m->m_type);
	4865	}
	4866
	4867	m->m_type = MT_FREE;
	4868	m->m_flags = M_EXT;
	4869	m->m_len = 0;
	4870	m->m_next = m->m_nextpkt = NULL;
	4871
	4872	MEXT_FLAGS(m) &= ~EXTF_READONLY;
	4873
	4874	/* "Free" into the intermediate cache */
	4875	o = (mcache_obj_t *)m;
	4876	if (m_free_func == NULL) {
	4877	o->obj_next = m_mcl_list;
	4878	m_mcl_list = o;
	4879	} else if (m_free_func == m_bigfree) {
	4880	o->obj_next = m_mbc_list;
	4881	m_mbc_list = o;
	4882	} else {
	4883	VERIFY(m_free_func == m_16kfree);
	4884	o->obj_next = m_m16k_list;
	4885	m_m16k_list = o;
	4886	}
	4887	m = next;
	4888	continue;
	4889	}
	4890	simple_free:
	4891	/*
	4892	* Amortize the costs of atomic operations
	4893	* by doing them at the end, if possible.
	4894	*/
	4895	if (m->m_type == MT_DATA) {
	4896	mt_data++;
	4897	} else if (m->m_type == MT_HEADER) {
	4898	mt_header++;
	4899	} else if (m->m_type == MT_SONAME) {
	4900	mt_soname++;
	4901	} else if (m->m_type == MT_TAG) {
	4902	mt_tag++;
	4903	} else if (m->m_type != MT_FREE) {
	4904	mtype_stat_dec(m->m_type);
	4905	}
	4906
	4907	m->m_type = MT_FREE;
	4908	m->m_flags = m->m_len = 0;
	4909	m->m_next = m->m_nextpkt = NULL;
	4910
	4911	((mcache_obj_t *)m)->obj_next = mp_list;
	4912	mp_list = (mcache_obj_t *)m;
	4913
	4914	m = next;
	4915	}
	4916
	4917	m = nextpkt;
	4918	}
	4919
	4920	if (mt_free > 0) {
	4921	mtype_stat_add(MT_FREE, mt_free);
	4922	}
	4923	if (mt_data > 0) {
	4924	mtype_stat_sub(MT_DATA, mt_data);
	4925	}
	4926	if (mt_header > 0) {
	4927	mtype_stat_sub(MT_HEADER, mt_header);
	4928	}
	4929	if (mt_soname > 0) {
	4930	mtype_stat_sub(MT_SONAME, mt_soname);
	4931	}
	4932	if (mt_tag > 0) {
	4933	mtype_stat_sub(MT_TAG, mt_tag);
	4934	}
	4935
	4936	if (mp_list != NULL) {
	4937	mcache_free_ext(m_cache(MC_MBUF), mp_list);
	4938	}
	4939	if (mcl_list != NULL) {
	4940	mcache_free_ext(m_cache(MC_CL), mcl_list);
	4941	}
	4942	if (mbc_list != NULL) {
	4943	mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
	4944	}
	4945	if (m16k_list != NULL) {
	4946	mcache_free_ext(m_cache(MC_16KCL), m16k_list);
	4947	}
	4948	if (m_mcl_list != NULL) {
	4949	mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
	4950	}
	4951	if (m_mbc_list != NULL) {
	4952	mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
	4953	}
	4954	if (m_m16k_list != NULL) {
	4955	mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
	4956	}
	4957	if (ref_list != NULL) {
	4958	mcache_free_ext(ref_cache, ref_list);
	4959	}
	4960
	4961	return pktcount;
	4962	}
	4963
	4964	void
	4965	m_freem(struct mbuf *m)
	4966	{
	4967	while (m != NULL) {
	4968	m = m_free(m);
	4969	}
	4970	}
	4971
	4972	/*
	4973	* Mbuffer utility routines.
	4974	*/
	4975	/*
	4976	* Set the m_data pointer of a newly allocated mbuf to place an object of the
	4977	* specified size at the end of the mbuf, longword aligned.
	4978	*
	4979	* NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
	4980	* separate macros, each asserting that it was called at the proper moment.
	4981	* This required callers to themselves test the storage type and call the
	4982	* right one. Rather than require callers to be aware of those layout
	4983	* decisions, we centralize here.
	4984	*/
	4985	void
	4986	m_align(struct mbuf *m, int len)
	4987	{
	4988	int adjust = 0;
	4989
	4990	/* At this point data must point to start */
	4991	VERIFY(m->m_data == M_START(m));
	4992	VERIFY(len >= 0);
	4993	VERIFY(len <= M_SIZE(m));
	4994	adjust = M_SIZE(m) - len;
	4995	m->m_data += adjust & ~(sizeof(long) - 1);
	4996	}
	4997
	4998	/*
	4999	* Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
	5000	* copy junk along. Does not adjust packet header length.
	5001	*/
	5002	struct mbuf *
	5003	m_prepend(struct mbuf *m, int len, int how)
	5004	{
	5005	struct mbuf *mn;
	5006
	5007	_MGET(mn, how, m->m_type);
	5008	if (mn == NULL) {
	5009	m_freem(m);
	5010	return NULL;
	5011	}
	5012	if (m->m_flags & M_PKTHDR) {
	5013	M_COPY_PKTHDR(mn, m);
	5014	m->m_flags &= ~M_PKTHDR;
	5015	}
	5016	mn->m_next = m;
	5017	m = mn;
	5018	if (m->m_flags & M_PKTHDR) {
	5019	VERIFY(len <= MHLEN);
	5020	MH_ALIGN(m, len);
	5021	} else {
	5022	VERIFY(len <= MLEN);
	5023	M_ALIGN(m, len);
	5024	}
	5025	m->m_len = len;
	5026	return m;
	5027	}
	5028
	5029	/*
	5030	* Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
	5031	* chain, copy junk along, and adjust length.
	5032	*/
	5033	struct mbuf *
	5034	m_prepend_2(struct mbuf *m, int len, int how, int align)
	5035	{
	5036	if (M_LEADINGSPACE(m) >= len &&
	5037	(!align \|\| IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
	5038	m->m_data -= len;
	5039	m->m_len += len;
	5040	} else {
	5041	m = m_prepend(m, len, how);
	5042	}
	5043	if ((m) && (m->m_flags & M_PKTHDR)) {
	5044	m->m_pkthdr.len += len;
	5045	}
	5046	return m;
	5047	}
	5048
	5049	/*
	5050	* Make a copy of an mbuf chain starting "off0" bytes from the beginning,
	5051	* continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
	5052	* The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
	5053	*/
	5054	int MCFail;
	5055
	5056	struct mbuf *
	5057	m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
	5058	{
	5059	struct mbuf n, mhdr = NULL, **np;
	5060	int off = off0;
	5061	struct mbuf *top;
	5062	int copyhdr = 0;
	5063
	5064	if (off < 0 \|\| len < 0) {
	5065	panic("m_copym: invalid offset %d or len %d", off, len);
	5066	}
	5067
	5068	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
	5069	mode != M_COPYM_MUST_MOVE_HDR) \|\| (m->m_flags & M_PKTHDR));
	5070
	5071	if ((off == 0 && (m->m_flags & M_PKTHDR)) \|\|
	5072	mode == M_COPYM_MUST_COPY_HDR \|\| mode == M_COPYM_MUST_MOVE_HDR) {
	5073	mhdr = m;
	5074	copyhdr = 1;
	5075	}
	5076
	5077	while (off >= m->m_len) {
	5078	if (m->m_next == NULL) {
	5079	panic("m_copym: invalid mbuf chain");
	5080	}
	5081	off -= m->m_len;
	5082	m = m->m_next;
	5083	}
	5084	np = &top;
	5085	top = NULL;
	5086
	5087	while (len > 0) {
	5088	if (m == NULL) {
	5089	if (len != M_COPYALL) {
	5090	panic("m_copym: len != M_COPYALL");
	5091	}
	5092	break;
	5093	}
	5094
	5095	if (copyhdr) {
	5096	n = _M_RETRYHDR(wait, m->m_type);
	5097	} else {
	5098	n = _M_RETRY(wait, m->m_type);
	5099	}
	5100	*np = n;
	5101
	5102	if (n == NULL) {
	5103	goto nospace;
	5104	}
	5105
	5106	if (copyhdr != 0) {
	5107	if ((mode == M_COPYM_MOVE_HDR) \|\|
	5108	(mode == M_COPYM_MUST_MOVE_HDR)) {
	5109	M_COPY_PKTHDR(n, mhdr);
	5110	} else if ((mode == M_COPYM_COPY_HDR) \|\|
	5111	(mode == M_COPYM_MUST_COPY_HDR)) {
	5112	if (m_dup_pkthdr(n, mhdr, wait) == 0) {
	5113	goto nospace;
	5114	}
	5115	}
	5116	if (len == M_COPYALL) {
	5117	n->m_pkthdr.len -= off0;
	5118	} else {
	5119	n->m_pkthdr.len = len;
	5120	}
	5121	copyhdr = 0;
	5122	/*
	5123	* There is data to copy from the packet header mbuf
	5124	* if it is empty or it is before the starting offset
	5125	*/
	5126	if (mhdr != m) {
	5127	np = &n->m_next;
	5128	continue;
	5129	}
	5130	}
	5131	n->m_len = MIN(len, (m->m_len - off));
	5132	if (m->m_flags & M_EXT) {
	5133	n->m_ext = m->m_ext;
	5134	m_incref(m);
	5135	n->m_data = m->m_data + off;
	5136	n->m_flags \|= M_EXT;
	5137	} else {
	5138	/*
	5139	* Limit to the capacity of the destination
	5140	*/
	5141	if (n->m_flags & M_PKTHDR) {
	5142	n->m_len = MIN(n->m_len, MHLEN);
	5143	} else {
	5144	n->m_len = MIN(n->m_len, MLEN);
	5145	}
	5146
	5147	if (MTOD(n, char ) + n->m_len > ((char )n) + MSIZE) {
	5148	panic("%s n %p copy overflow",
	5149	__func__, n);
	5150	}
	5151
	5152	bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
	5153	(unsigned)n->m_len);
	5154	}
	5155	if (len != M_COPYALL) {
	5156	len -= n->m_len;
	5157	}
	5158	off = 0;
	5159	m = m->m_next;
	5160	np = &n->m_next;
	5161	}
	5162
	5163	if (top == NULL) {
	5164	MCFail++;
	5165	}
	5166
	5167	return top;
	5168	nospace:
	5169
	5170	m_freem(top);
	5171	MCFail++;
	5172	return NULL;
	5173	}
	5174
	5175
	5176	struct mbuf *
	5177	m_copym(struct mbuf *m, int off0, int len, int wait)
	5178	{
	5179	return m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR);
	5180	}
	5181
	5182	/*
	5183	* Equivalent to m_copym except that all necessary mbuf hdrs are allocated
	5184	* within this routine also, the last mbuf and offset accessed are passed
	5185	* out and can be passed back in to avoid having to rescan the entire mbuf
	5186	* list (normally hung off of the socket)
	5187	*/
	5188	struct mbuf *
	5189	m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
	5190	struct mbuf *m_lastm, int m_off, uint32_t mode)
	5191	{
	5192	struct mbuf m = m0, n, **np = NULL;
	5193	int off = off0, len = len0;
	5194	struct mbuf *top = NULL;
	5195	int mcflags = MSLEEPF(wait);
	5196	int copyhdr = 0;
	5197	int type = 0;
	5198	mcache_obj_t *list = NULL;
	5199	int needed = 0;
	5200
	5201	if (off == 0 && (m->m_flags & M_PKTHDR)) {
	5202	copyhdr = 1;
	5203	}
	5204
	5205	if (m_lastm != NULL && *m_lastm != NULL) {
	5206	m = *m_lastm;
	5207	off = *m_off;
	5208	} else {
	5209	while (off >= m->m_len) {
	5210	off -= m->m_len;
	5211	m = m->m_next;
	5212	}
	5213	}
	5214
	5215	n = m;
	5216	while (len > 0) {
	5217	needed++;
	5218	ASSERT(n != NULL);
	5219	len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
	5220	n = n->m_next;
	5221	}
	5222	needed++;
	5223	len = len0;
	5224
	5225	/*
	5226	* If the caller doesn't want to be put to sleep, mark it with
	5227	* MCR_TRYHARD so that we may reclaim buffers from other places
	5228	* before giving up.
	5229	*/
	5230	if (mcflags & MCR_NOSLEEP) {
	5231	mcflags \|= MCR_TRYHARD;
	5232	}
	5233
	5234	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	5235	mcflags) != needed) {
	5236	goto nospace;
	5237	}
	5238
	5239	needed = 0;
	5240	while (len > 0) {
	5241	n = (struct mbuf *)list;
	5242	list = list->obj_next;
	5243	ASSERT(n != NULL && m != NULL);
	5244
	5245	type = (top == NULL) ? MT_HEADER : m->m_type;
	5246	MBUF_INIT(n, (top == NULL), type);
	5247
	5248	if (top == NULL) {
	5249	top = n;
	5250	np = &top->m_next;
	5251	continue;
	5252	} else {
	5253	needed++;
	5254	*np = n;
	5255	}
	5256
	5257	if (copyhdr) {
	5258	if ((mode == M_COPYM_MOVE_HDR) \|\|
	5259	(mode == M_COPYM_MUST_MOVE_HDR)) {
	5260	M_COPY_PKTHDR(n, m);
	5261	} else if ((mode == M_COPYM_COPY_HDR) \|\|
	5262	(mode == M_COPYM_MUST_COPY_HDR)) {
	5263	if (m_dup_pkthdr(n, m, wait) == 0) {
	5264	goto nospace;
	5265	}
	5266	}
	5267	n->m_pkthdr.len = len;
	5268	copyhdr = 0;
	5269	}
	5270	n->m_len = MIN(len, (m->m_len - off));
	5271
	5272	if (m->m_flags & M_EXT) {
	5273	n->m_ext = m->m_ext;
	5274	m_incref(m);
	5275	n->m_data = m->m_data + off;
	5276	n->m_flags \|= M_EXT;
	5277	} else {
	5278	if (MTOD(n, char ) + n->m_len > ((char )n) + MSIZE) {
	5279	panic("%s n %p copy overflow",
	5280	__func__, n);
	5281	}
	5282
	5283	bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
	5284	(unsigned)n->m_len);
	5285	}
	5286	len -= n->m_len;
	5287
	5288	if (len == 0) {
	5289	if (m_lastm != NULL && m_off != NULL) {
	5290	if ((off + n->m_len) == m->m_len) {
	5291	*m_lastm = m->m_next;
	5292	*m_off = 0;
	5293	} else {
	5294	*m_lastm = m;
	5295	*m_off = off + n->m_len;
	5296	}
	5297	}
	5298	break;
	5299	}
	5300	off = 0;
	5301	m = m->m_next;
	5302	np = &n->m_next;
	5303	}
	5304
	5305	mtype_stat_inc(MT_HEADER);
	5306	mtype_stat_add(type, needed);
	5307	mtype_stat_sub(MT_FREE, needed + 1);
	5308
	5309	ASSERT(list == NULL);
	5310	return top;
	5311
	5312	nospace:
	5313	if (list != NULL) {
	5314	mcache_free_ext(m_cache(MC_MBUF), list);
	5315	}
	5316	if (top != NULL) {
	5317	m_freem(top);
	5318	}
	5319	MCFail++;
	5320	return NULL;
	5321	}
	5322
	5323	/*
	5324	* Copy data from an mbuf chain starting "off" bytes from the beginning,
	5325	* continuing for "len" bytes, into the indicated buffer.
	5326	*/
	5327	void
	5328	m_copydata(struct mbuf m, int off, int len, void vp)
	5329	{
	5330	int off0 = off, len0 = len;
	5331	struct mbuf *m0 = m;
	5332	unsigned count;
	5333	char *cp = vp;
	5334
	5335	if (__improbable(off < 0 \|\| len < 0)) {
	5336	panic("%s: invalid offset %d or len %d", __func__, off, len);
	5337	/* NOTREACHED */
	5338	}
	5339
	5340	while (off > 0) {
	5341	if (__improbable(m == NULL)) {
	5342	panic("%s: invalid mbuf chain %p [off %d, len %d]",
	5343	__func__, m0, off0, len0);
	5344	/* NOTREACHED */
	5345	}
	5346	if (off < m->m_len) {
	5347	break;
	5348	}
	5349	off -= m->m_len;
	5350	m = m->m_next;
	5351	}
	5352	while (len > 0) {
	5353	if (__improbable(m == NULL)) {
	5354	panic("%s: invalid mbuf chain %p [off %d, len %d]",
	5355	__func__, m0, off0, len0);
	5356	/* NOTREACHED */
	5357	}
	5358	count = MIN(m->m_len - off, len);
	5359	bcopy(MTOD(m, caddr_t) + off, cp, count);
	5360	len -= count;
	5361	cp += count;
	5362	off = 0;
	5363	m = m->m_next;
	5364	}
	5365	}
	5366
	5367	/*
	5368	* Concatenate mbuf chain n to m. Both chains must be of the same type
	5369	* (e.g. MT_DATA). Any m_pkthdr is not updated.
	5370	*/
	5371	void
	5372	m_cat(struct mbuf m, struct mbuf n)
	5373	{
	5374	while (m->m_next) {
	5375	m = m->m_next;
	5376	}
	5377	while (n) {
	5378	if ((m->m_flags & M_EXT) \|\|
	5379	m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
	5380	/* just join the two chains */
	5381	m->m_next = n;
	5382	return;
	5383	}
	5384	/* splat the data from one into the other */
	5385	bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
	5386	(u_int)n->m_len);
	5387	m->m_len += n->m_len;
	5388	n = m_free(n);
	5389	}
	5390	}
	5391
	5392	void
	5393	m_adj(struct mbuf *mp, int req_len)
	5394	{
	5395	int len = req_len;
	5396	struct mbuf *m;
	5397	int count;
	5398
	5399	if ((m = mp) == NULL) {
	5400	return;
	5401	}
	5402	if (len >= 0) {
	5403	/*
	5404	* Trim from head.
	5405	*/
	5406	while (m != NULL && len > 0) {
	5407	if (m->m_len <= len) {
	5408	len -= m->m_len;
	5409	m->m_len = 0;
	5410	m = m->m_next;
	5411	} else {
	5412	m->m_len -= len;
	5413	m->m_data += len;
	5414	len = 0;
	5415	}
	5416	}
	5417	m = mp;
	5418	if (m->m_flags & M_PKTHDR) {
	5419	m->m_pkthdr.len -= (req_len - len);
	5420	}
	5421	} else {
	5422	/*
	5423	* Trim from tail. Scan the mbuf chain,
	5424	* calculating its length and finding the last mbuf.
	5425	* If the adjustment only affects this mbuf, then just
	5426	* adjust and return. Otherwise, rescan and truncate
	5427	* after the remaining size.
	5428	*/
	5429	len = -len;
	5430	count = 0;
	5431	for (;;) {
	5432	count += m->m_len;
	5433	if (m->m_next == (struct mbuf *)0) {
	5434	break;
	5435	}
	5436	m = m->m_next;
	5437	}
	5438	if (m->m_len >= len) {
	5439	m->m_len -= len;
	5440	m = mp;
	5441	if (m->m_flags & M_PKTHDR) {
	5442	m->m_pkthdr.len -= len;
	5443	}
	5444	return;
	5445	}
	5446	count -= len;
	5447	if (count < 0) {
	5448	count = 0;
	5449	}
	5450	/*
	5451	* Correct length for chain is "count".
	5452	* Find the mbuf with last data, adjust its length,
	5453	* and toss data from remaining mbufs on chain.
	5454	*/
	5455	m = mp;
	5456	if (m->m_flags & M_PKTHDR) {
	5457	m->m_pkthdr.len = count;
	5458	}
	5459	for (; m; m = m->m_next) {
	5460	if (m->m_len >= count) {
	5461	m->m_len = count;
	5462	break;
	5463	}
	5464	count -= m->m_len;
	5465	}
	5466	while ((m = m->m_next)) {
	5467	m->m_len = 0;
	5468	}
	5469	}
	5470	}
	5471
	5472	/*
	5473	* Rearange an mbuf chain so that len bytes are contiguous
	5474	* and in the data area of an mbuf (so that mtod and dtom
	5475	* will work for a structure of size len). Returns the resulting
	5476	* mbuf chain on success, frees it and returns null on failure.
	5477	* If there is room, it will add up to max_protohdr-len extra bytes to the
	5478	* contiguous region in an attempt to avoid being called next time.
	5479	*/
	5480	int MPFail;
	5481
	5482	struct mbuf *
	5483	m_pullup(struct mbuf *n, int len)
	5484	{
	5485	struct mbuf *m;
	5486	int count;
	5487	int space;
	5488
	5489	/* check invalid arguments */
	5490	if (n == NULL) {
	5491	panic("%s: n == NULL", __func__);
	5492	}
	5493	if (len < 0) {
	5494	os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
	5495	__func__, len);
	5496	goto bad;
	5497	}
	5498	if (len > MLEN) {
	5499	os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
	5500	__func__, len);
	5501	goto bad;
	5502	}
	5503	if ((n->m_flags & M_EXT) == 0 &&
	5504	n->m_data >= &n->m_dat[MLEN]) {
	5505	os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
	5506	__func__);
	5507	goto bad;
	5508	}
	5509
	5510	/*
	5511	* If first mbuf has no cluster, and has room for len bytes
	5512	* without shifting current data, pullup into it,
	5513	* otherwise allocate a new mbuf to prepend to the chain.
	5514	*/
	5515	if ((n->m_flags & M_EXT) == 0 &&
	5516	len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
	5517	if (n->m_len >= len) {
	5518	return n;
	5519	}
	5520	m = n;
	5521	n = n->m_next;
	5522	len -= m->m_len;
	5523	} else {
	5524	if (len > MHLEN) {
	5525	goto bad;
	5526	}
	5527	_MGET(m, M_DONTWAIT, n->m_type);
	5528	if (m == 0) {
	5529	goto bad;
	5530	}
	5531	m->m_len = 0;
	5532	if (n->m_flags & M_PKTHDR) {
	5533	M_COPY_PKTHDR(m, n);
	5534	n->m_flags &= ~M_PKTHDR;
	5535	}
	5536	}
	5537	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	5538	do {
	5539	count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
	5540	bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
	5541	(unsigned)count);
	5542	len -= count;
	5543	m->m_len += count;
	5544	n->m_len -= count;
	5545	space -= count;
	5546	if (n->m_len != 0) {
	5547	n->m_data += count;
	5548	} else {
	5549	n = m_free(n);
	5550	}
	5551	} while (len > 0 && n != NULL);
	5552	if (len > 0) {
	5553	(void) m_free(m);
	5554	goto bad;
	5555	}
	5556	m->m_next = n;
	5557	return m;
	5558	bad:
	5559	m_freem(n);
	5560	MPFail++;
	5561	return 0;
	5562	}
	5563
	5564	/*
	5565	* Like m_pullup(), except a new mbuf is always allocated, and we allow
	5566	* the amount of empty space before the data in the new mbuf to be specified
	5567	* (in the event that the caller expects to prepend later).
	5568	*/
	5569	__private_extern__ int MSFail = 0;
	5570
	5571	__private_extern__ struct mbuf *
	5572	m_copyup(struct mbuf *n, int len, int dstoff)
	5573	{
	5574	struct mbuf *m;
	5575	int count, space;
	5576
	5577	VERIFY(len >= 0 && dstoff >= 0);
	5578
	5579	if (len > (MHLEN - dstoff)) {
	5580	goto bad;
	5581	}
	5582	MGET(m, M_DONTWAIT, n->m_type);
	5583	if (m == NULL) {
	5584	goto bad;
	5585	}
	5586	m->m_len = 0;
	5587	if (n->m_flags & M_PKTHDR) {
	5588	m_copy_pkthdr(m, n);
	5589	n->m_flags &= ~M_PKTHDR;
	5590	}
	5591	m->m_data += dstoff;
	5592	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	5593	do {
	5594	count = min(min(max(len, max_protohdr), space), n->m_len);
	5595	memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
	5596	(unsigned)count);
	5597	len -= count;
	5598	m->m_len += count;
	5599	n->m_len -= count;
	5600	space -= count;
	5601	if (n->m_len) {
	5602	n->m_data += count;
	5603	} else {
	5604	n = m_free(n);
	5605	}
	5606	} while (len > 0 && n);
	5607	if (len > 0) {
	5608	(void) m_free(m);
	5609	goto bad;
	5610	}
	5611	m->m_next = n;
	5612	return m;
	5613	bad:
	5614	m_freem(n);
	5615	MSFail++;
	5616	return NULL;
	5617	}
	5618
	/*
	 * Partition an mbuf chain in two pieces, returning the tail --
	 * all but the first len0 bytes.  In case of failure, it returns NULL and
	 * attempts to restore the chain to its original state.
	 *
	 * This is m_split0() with packet-header handling enabled (copyhdr = 1).
	 */
	struct mbuf *
	m_split(struct mbuf *m0, int len0, int wait)
	{
		struct mbuf *tail;

		tail = m_split0(m0, len0, wait, 1);
		return tail;
	}
	5629
	5630	static struct mbuf *
	5631	m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
	5632	{
	5633	struct mbuf m, n;
	5634	unsigned len = len0, remain;
	5635
	5636	/*
	5637	* First iterate to the mbuf which contains the first byte of
	5638	* data at offset len0
	5639	*/
	5640	for (m = m0; m && len > m->m_len; m = m->m_next) {
	5641	len -= m->m_len;
	5642	}
	5643	if (m == NULL) {
	5644	return NULL;
	5645	}
	5646	/*
	5647	* len effectively is now the offset in the current
	5648	* mbuf where we have to perform split.
	5649	*
	5650	* remain becomes the tail length.
	5651	* Note that len can also be == m->m_len
	5652	*/
	5653	remain = m->m_len - len;
	5654
	5655	/*
	5656	* If current mbuf len contains the entire remaining offset len,
	5657	* just make the second mbuf chain pointing to next mbuf onwards
	5658	* and return after making necessary adjustments
	5659	*/
	5660	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
	5661	_MGETHDR(n, wait, m0->m_type);
	5662	if (n == NULL) {
	5663	return NULL;
	5664	}
	5665	n->m_next = m->m_next;
	5666	m->m_next = NULL;
	5667	n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
	5668	n->m_pkthdr.len = m0->m_pkthdr.len - len0;
	5669	m0->m_pkthdr.len = len0;
	5670	return n;
	5671	}
	5672	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
	5673	_MGETHDR(n, wait, m0->m_type);
	5674	if (n == NULL) {
	5675	return NULL;
	5676	}
	5677	n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
	5678	n->m_pkthdr.len = m0->m_pkthdr.len - len0;
	5679	m0->m_pkthdr.len = len0;
	5680
	5681	/*
	5682	* If current points to external storage
	5683	* then it can be shared by making last mbuf
	5684	* of head chain and first mbuf of current chain
	5685	* pointing to different data offsets
	5686	*/
	5687	if (m->m_flags & M_EXT) {
	5688	goto extpacket;
	5689	}
	5690	if (remain > MHLEN) {
	5691	/* m can't be the lead packet */
	5692	MH_ALIGN(n, 0);
	5693	n->m_next = m_split(m, len, wait);
	5694	if (n->m_next == NULL) {
	5695	(void) m_free(n);
	5696	return NULL;
	5697	} else {
	5698	return n;
	5699	}
	5700	} else {
	5701	MH_ALIGN(n, remain);
	5702	}
	5703	} else if (remain == 0) {
	5704	n = m->m_next;
	5705	m->m_next = NULL;
	5706	return n;
	5707	} else {
	5708	_MGET(n, wait, m->m_type);
	5709	if (n == NULL) {
	5710	return NULL;
	5711	}
	5712
	5713	if ((m->m_flags & M_EXT) == 0) {
	5714	VERIFY(remain <= MLEN);
	5715	M_ALIGN(n, remain);
	5716	}
	5717	}
	5718	extpacket:
	5719	if (m->m_flags & M_EXT) {
	5720	n->m_flags \|= M_EXT;
	5721	n->m_ext = m->m_ext;
	5722	m_incref(m);
	5723	n->m_data = m->m_data + len;
	5724	} else {
	5725	bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
	5726	}
	5727	n->m_len = remain;
	5728	m->m_len = len;
	5729	n->m_next = m->m_next;
	5730	m->m_next = NULL;
	5731	return n;
	5732	}
	5733
	5734	/*
	5735	* Routine to copy from device local memory into mbufs.
	5736	*/
	5737	struct mbuf *
	5738	m_devget(char buf, int totlen, int off0, struct ifnet ifp,
	5739	void (copy)(const void , void *, size_t))
	5740	{
	5741	struct mbuf *m;
	5742	struct mbuf top = NULL, *mp = &top;
	5743	int off = off0, len;
	5744	char *cp;
	5745	char *epkt;
	5746
	5747	cp = buf;
	5748	epkt = cp + totlen;
	5749	if (off) {
	5750	/*
	5751	* If 'off' is non-zero, packet is trailer-encapsulated,
	5752	* so we have to skip the type and length fields.
	5753	*/
	5754	cp += off + 2 * sizeof(u_int16_t);
	5755	totlen -= 2 * sizeof(u_int16_t);
	5756	}
	5757	_MGETHDR(m, M_DONTWAIT, MT_DATA);
	5758	if (m == NULL) {
	5759	return NULL;
	5760	}
	5761	m->m_pkthdr.rcvif = ifp;
	5762	m->m_pkthdr.len = totlen;
	5763	m->m_len = MHLEN;
	5764
	5765	while (totlen > 0) {
	5766	if (top != NULL) {
	5767	_MGET(m, M_DONTWAIT, MT_DATA);
	5768	if (m == NULL) {
	5769	m_freem(top);
	5770	return NULL;
	5771	}
	5772	m->m_len = MLEN;
	5773	}
	5774	len = MIN(totlen, epkt - cp);
	5775	if (len >= MINCLSIZE) {
	5776	MCLGET(m, M_DONTWAIT);
	5777	if (m->m_flags & M_EXT) {
	5778	m->m_len = len = MIN(len, m_maxsize(MC_CL));
	5779	} else {
	5780	/* give up when it's out of cluster mbufs */
	5781	if (top != NULL) {
	5782	m_freem(top);
	5783	}
	5784	m_freem(m);
	5785	return NULL;
	5786	}
	5787	} else {
	5788	/*
	5789	* Place initial small packet/header at end of mbuf.
	5790	*/
	5791	if (len < m->m_len) {
	5792	if (top == NULL &&
	5793	len + max_linkhdr <= m->m_len) {
	5794	m->m_data += max_linkhdr;
	5795	}
	5796	m->m_len = len;
	5797	} else {
	5798	len = m->m_len;
	5799	}
	5800	}
	5801	if (copy) {
	5802	copy(cp, MTOD(m, caddr_t), (unsigned)len);
	5803	} else {
	5804	bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
	5805	}
	5806	cp += len;
	5807	*mp = m;
	5808	mp = &m->m_next;
	5809	totlen -= len;
	5810	if (cp == epkt) {
	5811	cp = buf;
	5812	}
	5813	}
	5814	return top;
	5815	}
	5816
	5817	#ifndef MBUF_GROWTH_NORMAL_THRESH
	5818	#define MBUF_GROWTH_NORMAL_THRESH 25
	5819	#endif
	5820
	5821	/*
	5822	* Cluster freelist allocation check.
	5823	*/
	5824	static int
	5825	m_howmany(int num, size_t bufsize)
	5826	{
	5827	int i = 0, j = 0;
	5828	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	5829	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	5830	u_int32_t sumclusters, freeclusters;
	5831	u_int32_t percent_pool, percent_kmem;
	5832	u_int32_t mb_growth, mb_growth_thresh;
	5833
	5834	VERIFY(bufsize == m_maxsize(MC_BIGCL) \|\|
	5835	bufsize == m_maxsize(MC_16KCL));
	5836
	5837	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	5838
	5839	/* Numbers in 2K cluster units */
	5840	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	5841	m_clusters = m_total(MC_CL);
	5842	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	5843	m_16kclusters = m_total(MC_16KCL);
	5844	sumclusters = m_mbclusters + m_clusters + m_bigclusters;
	5845
	5846	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	5847	m_clfree = m_infree(MC_CL);
	5848	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	5849	m_16kclfree = m_infree(MC_16KCL);
	5850	freeclusters = m_mbfree + m_clfree + m_bigclfree;
	5851
	5852	/* Bail if we've maxed out the mbuf memory map */
	5853	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) \|\|
	5854	(njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	5855	(m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
	5856	mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
	5857	sumclusters, nclusters,
	5858	(m_16kclusters << NCLPJCLSHIFT), njcl);
	5859	return 0;
	5860	}
	5861
	5862	if (bufsize == m_maxsize(MC_BIGCL)) {
	5863	/* Under minimum */
	5864	if (m_bigclusters < m_minlimit(MC_BIGCL)) {
	5865	return m_minlimit(MC_BIGCL) - m_bigclusters;
	5866	}
	5867
	5868	percent_pool =
	5869	((sumclusters - freeclusters) * 100) / sumclusters;
	5870	percent_kmem = (sumclusters * 100) / nclusters;
	5871
	5872	/*
	5873	* If a light/normal user, grow conservatively (75%)
	5874	* If a heavy user, grow aggressively (50%)
	5875	*/
	5876	if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
	5877	mb_growth = MB_GROWTH_NORMAL;
	5878	} else {
	5879	mb_growth = MB_GROWTH_AGGRESSIVE;
	5880	}
	5881
	5882	if (percent_kmem < 5) {
	5883	/* For initial allocations */
	5884	i = num;
	5885	} else {
	5886	/* Return if >= MBIGCL_LOWAT clusters available */
	5887	if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
	5888	m_total(MC_BIGCL) >=
	5889	MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
	5890	return 0;
	5891	}
	5892
	5893	/* Ensure at least num clusters are accessible */
	5894	if (num >= m_infree(MC_BIGCL)) {
	5895	i = num - m_infree(MC_BIGCL);
	5896	}
	5897	if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
	5898	j = num - (m_total(MC_BIGCL) -
	5899	m_minlimit(MC_BIGCL));
	5900	}
	5901
	5902	i = MAX(i, j);
	5903
	5904	/*
	5905	* Grow pool if percent_pool > 75 (normal growth)
	5906	* or percent_pool > 50 (aggressive growth).
	5907	*/
	5908	mb_growth_thresh = 100 - (100 / (1 << mb_growth));
	5909	if (percent_pool > mb_growth_thresh) {
	5910	j = ((sumclusters + num) >> mb_growth) -
	5911	freeclusters;
	5912	}
	5913	i = MAX(i, j);
	5914	}
	5915
	5916	/* Check to ensure we didn't go over limits */
	5917	if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
	5918	i = m_maxlimit(MC_BIGCL) - m_bigclusters;
	5919	}
	5920	if ((i << 1) + sumclusters >= nclusters) {
	5921	i = (nclusters - sumclusters) >> 1;
	5922	}
	5923	VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
	5924	VERIFY(sumclusters + (i << 1) <= nclusters);
	5925	} else { /* 16K CL */
	5926	VERIFY(njcl > 0);
	5927	/* Ensure at least num clusters are available */
	5928	if (num >= m_16kclfree) {
	5929	i = num - m_16kclfree;
	5930	}
	5931
	5932	/* Always grow 16KCL pool aggressively */
	5933	if (((m_16kclusters + num) >> 1) > m_16kclfree) {
	5934	j = ((m_16kclusters + num) >> 1) - m_16kclfree;
	5935	}
	5936	i = MAX(i, j);
	5937
	5938	/* Check to ensure we don't go over limit */
	5939	if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
	5940	i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
	5941	}
	5942	}
	5943	return i;
	5944	}
	5945	/*
	5946	* Return the number of bytes in the mbuf chain, m.
	5947	*/
	5948	unsigned int
	5949	m_length(struct mbuf *m)
	5950	{
	5951	struct mbuf *m0;
	5952	unsigned int pktlen;
	5953
	5954	if (m->m_flags & M_PKTHDR) {
	5955	return m->m_pkthdr.len;
	5956	}
	5957
	5958	pktlen = 0;
	5959	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
	5960	pktlen += m0->m_len;
	5961	}
	5962	return pktlen;
	5963	}
	5964
	5965	/*
	5966	* Copy data from a buffer back into the indicated mbuf chain,
	5967	* starting "off" bytes from the beginning, extending the mbuf
	5968	* chain if necessary.
	5969	*/
	5970	void
	5971	m_copyback(struct mbuf m0, int off, int len, const void cp)
	5972	{
	5973	#if DEBUG
	5974	struct mbuf *origm = m0;
	5975	int error;
	5976	#endif /* DEBUG */
	5977
	5978	if (m0 == NULL) {
	5979	return;
	5980	}
	5981
	5982	#if DEBUG
	5983	error =
	5984	#endif /* DEBUG */
	5985	m_copyback0(&m0, off, len, cp,
	5986	M_COPYBACK0_COPYBACK \| M_COPYBACK0_EXTEND, M_DONTWAIT);
	5987
	5988	#if DEBUG
	5989	if (error != 0 \|\| (m0 != NULL && origm != m0)) {
	5990	panic("m_copyback");
	5991	}
	5992	#endif /* DEBUG */
	5993	}
	5994
	5995	struct mbuf *
	5996	m_copyback_cow(struct mbuf m0, int off, int len, const void cp, int how)
	5997	{
	5998	int error;
	5999
	6000	/* don't support chain expansion */
	6001	VERIFY(off + len <= m_length(m0));
	6002
	6003	error = m_copyback0(&m0, off, len, cp,
	6004	M_COPYBACK0_COPYBACK \| M_COPYBACK0_COW, how);
	6005	if (error) {
	6006	/*
	6007	* no way to recover from partial success.
	6008	* just free the chain.
	6009	*/
	6010	m_freem(m0);
	6011	return NULL;
	6012	}
	6013	return m0;
	6014	}
	6015
	6016	/*
	6017	* m_makewritable: ensure the specified range writable.
	6018	*/
	6019	int
	6020	m_makewritable(struct mbuf **mp, int off, int len, int how)
	6021	{
	6022	int error;
	6023	#if DEBUG
	6024	struct mbuf *n;
	6025	int origlen, reslen;
	6026
	6027	origlen = m_length(*mp);
	6028	#endif /* DEBUG */
	6029
	6030	#if 0 /* M_COPYALL is large enough */
	6031	if (len == M_COPYALL) {
	6032	len = m_length(mp) - off; / XXX */
	6033	}
	6034	#endif
	6035
	6036	error = m_copyback0(mp, off, len, NULL,
	6037	M_COPYBACK0_PRESERVE \| M_COPYBACK0_COW, how);
	6038
	6039	#if DEBUG
	6040	reslen = 0;
	6041	for (n = *mp; n; n = n->m_next) {
	6042	reslen += n->m_len;
	6043	}
	6044	if (origlen != reslen) {
	6045	panic("m_makewritable: length changed");
	6046	}
	6047	if (((mp)->m_flags & M_PKTHDR) && reslen != (mp)->m_pkthdr.len) {
	6048	panic("m_makewritable: inconsist");
	6049	}
	6050	#endif /* DEBUG */
	6051
	6052	return error;
	6053	}
	6054
	6055	static int
	6056	m_copyback0(struct mbuf *mp0, int off, int len, const void vp, int flags,
	6057	int how)
	6058	{
	6059	int mlen;
	6060	struct mbuf m, n;
	6061	struct mbuf **mp;
	6062	int totlen = 0;
	6063	const char *cp = vp;
	6064
	6065	VERIFY(mp0 != NULL);
	6066	VERIFY(*mp0 != NULL);
	6067	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 \|\| cp == NULL);
	6068	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 \|\| cp != NULL);
	6069
	6070	/*
	6071	* we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	6072	* assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	6073	*/
	6074
	6075	VERIFY((~flags & (M_COPYBACK0_EXTEND \| M_COPYBACK0_COW)) != 0);
	6076
	6077	mp = mp0;
	6078	m = *mp;
	6079	while (off > (mlen = m->m_len)) {
	6080	off -= mlen;
	6081	totlen += mlen;
	6082	if (m->m_next == NULL) {
	6083	int tspace;
	6084	extend:
	6085	if (!(flags & M_COPYBACK0_EXTEND)) {
	6086	goto out;
	6087	}
	6088
	6089	/*
	6090	* try to make some space at the end of "m".
	6091	*/
	6092
	6093	mlen = m->m_len;
	6094	if (off + len >= MINCLSIZE &&
	6095	!(m->m_flags & M_EXT) && m->m_len == 0) {
	6096	MCLGET(m, how);
	6097	}
	6098	tspace = M_TRAILINGSPACE(m);
	6099	if (tspace > 0) {
	6100	tspace = MIN(tspace, off + len);
	6101	VERIFY(tspace > 0);
	6102	bzero(mtod(m, char *) + m->m_len,
	6103	MIN(off, tspace));
	6104	m->m_len += tspace;
	6105	off += mlen;
	6106	totlen -= mlen;
	6107	continue;
	6108	}
	6109
	6110	/*
	6111	* need to allocate an mbuf.
	6112	*/
	6113
	6114	if (off + len >= MINCLSIZE) {
	6115	n = m_getcl(how, m->m_type, 0);
	6116	} else {
	6117	n = _M_GET(how, m->m_type);
	6118	}
	6119	if (n == NULL) {
	6120	goto out;
	6121	}
	6122	n->m_len = 0;
	6123	n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
	6124	bzero(mtod(n, char *), MIN(n->m_len, off));
	6125	m->m_next = n;
	6126	}
	6127	mp = &m->m_next;
	6128	m = m->m_next;
	6129	}
	6130	while (len > 0) {
	6131	mlen = m->m_len - off;
	6132	if (mlen != 0 && m_mclhasreference(m)) {
	6133	char *datap;
	6134	int eatlen;
	6135
	6136	/*
	6137	* this mbuf is read-only.
	6138	* allocate a new writable mbuf and try again.
	6139	*/
	6140
	6141	#if DIAGNOSTIC
	6142	if (!(flags & M_COPYBACK0_COW)) {
	6143	panic("m_copyback0: read-only");
	6144	}
	6145	#endif /* DIAGNOSTIC */
	6146
	6147	/*
	6148	* if we're going to write into the middle of
	6149	* a mbuf, split it first.
	6150	*/
	6151	if (off > 0 && len < mlen) {
	6152	n = m_split0(m, off, how, 0);
	6153	if (n == NULL) {
	6154	goto enobufs;
	6155	}
	6156	m->m_next = n;
	6157	mp = &m->m_next;
	6158	m = n;
	6159	off = 0;
	6160	continue;
	6161	}
	6162
	6163	/*
	6164	* XXX TODO coalesce into the trailingspace of
	6165	* the previous mbuf when possible.
	6166	*/
	6167
	6168	/*
	6169	* allocate a new mbuf. copy packet header if needed.
	6170	*/
	6171	n = _M_GET(how, m->m_type);
	6172	if (n == NULL) {
	6173	goto enobufs;
	6174	}
	6175	if (off == 0 && (m->m_flags & M_PKTHDR)) {
	6176	M_COPY_PKTHDR(n, m);
	6177	n->m_len = MHLEN;
	6178	} else {
	6179	if (len >= MINCLSIZE) {
	6180	MCLGET(n, M_DONTWAIT);
	6181	}
	6182	n->m_len =
	6183	(n->m_flags & M_EXT) ? MCLBYTES : MLEN;
	6184	}
	6185	if (n->m_len > len) {
	6186	n->m_len = len;
	6187	}
	6188
	6189	/*
	6190	* free the region which has been overwritten.
	6191	* copying data from old mbufs if requested.
	6192	*/
	6193	if (flags & M_COPYBACK0_PRESERVE) {
	6194	datap = mtod(n, char *);
	6195	} else {
	6196	datap = NULL;
	6197	}
	6198	eatlen = n->m_len;
	6199	VERIFY(off == 0 \|\| eatlen >= mlen);
	6200	if (off > 0) {
	6201	VERIFY(len >= mlen);
	6202	m->m_len = off;
	6203	m->m_next = n;
	6204	if (datap) {
	6205	m_copydata(m, off, mlen, datap);
	6206	datap += mlen;
	6207	}
	6208	eatlen -= mlen;
	6209	mp = &m->m_next;
	6210	m = m->m_next;
	6211	}
	6212	while (m != NULL && m_mclhasreference(m) &&
	6213	n->m_type == m->m_type && eatlen > 0) {
	6214	mlen = MIN(eatlen, m->m_len);
	6215	if (datap) {
	6216	m_copydata(m, 0, mlen, datap);
	6217	datap += mlen;
	6218	}
	6219	m->m_data += mlen;
	6220	m->m_len -= mlen;
	6221	eatlen -= mlen;
	6222	if (m->m_len == 0) {
	6223	*mp = m = m_free(m);
	6224	}
	6225	}
	6226	if (eatlen > 0) {
	6227	n->m_len -= eatlen;
	6228	}
	6229	n->m_next = m;
	6230	*mp = m = n;
	6231	continue;
	6232	}
	6233	mlen = MIN(mlen, len);
	6234	if (flags & M_COPYBACK0_COPYBACK) {
	6235	bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
	6236	cp += mlen;
	6237	}
	6238	len -= mlen;
	6239	mlen += off;
	6240	off = 0;
	6241	totlen += mlen;
	6242	if (len == 0) {
	6243	break;
	6244	}
	6245	if (m->m_next == NULL) {
	6246	goto extend;
	6247	}
	6248	mp = &m->m_next;
	6249	m = m->m_next;
	6250	}
	6251	out:
	6252	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
	6253	VERIFY(flags & M_COPYBACK0_EXTEND);
	6254	m->m_pkthdr.len = totlen;
	6255	}
	6256
	6257	return 0;
	6258
	6259	enobufs:
	6260	return ENOBUFS;
	6261	}
	6262
	6263	uint64_t
	6264	mcl_to_paddr(char *addr)
	6265	{
	6266	vm_offset_t base_phys;
	6267
	6268	if (!MBUF_IN_MAP(addr)) {
	6269	return 0;
	6270	}
	6271	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
	6272
	6273	if (base_phys == 0) {
	6274	return 0;
	6275	}
	6276	return (uint64_t)(ptoa_64(base_phys) \| ((uint64_t)addr & PAGE_MASK));
	6277	}
	6278
	6279	/*
	6280	* Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
	6281	* And really copy the thing. That way, we don't "precompute" checksums
	6282	* for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
	6283	* small packets, don't dup into a cluster. That way received packets
	6284	* don't take up too much room in the sockbuf (cf. sbspace()).
	6285	*/
	6286	int MDFail;
	6287
	6288	struct mbuf *
	6289	m_dup(struct mbuf *m, int how)
	6290	{
	6291	struct mbuf n, *np;
	6292	struct mbuf *top;
	6293	int copyhdr = 0;
	6294
	6295	np = &top;
	6296	top = NULL;
	6297	if (m->m_flags & M_PKTHDR) {
	6298	copyhdr = 1;
	6299	}
	6300
	6301	/*
	6302	* Quick check: if we have one mbuf and its data fits in an
	6303	* mbuf with packet header, just copy and go.
	6304	*/
	6305	if (m->m_next == NULL) {
	6306	/* Then just move the data into an mbuf and be done... */
	6307	if (copyhdr) {
	6308	if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
	6309	if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
	6310	return NULL;
	6311	}
	6312	n->m_len = m->m_len;
	6313	m_dup_pkthdr(n, m, how);
	6314	bcopy(m->m_data, n->m_data, m->m_len);
	6315	return n;
	6316	}
	6317	} else if (m->m_len <= MLEN) {
	6318	if ((n = _M_GET(how, m->m_type)) == NULL) {
	6319	return NULL;
	6320	}
	6321	bcopy(m->m_data, n->m_data, m->m_len);
	6322	n->m_len = m->m_len;
	6323	return n;
	6324	}
	6325	}
	6326	while (m != NULL) {
	6327	#if BLUE_DEBUG
	6328	printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
	6329	m->m_data);
	6330	#endif
	6331	if (copyhdr) {
	6332	n = _M_GETHDR(how, m->m_type);
	6333	} else {
	6334	n = _M_GET(how, m->m_type);
	6335	}
	6336	if (n == NULL) {
	6337	goto nospace;
	6338	}
	6339	if (m->m_flags & M_EXT) {
	6340	if (m->m_len <= m_maxsize(MC_CL)) {
	6341	MCLGET(n, how);
	6342	} else if (m->m_len <= m_maxsize(MC_BIGCL)) {
	6343	n = m_mbigget(n, how);
	6344	} else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
	6345	n = m_m16kget(n, how);
	6346	}
	6347	if (!(n->m_flags & M_EXT)) {
	6348	(void) m_free(n);
	6349	goto nospace;
	6350	}
	6351	} else {
	6352	VERIFY((copyhdr == 1 && m->m_len <= MHLEN) \|\|
	6353	(copyhdr == 0 && m->m_len <= MLEN));
	6354	}
	6355	*np = n;
	6356	if (copyhdr) {
	6357	/* Don't use M_COPY_PKTHDR: preserve m_data */
	6358	m_dup_pkthdr(n, m, how);
	6359	copyhdr = 0;
	6360	if (!(n->m_flags & M_EXT)) {
	6361	n->m_data = n->m_pktdat;
	6362	}
	6363	}
	6364	n->m_len = m->m_len;
	6365	/*
	6366	* Get the dup on the same bdry as the original
	6367	* Assume that the two mbufs have the same offset to data area
	6368	* (up to word boundaries)
	6369	*/
	6370	bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
	6371	m = m->m_next;
	6372	np = &n->m_next;
	6373	#if BLUE_DEBUG
	6374	printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
	6375	n->m_data);
	6376	#endif
	6377	}
	6378
	6379	if (top == NULL) {
	6380	MDFail++;
	6381	}
	6382	return top;
	6383
	6384	nospace:
	6385	m_freem(top);
	6386	MDFail++;
	6387	return NULL;
	6388	}
	6389
	6390	#define MBUF_MULTIPAGES(m) \
	6391	(((m)->m_flags & M_EXT) && \
	6392	((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
	6393	&& (m)->m_len > PAGE_SIZE) \|\| \
	6394	(!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
	6395	P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
	6396
	6397	static struct mbuf *
	6398	m_expand(struct mbuf m, struct mbuf *last)
	6399	{
	6400	struct mbuf *top = NULL;
	6401	struct mbuf **nm = &top;
	6402	uintptr_t data0, data;
	6403	unsigned int len0, len;
	6404
	6405	VERIFY(MBUF_MULTIPAGES(m));
	6406	VERIFY(m->m_next == NULL);
	6407	data0 = (uintptr_t)m->m_data;
	6408	len0 = m->m_len;
	6409	*last = top;
	6410
	6411	for (;;) {
	6412	struct mbuf *n;
	6413
	6414	data = data0;
	6415	if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
	6416	len = PAGE_SIZE;
	6417	} else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
	6418	P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
	6419	len = P2ROUNDUP(data, PAGE_SIZE) - data;
	6420	} else {
	6421	len = len0;
	6422	}
	6423
	6424	VERIFY(len > 0);
	6425	VERIFY(m->m_flags & M_EXT);
	6426	m->m_data = (void *)data;
	6427	m->m_len = len;
	6428
	6429	nm = last = m;
	6430	nm = &m->m_next;
	6431	m->m_next = NULL;
	6432
	6433	data0 += len;
	6434	len0 -= len;
	6435	if (len0 == 0) {
	6436	break;
	6437	}
	6438
	6439	n = _M_RETRY(M_DONTWAIT, MT_DATA);
	6440	if (n == NULL) {
	6441	m_freem(top);
	6442	top = *last = NULL;
	6443	break;
	6444	}
	6445
	6446	n->m_ext = m->m_ext;
	6447	m_incref(m);
	6448	n->m_flags \|= M_EXT;
	6449	m = n;
	6450	}
	6451	return top;
	6452	}
	6453
	6454	struct mbuf *
	6455	m_normalize(struct mbuf *m)
	6456	{
	6457	struct mbuf *top = NULL;
	6458	struct mbuf **nm = &top;
	6459	boolean_t expanded = FALSE;
	6460
	6461	while (m != NULL) {
	6462	struct mbuf *n;
	6463
	6464	n = m->m_next;
	6465	m->m_next = NULL;
	6466
	6467	/* Does the data cross one or more page boundaries? */
	6468	if (MBUF_MULTIPAGES(m)) {
	6469	struct mbuf *last;
	6470	if ((m = m_expand(m, &last)) == NULL) {
	6471	m_freem(n);
	6472	m_freem(top);
	6473	top = NULL;
	6474	break;
	6475	}
	6476	*nm = m;
	6477	nm = &last->m_next;
	6478	expanded = TRUE;
	6479	} else {
	6480	*nm = m;
	6481	nm = &m->m_next;
	6482	}
	6483	m = n;
	6484	}
	6485	if (expanded) {
	6486	atomic_add_32(&mb_normalized, 1);
	6487	}
	6488	return top;
	6489	}
	6490
	6491	/*
	6492	* Append the specified data to the indicated mbuf chain,
	6493	* Extend the mbuf chain if the new data does not fit in
	6494	* existing space.
	6495	*
	6496	* Return 1 if able to complete the job; otherwise 0.
	6497	*/
	6498	int
	6499	m_append(struct mbuf *m0, int len, caddr_t cp)
	6500	{
	6501	struct mbuf m, n;
	6502	int remainder, space;
	6503
	6504	for (m = m0; m->m_next != NULL; m = m->m_next) {
	6505	;
	6506	}
	6507	remainder = len;
	6508	space = M_TRAILINGSPACE(m);
	6509	if (space > 0) {
	6510	/*
	6511	* Copy into available space.
	6512	*/
	6513	if (space > remainder) {
	6514	space = remainder;
	6515	}
	6516	bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
	6517	m->m_len += space;
	6518	cp += space;
	6519	remainder -= space;
	6520	}
	6521	while (remainder > 0) {
	6522	/*
	6523	* Allocate a new mbuf; could check space
	6524	* and allocate a cluster instead.
	6525	*/
	6526	n = m_get(M_WAITOK, m->m_type);
	6527	if (n == NULL) {
	6528	break;
	6529	}
	6530	n->m_len = min(MLEN, remainder);
	6531	bcopy(cp, mtod(n, caddr_t), n->m_len);
	6532	cp += n->m_len;
	6533	remainder -= n->m_len;
	6534	m->m_next = n;
	6535	m = n;
	6536	}
	6537	if (m0->m_flags & M_PKTHDR) {
	6538	m0->m_pkthdr.len += len - remainder;
	6539	}
	6540	return remainder == 0;
	6541	}
	6542
	6543	struct mbuf *
	6544	m_last(struct mbuf *m)
	6545	{
	6546	while (m->m_next != NULL) {
	6547	m = m->m_next;
	6548	}
	6549	return m;
	6550	}
	6551
	6552	unsigned int
	6553	m_fixhdr(struct mbuf *m0)
	6554	{
	6555	u_int len;
	6556
	6557	VERIFY(m0->m_flags & M_PKTHDR);
	6558
	6559	len = m_length2(m0, NULL);
	6560	m0->m_pkthdr.len = len;
	6561	return len;
	6562	}
	6563
	6564	unsigned int
	6565	m_length2(struct mbuf m0, struct mbuf *last)
	6566	{
	6567	struct mbuf *m;
	6568	u_int len;
	6569
	6570	len = 0;
	6571	for (m = m0; m != NULL; m = m->m_next) {
	6572	len += m->m_len;
	6573	if (m->m_next == NULL) {
	6574	break;
	6575	}
	6576	}
	6577	if (last != NULL) {
	6578	*last = m;
	6579	}
	6580	return len;
	6581	}
	6582
	6583	/*
	6584	* Defragment a mbuf chain, returning the shortest possible chain of mbufs
	6585	* and clusters. If allocation fails and this cannot be completed, NULL will
	6586	* be returned, but the passed in chain will be unchanged. Upon success,
	6587	* the original chain will be freed, and the new chain will be returned.
	6588	*
	6589	* If a non-packet header is passed in, the original mbuf (chain?) will
	6590	* be returned unharmed.
	6591	*
	6592	* If offset is specfied, the first mbuf in the chain will have a leading
	6593	* space of the amount stated by the "off" parameter.
	6594	*
	6595	* This routine requires that the m_pkthdr.header field of the original
	6596	* mbuf chain is cleared by the caller.
	6597	*/
	6598	struct mbuf *
	6599	m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
	6600	{
	6601	struct mbuf m_new = NULL, m_final = NULL;
	6602	int progress = 0, length, pktlen;
	6603
	6604	if (!(m0->m_flags & M_PKTHDR)) {
	6605	return m0;
	6606	}
	6607
	6608	VERIFY(off < MHLEN);
	6609	m_fixhdr(m0); /* Needed sanity check */
	6610
	6611	pktlen = m0->m_pkthdr.len + off;
	6612	if (pktlen > MHLEN) {
	6613	m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	6614	} else {
	6615	m_final = m_gethdr(how, MT_DATA);
	6616	}
	6617
	6618	if (m_final == NULL) {
	6619	goto nospace;
	6620	}
	6621
	6622	if (off > 0) {
	6623	pktlen -= off;
	6624	m_final->m_data += off;
	6625	}
	6626
	6627	/*
	6628	* Caller must have handled the contents pointed to by this
	6629	* pointer before coming here, as otherwise it will point to
	6630	* the original mbuf which will get freed upon success.
	6631	*/
	6632	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
	6633
	6634	if (m_dup_pkthdr(m_final, m0, how) == 0) {
	6635	goto nospace;
	6636	}
	6637
	6638	m_new = m_final;
	6639
	6640	while (progress < pktlen) {
	6641	length = pktlen - progress;
	6642	if (length > MCLBYTES) {
	6643	length = MCLBYTES;
	6644	}
	6645	length -= ((m_new == m_final) ? off : 0);
	6646	if (length < 0) {
	6647	goto nospace;
	6648	}
	6649
	6650	if (m_new == NULL) {
	6651	if (length > MLEN) {
	6652	m_new = m_getcl(how, MT_DATA, 0);
	6653	} else {
	6654	m_new = m_get(how, MT_DATA);
	6655	}
	6656	if (m_new == NULL) {
	6657	goto nospace;
	6658	}
	6659	}
	6660
	6661	m_copydata(m0, progress, length, mtod(m_new, caddr_t));
	6662	progress += length;
	6663	m_new->m_len = length;
	6664	if (m_new != m_final) {
	6665	m_cat(m_final, m_new);
	6666	}
	6667	m_new = NULL;
	6668	}
	6669	m_freem(m0);
	6670	m0 = m_final;
	6671	return m0;
	6672	nospace:
	6673	if (m_final) {
	6674	m_freem(m_final);
	6675	}
	6676	return NULL;
	6677	}
	6678
	6679	struct mbuf *
	6680	m_defrag(struct mbuf *m0, int how)
	6681	{
	6682	return m_defrag_offset(m0, 0, how);
	6683	}
	6684
	6685	void
	6686	m_mchtype(struct mbuf *m, int t)
	6687	{
	6688	mtype_stat_inc(t);
	6689	mtype_stat_dec(m->m_type);
	6690	(m)->m_type = t;
	6691	}
	6692
	6693	void *
	6694	m_mtod(struct mbuf *m)
	6695	{
	6696	return MTOD(m, void *);
	6697	}
	6698
	6699	struct mbuf *
	6700	m_dtom(void *x)
	6701	{
	6702	return (struct mbuf *)((uintptr_t)(x) & ~(MSIZE - 1));
	6703	}
	6704
	6705	void
	6706	m_mcheck(struct mbuf *m)
	6707	{
	6708	_MCHECK(m);
	6709	}
	6710
	6711	/*
	6712	* Return a pointer to mbuf/offset of location in mbuf chain.
	6713	*/
	6714	struct mbuf *
	6715	m_getptr(struct mbuf m, int loc, int off)
	6716	{
	6717	while (loc >= 0) {
	6718	/* Normal end of search. */
	6719	if (m->m_len > loc) {
	6720	*off = loc;
	6721	return m;
	6722	} else {
	6723	loc -= m->m_len;
	6724	if (m->m_next == NULL) {
	6725	if (loc == 0) {
	6726	/* Point at the end of valid data. */
	6727	*off = m->m_len;
	6728	return m;
	6729	}
	6730	return NULL;
	6731	}
	6732	m = m->m_next;
	6733	}
	6734	}
	6735	return NULL;
	6736	}
	6737
	6738	/*
	6739	* Inform the corresponding mcache(s) that there's a waiter below.
	6740	*/
	6741	static void
	6742	mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
	6743	{
	6744	mcache_waiter_inc(m_cache(class));
	6745	if (comp) {
	6746	if (class == MC_CL) {
	6747	mcache_waiter_inc(m_cache(MC_MBUF_CL));
	6748	} else if (class == MC_BIGCL) {
	6749	mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
	6750	} else if (class == MC_16KCL) {
	6751	mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
	6752	} else {
	6753	mcache_waiter_inc(m_cache(MC_MBUF_CL));
	6754	mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
	6755	}
	6756	}
	6757	}
	6758
	6759	/*
	6760	* Inform the corresponding mcache(s) that there's no more waiter below.
	6761	*/
	6762	static void
	6763	mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
	6764	{
	6765	mcache_waiter_dec(m_cache(class));
	6766	if (comp) {
	6767	if (class == MC_CL) {
	6768	mcache_waiter_dec(m_cache(MC_MBUF_CL));
	6769	} else if (class == MC_BIGCL) {
	6770	mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
	6771	} else if (class == MC_16KCL) {
	6772	mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
	6773	} else {
	6774	mcache_waiter_dec(m_cache(MC_MBUF_CL));
	6775	mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
	6776	}
	6777	}
	6778	}
	6779
	6780	static bool mbuf_watchdog_defunct_active = false;
	6781
	6782	static uint32_t
	6783	mbuf_watchdog_socket_space(struct socket *so)
	6784	{
	6785	if (so == NULL) {
	6786	return 0;
	6787	}
	6788
	6789	return so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
	6790	}
	6791
	6792	struct mbuf_watchdog_defunct_args {
	6793	struct proc *top_app;
	6794	uint32_t top_app_space_used;
	6795	};
	6796
	6797	static int
	6798	mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
	6799	{
	6800	struct fileproc *fp = NULL;
	6801	struct mbuf_watchdog_defunct_args *args =
	6802	(struct mbuf_watchdog_defunct_args *)arg;
	6803	uint32_t space_used = 0;
	6804
	6805	proc_fdlock(p);
	6806	fdt_foreach(fp, p) {
	6807	struct fileglob *fg = fp->fp_glob;
	6808	struct socket *so = NULL;
	6809
	6810	if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
	6811	continue;
	6812	}
	6813	so = (struct socket *)fp->fp_glob->fg_data;
	6814	/*
	6815	* We calculate the space without the socket
	6816	* lock because we don't want to be blocked
	6817	* by another process that called send() and
	6818	* is stuck waiting for mbufs.
	6819	*
	6820	* These variables are 32-bit so we don't have
	6821	* to worry about incomplete reads.
	6822	*/
	6823	space_used += mbuf_watchdog_socket_space(so);
	6824	}
	6825	proc_fdunlock(p);
	6826	if (space_used > args->top_app_space_used) {
	6827	if (args->top_app != NULL) {
	6828	proc_rele(args->top_app);
	6829	}
	6830	args->top_app = p;
	6831	args->top_app_space_used = space_used;
	6832
	6833	return PROC_CLAIMED;
	6834	} else {
	6835	return PROC_RETURNED;
	6836	}
	6837	}
	6838
	6839	extern char proc_name_address(void p);
	6840
	6841	static void
	6842	mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
	6843	{
	6844	#pragma unused(arg0, arg1)
	6845	struct mbuf_watchdog_defunct_args args = {};
	6846	struct fileproc *fp = NULL;
	6847
	6848	proc_iterate(PROC_ALLPROCLIST,
	6849	mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
	6850
	6851	/*
	6852	* Defunct all sockets from this app.
	6853	*/
	6854	if (args.top_app != NULL) {
	6855	os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
	6856	__func__,
	6857	proc_name_address(args.top_app),
	6858	proc_pid(args.top_app));
	6859	proc_fdlock(args.top_app);
	6860	fdt_foreach(fp, args.top_app) {
	6861	struct fileglob *fg = fp->fp_glob;
	6862	struct socket *so = NULL;
	6863
	6864	if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
	6865	continue;
	6866	}
	6867	so = (struct socket *)fp->fp_glob->fg_data;
	6868	socket_lock(so, 0);
	6869	if (sosetdefunct(args.top_app, so,
	6870	SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
	6871	TRUE) == 0) {
	6872	sodefunct(args.top_app, so,
	6873	SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
	6874	}
	6875	socket_unlock(so, 0);
	6876	}
	6877	proc_fdunlock(args.top_app);
	6878	proc_rele(args.top_app);
	6879	mbstat.m_forcedefunct++;
	6880	}
	6881	mbuf_watchdog_defunct_active = false;
	6882	}
	6883
	6884	/*
	6885	* Called during slab (blocking and non-blocking) allocation. If there
	6886	* is at least one waiter, and the time since the first waiter is blocked
	6887	* is greater than the watchdog timeout, panic the system.
	6888	*/
	6889	static void
	6890	mbuf_watchdog(void)
	6891	{
	6892	struct timeval now;
	6893	unsigned int since;
	6894	static thread_call_t defunct_tcall = NULL;
	6895
	6896	if (mb_waiters == 0 \|\| !mb_watchdog) {
	6897	return;
	6898	}
	6899
	6900	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	6901
	6902	microuptime(&now);
	6903	since = now.tv_sec - mb_wdtstart.tv_sec;
	6904
	6905	/*
	6906	* Check if we are about to panic the system due
	6907	* to lack of mbufs and start defuncting sockets
	6908	* from processes that use too many sockets.
	6909	*
	6910	* We're always called with the mbuf_mlock held,
	6911	* so that also protects mbuf_watchdog_defunct_active.
	6912	*/
	6913	if (since >= MB_WDT_MAXTIME / 2 && !mbuf_watchdog_defunct_active) {
	6914	/*
	6915	* Start a thread to defunct sockets
	6916	* from apps that are over-using their socket
	6917	* buffers.
	6918	*/
	6919	if (defunct_tcall == NULL) {
	6920	defunct_tcall =
	6921	thread_call_allocate_with_options(mbuf_watchdog_defunct,
	6922	NULL,
	6923	THREAD_CALL_PRIORITY_KERNEL,
	6924	THREAD_CALL_OPTIONS_ONCE);
	6925	}
	6926	if (defunct_tcall != NULL) {
	6927	mbuf_watchdog_defunct_active = true;
	6928	thread_call_enter(defunct_tcall);
	6929	}
	6930	}
	6931	if (since >= MB_WDT_MAXTIME) {
	6932	panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
	6933	mb_waiters, since, mbuf_dump());
	6934	/* NOTREACHED */
	6935	}
	6936	}
	6937
	6938	/*
	6939	* Called during blocking allocation. Returns TRUE if one or more objects
	6940	* are available at the per-CPU caches layer and that allocation should be
	6941	* retried at that level.
	6942	*/
	6943	static boolean_t
	6944	mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
	6945	{
	6946	boolean_t mcache_retry = FALSE;
	6947
	6948	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	6949
	6950	/* Check if there's anything at the cache layer */
	6951	if (mbuf_cached_above(class, wait)) {
	6952	mcache_retry = TRUE;
	6953	goto done;
	6954	}
	6955
	6956	/* Nothing? Then try hard to get it from somewhere */
	6957	m_reclaim(class, num, (wait & MCR_COMP));
	6958
	6959	/* We tried hard and got something? */
	6960	if (m_infree(class) > 0) {
	6961	mbstat.m_wait++;
	6962	goto done;
	6963	} else if (mbuf_cached_above(class, wait)) {
	6964	mbstat.m_wait++;
	6965	mcache_retry = TRUE;
	6966	goto done;
	6967	} else if (wait & MCR_TRYHARD) {
	6968	mcache_retry = TRUE;
	6969	goto done;
	6970	}
	6971
	6972	/*
	6973	* There's really nothing for us right now; inform the
	6974	* cache(s) that there is a waiter below and go to sleep.
	6975	*/
	6976	mbuf_waiter_inc(class, (wait & MCR_COMP));
	6977
	6978	VERIFY(!(wait & MCR_NOSLEEP));
	6979
	6980	/*
	6981	* If this is the first waiter, arm the watchdog timer. Otherwise
	6982	* check if we need to panic the system due to watchdog timeout.
	6983	*/
	6984	if (mb_waiters == 0) {
	6985	microuptime(&mb_wdtstart);
	6986	} else {
	6987	mbuf_watchdog();
	6988	}
	6989
	6990	mb_waiters++;
	6991	m_region_expand(class) += m_total(class) + num;
	6992	/* wake up the worker thread */
	6993	if (mbuf_worker_ready &&
	6994	mbuf_worker_needs_wakeup) {
	6995	wakeup((caddr_t)&mbuf_worker_needs_wakeup);
	6996	mbuf_worker_needs_wakeup = FALSE;
	6997	}
	6998	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	6999	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	7000	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));
	7001
	7002	/* We are now up; stop getting notified until next round */
	7003	mbuf_waiter_dec(class, (wait & MCR_COMP));
	7004
	7005	/* We waited and got something */
	7006	if (m_infree(class) > 0) {
	7007	mbstat.m_wait++;
	7008	goto done;
	7009	} else if (mbuf_cached_above(class, wait)) {
	7010	mbstat.m_wait++;
	7011	mcache_retry = TRUE;
	7012	}
	7013	done:
	7014	return mcache_retry;
	7015	}
	7016
	7017	__attribute__((noreturn))
	7018	static void
	7019	mbuf_worker_thread(void)
	7020	{
	7021	int mbuf_expand;
	7022
	7023	while (1) {
	7024	lck_mtx_lock(mbuf_mlock);
	7025	mbwdog_logger("worker thread running");
	7026	mbuf_worker_run_cnt++;
	7027	mbuf_expand = 0;
	7028	/*
	7029	* Allocations are based on page size, so if we have depleted
	7030	* the reserved spaces, try to free mbufs from the major classes.
	7031	*/
	7032	#if PAGE_SIZE == 4096
	7033	uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	7034	uint32_t m_clusters = m_total(MC_CL);
	7035	uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	7036	uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
	7037	if (sumclusters >= nclusters) {
	7038	mbwdog_logger("reclaiming bigcl");
	7039	mbuf_drain_locked(TRUE);
	7040	m_reclaim(MC_BIGCL, 4, FALSE);
	7041	}
	7042	#else
	7043	uint32_t m_16kclusters = m_total(MC_16KCL);
	7044	if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
	7045	mbwdog_logger("reclaiming 16kcl");
	7046	mbuf_drain_locked(TRUE);
	7047	m_reclaim(MC_16KCL, 4, FALSE);
	7048	}
	7049	#endif
	7050	if (m_region_expand(MC_CL) > 0) {
	7051	int n;
	7052	mb_expand_cl_cnt++;
	7053	/* Adjust to current number of cluster in use */
	7054	n = m_region_expand(MC_CL) -
	7055	(m_total(MC_CL) - m_infree(MC_CL));
	7056	if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
	7057	n = m_maxlimit(MC_CL) - m_total(MC_CL);
	7058	}
	7059	if (n > 0) {
	7060	mb_expand_cl_total += n;
	7061	}
	7062	m_region_expand(MC_CL) = 0;
	7063
	7064	if (n > 0) {
	7065	mbwdog_logger("expanding MC_CL by %d", n);
	7066	freelist_populate(MC_CL, n, M_WAIT);
	7067	}
	7068	}
	7069	if (m_region_expand(MC_BIGCL) > 0) {
	7070	int n;
	7071	mb_expand_bigcl_cnt++;
	7072	/* Adjust to current number of 4 KB cluster in use */
	7073	n = m_region_expand(MC_BIGCL) -
	7074	(m_total(MC_BIGCL) - m_infree(MC_BIGCL));
	7075	if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
	7076	n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
	7077	}
	7078	if (n > 0) {
	7079	mb_expand_bigcl_total += n;
	7080	}
	7081	m_region_expand(MC_BIGCL) = 0;
	7082
	7083	if (n > 0) {
	7084	mbwdog_logger("expanding MC_BIGCL by %d", n);
	7085	freelist_populate(MC_BIGCL, n, M_WAIT);
	7086	}
	7087	}
	7088	if (m_region_expand(MC_16KCL) > 0) {
	7089	int n;
	7090	mb_expand_16kcl_cnt++;
	7091	/* Adjust to current number of 16 KB cluster in use */
	7092	n = m_region_expand(MC_16KCL) -
	7093	(m_total(MC_16KCL) - m_infree(MC_16KCL));
	7094	if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
	7095	n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
	7096	}
	7097	if (n > 0) {
	7098	mb_expand_16kcl_total += n;
	7099	}
	7100	m_region_expand(MC_16KCL) = 0;
	7101
	7102	if (n > 0) {
	7103	mbwdog_logger("expanding MC_16KCL by %d", n);
	7104	(void) freelist_populate(MC_16KCL, n, M_WAIT);
	7105	}
	7106	}
	7107
	7108	/*
	7109	* Because we can run out of memory before filling the mbuf
	7110	* map, we should not allocate more clusters than they are
	7111	* mbufs -- otherwise we could have a large number of useless
	7112	* clusters allocated.
	7113	*/
	7114	mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
	7115	m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
	7116	m_total(MC_16KCL));
	7117	uint32_t total_mbufs = m_total(MC_MBUF);
	7118	uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
	7119	m_total(MC_16KCL);
	7120	if (total_mbufs < total_clusters) {
	7121	mbwdog_logger("expanding MC_MBUF by %d",
	7122	total_clusters - total_mbufs);
	7123	}
	7124	while (total_mbufs < total_clusters) {
	7125	mb_expand_cnt++;
	7126	if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
	7127	break;
	7128	}
	7129	total_mbufs = m_total(MC_MBUF);
	7130	total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
	7131	m_total(MC_16KCL);
	7132	}
	7133
	7134	mbuf_worker_needs_wakeup = TRUE;
	7135	/*
	7136	* If there's a deadlock and we're not sending / receiving
	7137	* packets, net_uptime() won't be updated. Update it here
	7138	* so we are sure it's correct.
	7139	*/
	7140	net_update_uptime();
	7141	mbuf_worker_last_runtime = net_uptime();
	7142	assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
	7143	THREAD_UNINT);
	7144	mbwdog_logger("worker thread sleeping");
	7145	lck_mtx_unlock(mbuf_mlock);
	7146	(void) thread_block((thread_continue_t)mbuf_worker_thread);
	7147	}
	7148	}
	7149
	7150	__attribute__((noreturn))
	7151	static void
	7152	mbuf_worker_thread_init(void)
	7153	{
	7154	mbuf_worker_ready++;
	7155	mbuf_worker_thread();
	7156	}
	7157
	7158	static mcl_slab_t *
	7159	slab_get(void *buf)
	7160	{
	7161	mcl_slabg_t *slg;
	7162	unsigned int ix, k;
	7163
	7164	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	7165
	7166	VERIFY(MBUF_IN_MAP(buf));
	7167	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	7168	VERIFY(ix < maxslabgrp);
	7169
	7170	if ((slg = slabstbl[ix]) == NULL) {
	7171	/*
	7172	* In the current implementation, we never shrink the slabs
	7173	* table; if we attempt to reallocate a cluster group when
	7174	* it's already allocated, panic since this is a sign of a
	7175	* memory corruption (slabstbl[ix] got nullified).
	7176	*/
	7177	++slabgrp;
	7178	VERIFY(ix < slabgrp);
	7179	/*
	7180	* Slabs expansion can only be done single threaded; when
	7181	* we get here, it must be as a result of m_clalloc() which
	7182	* is serialized and therefore mb_clalloc_busy must be set.
	7183	*/
	7184	VERIFY(mb_clalloc_busy);
	7185	lck_mtx_unlock(mbuf_mlock);
	7186
	7187	/* This is a new buffer; create the slabs group for it */
	7188	slg = zalloc_permanent_type(mcl_slabg_t);
	7189	slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
	7190	ZALIGN(mcl_slab_t));
	7191
	7192	lck_mtx_lock(mbuf_mlock);
	7193	/*
	7194	* No other thread could have gone into m_clalloc() after
	7195	* we dropped the lock above, so verify that it's true.
	7196	*/
	7197	VERIFY(mb_clalloc_busy);
	7198
	7199	slabstbl[ix] = slg;
	7200
	7201	/* Chain each slab in the group to its forward neighbor */
	7202	for (k = 1; k < NSLABSPMB; k++) {
	7203	slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
	7204	}
	7205	VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
	7206
	7207	/* And chain the last slab in the previous group to this */
	7208	if (ix > 0) {
	7209	VERIFY(slabstbl[ix - 1]->
	7210	slg_slab[NSLABSPMB - 1].sl_next == NULL);
	7211	slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
	7212	&slg->slg_slab[0];
	7213	}
	7214	}
	7215
	7216	ix = MTOPG(buf) % NSLABSPMB;
	7217	VERIFY(ix < NSLABSPMB);
	7218
	7219	return &slg->slg_slab[ix];
	7220	}
	7221
	7222	static void
	7223	slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
	7224	void base, void head, unsigned int len, int refcnt, int chunks)
	7225	{
	7226	sp->sl_class = class;
	7227	sp->sl_flags = flags;
	7228	sp->sl_base = base;
	7229	sp->sl_head = head;
	7230	sp->sl_len = len;
	7231	sp->sl_refcnt = refcnt;
	7232	sp->sl_chunks = chunks;
	7233	slab_detach(sp);
	7234	}
	7235
	7236	static void
	7237	slab_insert(mcl_slab_t *sp, mbuf_class_t class)
	7238	{
	7239	VERIFY(slab_is_detached(sp));
	7240	m_slab_cnt(class)++;
	7241	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	7242	sp->sl_flags &= ~SLF_DETACHED;
	7243
	7244	/*
	7245	* If a buffer spans multiple contiguous pages then mark them as
	7246	* detached too
	7247	*/
	7248	if (class == MC_16KCL) {
	7249	int k;
	7250	for (k = 1; k < NSLABSP16KB; k++) {
	7251	sp = sp->sl_next;
	7252	/* Next slab must already be present */
	7253	VERIFY(sp != NULL && slab_is_detached(sp));
	7254	sp->sl_flags &= ~SLF_DETACHED;
	7255	}
	7256	}
	7257	}
	7258
	7259	static void
	7260	slab_remove(mcl_slab_t *sp, mbuf_class_t class)
	7261	{
	7262	int k;
	7263	VERIFY(!slab_is_detached(sp));
	7264	VERIFY(m_slab_cnt(class) > 0);
	7265	m_slab_cnt(class)--;
	7266	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	7267	slab_detach(sp);
	7268	if (class == MC_16KCL) {
	7269	for (k = 1; k < NSLABSP16KB; k++) {
	7270	sp = sp->sl_next;
	7271	/* Next slab must already be present */
	7272	VERIFY(sp != NULL);
	7273	VERIFY(!slab_is_detached(sp));
	7274	slab_detach(sp);
	7275	}
	7276	}
	7277	}
	7278
	7279	static boolean_t
	7280	slab_inrange(mcl_slab_t sp, void buf)
	7281	{
	7282	return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
	7283	(uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
	7284	}
	7285
	7286	#undef panic
	7287
	7288	static void
	7289	slab_nextptr_panic(mcl_slab_t sp, void addr)
	7290	{
	7291	int i;
	7292	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	7293	uintptr_t buf = (uintptr_t)sp->sl_base;
	7294
	7295	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
	7296	void next = ((mcache_obj_t )buf)->obj_next;
	7297	if (next != addr) {
	7298	continue;
	7299	}
	7300	if (!mclverify) {
	7301	if (next != NULL && !MBUF_IN_MAP(next)) {
	7302	mcache_t *cp = m_cache(sp->sl_class);
	7303	panic("%s: %s buffer %p in slab %p modified "
	7304	"after free at offset 0: %p out of range "
	7305	"[%p-%p)\n", __func__, cp->mc_name,
	7306	(void *)buf, sp, next, mbutl, embutl);
	7307	/* NOTREACHED */
	7308	}
	7309	} else {
	7310	mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
	7311	(mcache_obj_t *)buf);
	7312	mcl_audit_verify_nextptr(next, mca);
	7313	}
	7314	}
	7315	}
	7316
	7317	static void
	7318	slab_detach(mcl_slab_t *sp)
	7319	{
	7320	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	7321	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	7322	sp->sl_flags \|= SLF_DETACHED;
	7323	}
	7324
	7325	static boolean_t
	7326	slab_is_detached(mcl_slab_t *sp)
	7327	{
	7328	return (intptr_t)sp->sl_link.tqe_next == -1 &&
	7329	(intptr_t)sp->sl_link.tqe_prev == -1 &&
	7330	(sp->sl_flags & SLF_DETACHED);
	7331	}
	7332
	7333	static void
	7334	mcl_audit_init(void buf, mcache_audit_t *mca_list,
	7335	mcache_obj_t **con_list, size_t con_size, unsigned int num)
	7336	{
	7337	mcache_audit_t mca, mca_tail;
	7338	mcache_obj_t *con = NULL;
	7339	boolean_t save_contents = (con_list != NULL);
	7340	unsigned int i, ix;
	7341
	7342	ASSERT(num <= NMBPG);
	7343	ASSERT(con_list == NULL \|\| con_size != 0);
	7344
	7345	ix = MTOPG(buf);
	7346	VERIFY(ix < maxclaudit);
	7347
	7348	/* Make sure we haven't been here before */
	7349	for (i = 0; i < num; i++) {
	7350	VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	7351	}
	7352
	7353	mca = mca_tail = *mca_list;
	7354	if (save_contents) {
	7355	con = *con_list;
	7356	}
	7357
	7358	for (i = 0; i < num; i++) {
	7359	mcache_audit_t *next;
	7360
	7361	next = mca->mca_next;
	7362	bzero(mca, sizeof(*mca));
	7363	mca->mca_next = next;
	7364	mclaudit[ix].cl_audit[i] = mca;
	7365
	7366	/* Attach the contents buffer if requested */
	7367	if (save_contents) {
	7368	mcl_saved_contents_t *msc =
	7369	(mcl_saved_contents_t )(void )con;
	7370
	7371	VERIFY(msc != NULL);
	7372	VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
	7373	VERIFY(con_size == sizeof(*msc));
	7374	mca->mca_contents_size = con_size;
	7375	mca->mca_contents = msc;
	7376	con = con->obj_next;
	7377	bzero(mca->mca_contents, mca->mca_contents_size);
	7378	}
	7379
	7380	mca_tail = mca;
	7381	mca = mca->mca_next;
	7382	}
	7383
	7384	if (save_contents) {
	7385	*con_list = con;
	7386	}
	7387
	7388	*mca_list = mca_tail->mca_next;
	7389	mca_tail->mca_next = NULL;
	7390	}
	7391
	7392	static void
	7393	mcl_audit_free(void *buf, unsigned int num)
	7394	{
	7395	unsigned int i, ix;
	7396	mcache_audit_t mca, mca_list;
	7397
	7398	ix = MTOPG(buf);
	7399	VERIFY(ix < maxclaudit);
	7400
	7401	if (mclaudit[ix].cl_audit[0] != NULL) {
	7402	mca_list = mclaudit[ix].cl_audit[0];
	7403	for (i = 0; i < num; i++) {
	7404	mca = mclaudit[ix].cl_audit[i];
	7405	mclaudit[ix].cl_audit[i] = NULL;
	7406	if (mca->mca_contents) {
	7407	mcache_free(mcl_audit_con_cache,
	7408	mca->mca_contents);
	7409	}
	7410	}
	7411	mcache_free_ext(mcache_audit_cache,
	7412	(mcache_obj_t *)mca_list);
	7413	}
	7414	}
	7415
	7416	/*
	7417	* Given an address of a buffer (mbuf/2KB/4KB/16KB), return
	7418	* the corresponding audit structure for that buffer.
	7419	*/
	7420	static mcache_audit_t *
	7421	mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
	7422	{
	7423	mcache_audit_t *mca = NULL;
	7424	int ix = MTOPG(mobj), m_idx = 0;
	7425	unsigned char *page_addr;
	7426
	7427	VERIFY(ix < maxclaudit);
	7428	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
	7429
	7430	page_addr = PGTOM(ix);
	7431
	7432	switch (class) {
	7433	case MC_MBUF:
	7434	/*
	7435	* For the mbuf case, find the index of the page
	7436	* used by the mbuf and use that index to locate the
	7437	* base address of the page. Then find out the
	7438	* mbuf index relative to the page base and use
	7439	* it to locate the audit structure.
	7440	*/
	7441	m_idx = MBPAGEIDX(page_addr, mobj);
	7442	VERIFY(m_idx < (int)NMBPG);
	7443	mca = mclaudit[ix].cl_audit[m_idx];
	7444	break;
	7445
	7446	case MC_CL:
	7447	/*
	7448	* Same thing as above, but for 2KB clusters in a page.
	7449	*/
	7450	m_idx = CLPAGEIDX(page_addr, mobj);
	7451	VERIFY(m_idx < (int)NCLPG);
	7452	mca = mclaudit[ix].cl_audit[m_idx];
	7453	break;
	7454
	7455	case MC_BIGCL:
	7456	m_idx = BCLPAGEIDX(page_addr, mobj);
	7457	VERIFY(m_idx < (int)NBCLPG);
	7458	mca = mclaudit[ix].cl_audit[m_idx];
	7459	break;
	7460	case MC_16KCL:
	7461	/*
	7462	* Same as above, but only return the first element.
	7463	*/
	7464	mca = mclaudit[ix].cl_audit[0];
	7465	break;
	7466
	7467	default:
	7468	VERIFY(0);
	7469	/* NOTREACHED */
	7470	}
	7471
	7472	return mca;
	7473	}
	7474
	7475	static void
	7476	mcl_audit_mbuf(mcache_audit_t mca, void addr, boolean_t composite,
	7477	boolean_t alloc)
	7478	{
	7479	struct mbuf *m = addr;
	7480	mcache_obj_t next = ((mcache_obj_t )m)->obj_next;
	7481
	7482	VERIFY(mca->mca_contents != NULL &&
	7483	mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	7484
	7485	if (mclverify) {
	7486	mcl_audit_verify_nextptr(next, mca);
	7487	}
	7488
	7489	if (!alloc) {
	7490	/* Save constructed mbuf fields */
	7491	mcl_audit_save_mbuf(m, mca);
	7492	if (mclverify) {
	7493	mcache_set_pattern(MCACHE_FREE_PATTERN, m,
	7494	m_maxsize(MC_MBUF));
	7495	}
	7496	((mcache_obj_t *)m)->obj_next = next;
	7497	return;
	7498	}
	7499
	7500	/* Check if the buffer has been corrupted while in freelist */
	7501	if (mclverify) {
	7502	mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	7503	}
	7504	/* Restore constructed mbuf fields */
	7505	mcl_audit_restore_mbuf(m, mca, composite);
	7506	}
	7507
	7508	static void
	7509	mcl_audit_restore_mbuf(struct mbuf m, mcache_audit_t mca, boolean_t composite)
	7510	{
	7511	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
	7512
	7513	if (composite) {
	7514	struct mbuf *next = m->m_next;
	7515	VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
	7516	MBUF_IS_COMPOSITE(ms));
	7517	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	7518	/*
	7519	* We could have hand-picked the mbuf fields and restore
	7520	* them individually, but that will be a maintenance
	7521	* headache. Instead, restore everything that was saved;
	7522	* the mbuf layer will recheck and reinitialize anyway.
	7523	*/
	7524	bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
	7525	m->m_next = next;
	7526	} else {
	7527	/*
	7528	* For a regular mbuf (no cluster attached) there's nothing
	7529	* to restore other than the type field, which is expected
	7530	* to be MT_FREE.
	7531	*/
	7532	m->m_type = ms->m_type;
	7533	}
	7534	_MCHECK(m);
	7535	}
	7536
	7537	static void
	7538	mcl_audit_save_mbuf(struct mbuf m, mcache_audit_t mca)
	7539	{
	7540	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	7541	_MCHECK(m);
	7542	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
	7543	}
	7544
	7545	static void
	7546	mcl_audit_cluster(mcache_audit_t mca, void addr, size_t size, boolean_t alloc,
	7547	boolean_t save_next)
	7548	{
	7549	mcache_obj_t next = ((mcache_obj_t )addr)->obj_next;
	7550
	7551	if (!alloc) {
	7552	if (mclverify) {
	7553	mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
	7554	}
	7555	if (save_next) {
	7556	mcl_audit_verify_nextptr(next, mca);
	7557	((mcache_obj_t *)addr)->obj_next = next;
	7558	}
	7559	} else if (mclverify) {
	7560	/* Check if the buffer has been corrupted while in freelist */
	7561	mcl_audit_verify_nextptr(next, mca);
	7562	mcache_audit_free_verify_set(mca, addr, 0, size);
	7563	}
	7564	}
	7565
	7566	static void
	7567	mcl_audit_scratch(mcache_audit_t *mca)
	7568	{
	7569	void *stack[MCACHE_STACK_DEPTH + 1];
	7570	mcl_scratch_audit_t *msa;
	7571	struct timeval now;
	7572
	7573	VERIFY(mca->mca_contents != NULL);
	7574	msa = MCA_SAVED_SCRATCH_PTR(mca);
	7575
	7576	msa->msa_pthread = msa->msa_thread;
	7577	msa->msa_thread = current_thread();
	7578	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	7579	msa->msa_pdepth = msa->msa_depth;
	7580	bzero(stack, sizeof(stack));
	7581	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	7582	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));
	7583
	7584	msa->msa_ptstamp = msa->msa_tstamp;
	7585	microuptime(&now);
	7586	/* tstamp is in ms relative to base_ts */
	7587	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	7588	if ((now.tv_sec - mb_start.tv_sec) > 0) {
	7589	msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	7590	}
	7591	}
	7592
	7593	__abortlike
	7594	static void
	7595	mcl_audit_mcheck_panic(struct mbuf *m)
	7596	{
	7597	char buf[DUMP_MCA_BUF_SIZE];
	7598	mcache_audit_t *mca;
	7599
	7600	MRANGE(m);
	7601	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
	7602
	7603	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
	7604	m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
	7605	/* NOTREACHED */
	7606	}
	7607
	7608	__abortlike
	7609	static void
	7610	mcl_audit_verify_nextptr_panic(void next, mcache_audit_t mca)
	7611	{
	7612	char buf[DUMP_MCA_BUF_SIZE];
	7613	panic("mcl_audit: buffer %p modified after free at offset 0: "
	7614	"%p out of range [%p-%p)\n%s\n",
	7615	mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
	7616	/* NOTREACHED */
	7617	}
	7618
	7619	static void
	7620	mcl_audit_verify_nextptr(void next, mcache_audit_t mca)
	7621	{
	7622	if (next != NULL && !MBUF_IN_MAP(next) &&
	7623	(next != (void *)MCACHE_FREE_PATTERN \|\| !mclverify)) {
	7624	mcl_audit_verify_nextptr_panic(next, mca);
	7625	}
	7626	}
	7627
	7628	/* This function turns on mbuf leak detection */
	7629	static void
	7630	mleak_activate(void)
	7631	{
	7632	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	7633	PE_parse_boot_argn("mleak_sample_factor",
	7634	&mleak_table.mleak_sample_factor,
	7635	sizeof(mleak_table.mleak_sample_factor));
	7636
	7637	if (mleak_table.mleak_sample_factor == 0) {
	7638	mclfindleak = 0;
	7639	}
	7640
	7641	if (mclfindleak == 0) {
	7642	return;
	7643	}
	7644
	7645	vm_size_t alloc_size =
	7646	mleak_alloc_buckets * sizeof(struct mallocation);
	7647	vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);
	7648
	7649	mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
	7650	mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
	7651	mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	7652	ZALIGN(mleak_stat_t));
	7653
	7654	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
	7655	#ifdef __LP64__
	7656	mleak_stat->ml_isaddr64 = 1;
	7657	#endif /* __LP64__ */
	7658	}
	7659
	7660	static void
	7661	mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
	7662	{
	7663	int temp;
	7664
	7665	if (mclfindleak == 0) {
	7666	return;
	7667	}
	7668
	7669	if (!alloc) {
	7670	return mleak_free(addr);
	7671	}
	7672
	7673	temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
	7674
	7675	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
	7676	uintptr_t bt[MLEAK_STACK_DEPTH];
	7677	int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL);
	7678	mleak_log(bt, addr, logged, num);
	7679	}
	7680	}
	7681
	7682	/*
	7683	* This function records the allocation in the mleak_allocations table
	7684	* and the backtrace in the mleak_traces table; if allocation slot is in use,
	7685	* replace old allocation with new one if the trace slot is in use, return
	7686	* (or increment refcount if same trace).
	7687	*/
	7688	static boolean_t
	7689	mleak_log(uintptr_t bt, mcache_obj_t addr, uint32_t depth, int num)
	7690	{
	7691	struct mallocation *allocation;
	7692	struct mtrace *trace;
	7693	uint32_t trace_index;
	7694
	7695	/* Quit if someone else modifying the tables */
	7696	if (!lck_mtx_try_lock_spin(mleak_lock)) {
	7697	mleak_table.total_conflicts++;
	7698	return FALSE;
	7699	}
	7700
	7701	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	7702	mleak_alloc_buckets)];
	7703	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
	7704	trace = &mleak_traces[trace_index];
	7705
	7706	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
	7707	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
	7708
	7709	allocation->hitcount++;
	7710	trace->hitcount++;
	7711
	7712	/*
	7713	* If the allocation bucket we want is occupied
	7714	* and the occupier has the same trace, just bail.
	7715	*/
	7716	if (allocation->element != NULL &&
	7717	trace_index == allocation->trace_index) {
	7718	mleak_table.alloc_collisions++;
	7719	lck_mtx_unlock(mleak_lock);
	7720	return TRUE;
	7721	}
	7722
	7723	/*
	7724	* Store the backtrace in the traces array;
	7725	* Size of zero = trace bucket is free.
	7726	*/
	7727	if (trace->allocs > 0 &&
	7728	bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
	7729	/* Different, unique trace, but the same hash! Bail out. */
	7730	trace->collisions++;
	7731	mleak_table.trace_collisions++;
	7732	lck_mtx_unlock(mleak_lock);
	7733	return TRUE;
	7734	} else if (trace->allocs > 0) {
	7735	/* Same trace, already added, so increment refcount */
	7736	trace->allocs++;
	7737	} else {
	7738	/* Found an unused trace bucket, so record the trace here */
	7739	if (trace->depth != 0) {
	7740	/* this slot previously used but not currently in use */
	7741	mleak_table.trace_overwrites++;
	7742	}
	7743	mleak_table.trace_recorded++;
	7744	trace->allocs = 1;
	7745	memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
	7746	trace->depth = depth;
	7747	trace->collisions = 0;
	7748	}
	7749
	7750	/* Step 2: Store the allocation record in the allocations array */
	7751	if (allocation->element != NULL) {
	7752	/*
	7753	* Replace an existing allocation. No need to preserve
	7754	* because only a subset of the allocations are being
	7755	* recorded anyway.
	7756	*/
	7757	mleak_table.alloc_collisions++;
	7758	} else if (allocation->trace_index != 0) {
	7759	mleak_table.alloc_overwrites++;
	7760	}
	7761	allocation->element = addr;
	7762	allocation->trace_index = trace_index;
	7763	allocation->count = num;
	7764	mleak_table.alloc_recorded++;
	7765	mleak_table.outstanding_allocs++;
	7766
	7767	lck_mtx_unlock(mleak_lock);
	7768	return TRUE;
	7769	}
	7770
	7771	static void
	7772	mleak_free(mcache_obj_t *addr)
	7773	{
	7774	while (addr != NULL) {
	7775	struct mallocation *allocation = &mleak_allocations
	7776	[hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
	7777
	7778	if (allocation->element == addr &&
	7779	allocation->trace_index < mleak_trace_buckets) {
	7780	lck_mtx_lock_spin(mleak_lock);
	7781	if (allocation->element == addr &&
	7782	allocation->trace_index < mleak_trace_buckets) {
	7783	struct mtrace *trace;
	7784	trace = &mleak_traces[allocation->trace_index];
	7785	/* allocs = 0 means trace bucket is unused */
	7786	if (trace->allocs > 0) {
	7787	trace->allocs--;
	7788	}
	7789	if (trace->allocs == 0) {
	7790	trace->depth = 0;
	7791	}
	7792	/* NULL element means alloc bucket is unused */
	7793	allocation->element = NULL;
	7794	mleak_table.outstanding_allocs--;
	7795	}
	7796	lck_mtx_unlock(mleak_lock);
	7797	}
	7798	addr = addr->obj_next;
	7799	}
	7800	}
	7801
	7802	static void
	7803	mleak_sort_traces()
	7804	{
	7805	int i, j, k;
	7806	struct mtrace *swap;
	7807
	7808	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
	7809	mleak_top_trace[i] = NULL;
	7810	}
	7811
	7812	for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
	7813	if (mleak_traces[i].allocs <= 0) {
	7814	continue;
	7815	}
	7816
	7817	mleak_top_trace[j] = &mleak_traces[i];
	7818	for (k = j; k > 0; k--) {
	7819	if (mleak_top_trace[k]->allocs <=
	7820	mleak_top_trace[k - 1]->allocs) {
	7821	break;
	7822	}
	7823
	7824	swap = mleak_top_trace[k - 1];
	7825	mleak_top_trace[k - 1] = mleak_top_trace[k];
	7826	mleak_top_trace[k] = swap;
	7827	}
	7828	j++;
	7829	}
	7830
	7831	j--;
	7832	for (; i < mleak_trace_buckets; i++) {
	7833	if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
	7834	continue;
	7835	}
	7836
	7837	mleak_top_trace[j] = &mleak_traces[i];
	7838
	7839	for (k = j; k > 0; k--) {
	7840	if (mleak_top_trace[k]->allocs <=
	7841	mleak_top_trace[k - 1]->allocs) {
	7842	break;
	7843	}
	7844
	7845	swap = mleak_top_trace[k - 1];
	7846	mleak_top_trace[k - 1] = mleak_top_trace[k];
	7847	mleak_top_trace[k] = swap;
	7848	}
	7849	}
	7850	}
	7851
	7852	static void
	7853	mleak_update_stats()
	7854	{
	7855	mleak_trace_stat_t *mltr;
	7856	int i;
	7857
	7858	VERIFY(mleak_stat != NULL);
	7859	#ifdef __LP64__
	7860	VERIFY(mleak_stat->ml_isaddr64);
	7861	#else
	7862	VERIFY(!mleak_stat->ml_isaddr64);
	7863	#endif /* !__LP64__ */
	7864	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
	7865
	7866	mleak_sort_traces();
	7867
	7868	mltr = &mleak_stat->ml_trace[0];
	7869	bzero(mltr, sizeof(mltr) MLEAK_NUM_TRACES);
	7870	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
	7871	int j;
	7872
	7873	if (mleak_top_trace[i] == NULL \|\|
	7874	mleak_top_trace[i]->allocs == 0) {
	7875	continue;
	7876	}
	7877
	7878	mltr->mltr_collisions = mleak_top_trace[i]->collisions;
	7879	mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
	7880	mltr->mltr_allocs = mleak_top_trace[i]->allocs;
	7881	mltr->mltr_depth = mleak_top_trace[i]->depth;
	7882
	7883	VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
	7884	for (j = 0; j < mltr->mltr_depth; j++) {
	7885	mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
	7886	}
	7887
	7888	mltr++;
	7889	}
	7890	}
	7891
	7892	static struct mbtypes {
	7893	int mt_type;
	7894	const char *mt_name;
	7895	} mbtypes[] = {
	7896	{ MT_DATA, "data" },
	7897	{ MT_OOBDATA, "oob data" },
	7898	{ MT_CONTROL, "ancillary data" },
	7899	{ MT_HEADER, "packet headers" },
	7900	{ MT_SOCKET, "socket structures" },
	7901	{ MT_PCB, "protocol control blocks" },
	7902	{ MT_RTABLE, "routing table entries" },
	7903	{ MT_HTABLE, "IMP host table entries" },
	7904	{ MT_ATABLE, "address resolution tables" },
	7905	{ MT_FTABLE, "fragment reassembly queue headers" },
	7906	{ MT_SONAME, "socket names and addresses" },
	7907	{ MT_SOOPTS, "socket options" },
	7908	{ MT_RIGHTS, "access rights" },
	7909	{ MT_IFADDR, "interface addresses" },
	7910	{ MT_TAG, "packet tags" },
	7911	{ 0, NULL }
	7912	};
	7913
	7914	#define MBUF_DUMP_BUF_CHK() { \
	7915	clen -= k; \
	7916	if (clen < 1) \
	7917	goto done; \
	7918	c += k; \
	7919	}
	7920
	7921	static char *
	7922	mbuf_dump(void)
	7923	{
	7924	unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
	7925	totreturned = 0;
	7926	u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
	7927	u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
	7928	u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
	7929	int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
	7930	uint8_t seen[256];
	7931	struct mbtypes *mp;
	7932	mb_class_stat_t *sp;
	7933	mleak_trace_stat_t *mltr;
	7934	char *c = mbuf_dump_buf;
	7935	int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
	7936	bool printed_banner = false;
	7937
	7938	mbuf_dump_buf[0] = '\0';
	7939
	7940	/* synchronize all statistics in the mbuf table */
	7941	mbuf_stat_sync();
	7942	mbuf_mtypes_sync(TRUE);
	7943
	7944	sp = &mb_stat->mbs_class[0];
	7945	for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
	7946	u_int32_t mem;
	7947
	7948	if (m_class(i) == MC_MBUF) {
	7949	m_mbufs = sp->mbcl_active;
	7950	} else if (m_class(i) == MC_CL) {
	7951	m_clfree = sp->mbcl_total - sp->mbcl_active;
	7952	} else if (m_class(i) == MC_BIGCL) {
	7953	m_bigclfree = sp->mbcl_total - sp->mbcl_active;
	7954	} else if (njcl > 0 && m_class(i) == MC_16KCL) {
	7955	m_16kclfree = sp->mbcl_total - sp->mbcl_active;
	7956	m_16kclusters = sp->mbcl_total;
	7957	} else if (m_class(i) == MC_MBUF_CL) {
	7958	m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
	7959	} else if (m_class(i) == MC_MBUF_BIGCL) {
	7960	m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
	7961	} else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
	7962	m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
	7963	}
	7964
	7965	mem = sp->mbcl_ctotal * sp->mbcl_size;
	7966	totmem += mem;
	7967	totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
	7968	sp->mbcl_size;
	7969	totreturned += sp->mbcl_release_cnt;
	7970	}
	7971
	7972	/* adjust free counts to include composite caches */
	7973	m_clfree += m_mbufclfree;
	7974	m_bigclfree += m_mbufbigclfree;
	7975	m_16kclfree += m_mbuf16kclfree;
	7976
	7977	totmbufs = 0;
	7978	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
	7979	totmbufs += mbstat.m_mtypes[mp->mt_type];
	7980	}
	7981	if (totmbufs > m_mbufs) {
	7982	totmbufs = m_mbufs;
	7983	}
	7984	k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
	7985	MBUF_DUMP_BUF_CHK();
	7986
	7987	bzero(&seen, sizeof(seen));
	7988	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
	7989	if (mbstat.m_mtypes[mp->mt_type] != 0) {
	7990	seen[mp->mt_type] = 1;
	7991	k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
	7992	mbstat.m_mtypes[mp->mt_type], mp->mt_name);
	7993	MBUF_DUMP_BUF_CHK();
	7994	}
	7995	}
	7996	seen[MT_FREE] = 1;
	7997	for (i = 0; i < nmbtypes; i++) {
	7998	if (!seen[i] && mbstat.m_mtypes[i] != 0) {
	7999	k = scnprintf(c, clen, "\t%u mbufs allocated to "
	8000	"<mbuf type %d>\n", mbstat.m_mtypes[i], i);
	8001	MBUF_DUMP_BUF_CHK();
	8002	}
	8003	}
	8004	if ((m_mbufs - totmbufs) > 0) {
	8005	k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
	8006	m_mbufs - totmbufs);
	8007	MBUF_DUMP_BUF_CHK();
	8008	}
	8009	k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
	8010	"%u/%u mbuf 4KB clusters in use\n",
	8011	(unsigned int)(mbstat.m_clusters - m_clfree),
	8012	(unsigned int)mbstat.m_clusters,
	8013	(unsigned int)(mbstat.m_bigclusters - m_bigclfree),
	8014	(unsigned int)mbstat.m_bigclusters);
	8015	MBUF_DUMP_BUF_CHK();
	8016
	8017	if (njcl > 0) {
	8018	k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
	8019	m_16kclusters - m_16kclfree, m_16kclusters,
	8020	njclbytes / 1024);
	8021	MBUF_DUMP_BUF_CHK();
	8022	}
	8023	totused = totmem - totfree;
	8024	if (totmem == 0) {
	8025	totpct = 0;
	8026	} else if (totused < (ULONG_MAX / 100)) {
	8027	totpct = (totused * 100) / totmem;
	8028	} else {
	8029	u_long totmem1 = totmem / 100;
	8030	u_long totused1 = totused / 100;
	8031	totpct = (totused1 * 100) / totmem1;
	8032	}
	8033	k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
	8034	"in use)\n", totmem / 1024, totpct);
	8035	MBUF_DUMP_BUF_CHK();
	8036	k = scnprintf(c, clen, "%lu KB returned to the system\n",
	8037	totreturned / 1024);
	8038	MBUF_DUMP_BUF_CHK();
	8039
	8040	net_update_uptime();
	8041	k = scnprintf(c, clen,
	8042	"VM allocation failures: contiguous %u, normal %u, one page %u\n",
	8043	mb_kmem_contig_failed, mb_kmem_failed, mb_kmem_one_failed);
	8044	MBUF_DUMP_BUF_CHK();
	8045	if (mb_kmem_contig_failed_ts \|\| mb_kmem_failed_ts \|\|
	8046	mb_kmem_one_failed_ts) {
	8047	k = scnprintf(c, clen,
	8048	"VM allocation failure timestamps: contiguous %llu "
	8049	"(size %llu), normal %llu (size %llu), one page %llu "
	8050	"(now %llu)\n",
	8051	mb_kmem_contig_failed_ts, mb_kmem_contig_failed_size,
	8052	mb_kmem_failed_ts, mb_kmem_failed_size,
	8053	mb_kmem_one_failed_ts, net_uptime());
	8054	MBUF_DUMP_BUF_CHK();
	8055	k = scnprintf(c, clen,
	8056	"VM return codes: ");
	8057	MBUF_DUMP_BUF_CHK();
	8058	for (i = 0;
	8059	i < sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]);
	8060	i++) {
	8061	k = scnprintf(c, clen, "%s: %u ", mb_kmem_stats_labels[i],
	8062	mb_kmem_stats[i]);
	8063	MBUF_DUMP_BUF_CHK();
	8064	}
	8065	k = scnprintf(c, clen, "\n");
	8066	MBUF_DUMP_BUF_CHK();
	8067	}
	8068	k = scnprintf(c, clen,
	8069	"worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
	8070	"bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
	8071	mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
	8072	mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
	8073	mb_expand_16kcl_total);
	8074	MBUF_DUMP_BUF_CHK();
	8075	if (mbuf_worker_last_runtime != 0) {
	8076	k = scnprintf(c, clen, "worker thread last run time: "
	8077	"%llu (%llu seconds ago)\n",
	8078	mbuf_worker_last_runtime,
	8079	net_uptime() - mbuf_worker_last_runtime);
	8080	MBUF_DUMP_BUF_CHK();
	8081	}
	8082	if (mbuf_drain_last_runtime != 0) {
	8083	k = scnprintf(c, clen, "drain routine last run time: "
	8084	"%llu (%llu seconds ago)\n",
	8085	mbuf_drain_last_runtime,
	8086	net_uptime() - mbuf_drain_last_runtime);
	8087	MBUF_DUMP_BUF_CHK();
	8088	}
	8089
	8090	#if DEBUG \|\| DEVELOPMENT
	8091	k = scnprintf(c, clen, "\nworker thread log:\n%s\n", mbwdog_logging);
	8092	MBUF_DUMP_BUF_CHK();
	8093	#endif
	8094
	8095	for (j = 0; j < MTRACELARGE_NUM_TRACES; j++) {
	8096	struct mtracelarge *trace = &mtracelarge_table[j];
	8097	if (trace->size == 0 \|\| trace->depth == 0) {
	8098	continue;
	8099	}
	8100	if (printed_banner == false) {
	8101	k = scnprintf(c, clen,
	8102	"\nlargest allocation failure backtraces:\n");
	8103	MBUF_DUMP_BUF_CHK();
	8104	printed_banner = true;
	8105	}
	8106	k = scnprintf(c, clen, "size %llu: < ", trace->size);
	8107	MBUF_DUMP_BUF_CHK();
	8108	for (i = 0; i < trace->depth; i++) {
	8109	if (mleak_stat->ml_isaddr64) {
	8110	k = scnprintf(c, clen, "0x%0llx ",
	8111	(uint64_t)VM_KERNEL_UNSLIDE(
	8112	trace->addr[i]));
	8113	} else {
	8114	k = scnprintf(c, clen,
	8115	"0x%08x ",
	8116	(uint32_t)VM_KERNEL_UNSLIDE(
	8117	trace->addr[i]));
	8118	}
	8119	MBUF_DUMP_BUF_CHK();
	8120	}
	8121	k = scnprintf(c, clen, ">\n");
	8122	MBUF_DUMP_BUF_CHK();
	8123	}
	8124
	8125	/* mbuf leak detection statistics */
	8126	mleak_update_stats();
	8127
	8128	k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
	8129	MBUF_DUMP_BUF_CHK();
	8130	k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
	8131	mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
	8132	mleak_table.mleak_sample_factor);
	8133	MBUF_DUMP_BUF_CHK();
	8134	k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
	8135	mleak_table.outstanding_allocs);
	8136	MBUF_DUMP_BUF_CHK();
	8137	k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
	8138	mleak_table.alloc_recorded, mleak_table.trace_recorded);
	8139	MBUF_DUMP_BUF_CHK();
	8140	k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
	8141	mleak_table.alloc_collisions, mleak_table.trace_collisions);
	8142	MBUF_DUMP_BUF_CHK();
	8143	k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
	8144	mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
	8145	MBUF_DUMP_BUF_CHK();
	8146	k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
	8147	mleak_table.total_conflicts);
	8148	MBUF_DUMP_BUF_CHK();
	8149
	8150	k = scnprintf(c, clen, "top %d outstanding traces:\n",
	8151	mleak_stat->ml_cnt);
	8152	MBUF_DUMP_BUF_CHK();
	8153	for (i = 0; i < mleak_stat->ml_cnt; i++) {
	8154	mltr = &mleak_stat->ml_trace[i];
	8155	k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
	8156	"%llu hit(s), %llu collision(s)\n", (i + 1),
	8157	mltr->mltr_allocs, mltr->mltr_hitcount,
	8158	mltr->mltr_collisions);
	8159	MBUF_DUMP_BUF_CHK();
	8160	}
	8161
	8162	if (mleak_stat->ml_isaddr64) {
	8163	k = scnprintf(c, clen, MB_LEAK_HDR_64);
	8164	} else {
	8165	k = scnprintf(c, clen, MB_LEAK_HDR_32);
	8166	}
	8167	MBUF_DUMP_BUF_CHK();
	8168
	8169	for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
	8170	k = scnprintf(c, clen, "%2d: ", (i + 1));
	8171	MBUF_DUMP_BUF_CHK();
	8172	for (j = 0; j < mleak_stat->ml_cnt; j++) {
	8173	mltr = &mleak_stat->ml_trace[j];
	8174	if (i < mltr->mltr_depth) {
	8175	if (mleak_stat->ml_isaddr64) {
	8176	k = scnprintf(c, clen, "0x%0llx ",
	8177	(uint64_t)VM_KERNEL_UNSLIDE(
	8178	mltr->mltr_addr[i]));
	8179	} else {
	8180	k = scnprintf(c, clen,
	8181	"0x%08x ",
	8182	(uint32_t)VM_KERNEL_UNSLIDE(
	8183	mltr->mltr_addr[i]));
	8184	}
	8185	} else {
	8186	if (mleak_stat->ml_isaddr64) {
	8187	k = scnprintf(c, clen,
	8188	MB_LEAK_SPACING_64);
	8189	} else {
	8190	k = scnprintf(c, clen,
	8191	MB_LEAK_SPACING_32);
	8192	}
	8193	}
	8194	MBUF_DUMP_BUF_CHK();
	8195	}
	8196	k = scnprintf(c, clen, "\n");
	8197	MBUF_DUMP_BUF_CHK();
	8198	}
	8199	done:
	8200	return mbuf_dump_buf;
	8201	}
	8202
	8203	#undef MBUF_DUMP_BUF_CHK
	8204
	8205	/*
	8206	* Convert between a regular and a packet header mbuf. Caller is responsible
	8207	* for setting or clearing M_PKTHDR; this routine does the rest of the work.
	8208	*/
	8209	int
	8210	m_reinit(struct mbuf *m, int hdr)
	8211	{
	8212	int ret = 0;
	8213
	8214	if (hdr) {
	8215	VERIFY(!(m->m_flags & M_PKTHDR));
	8216	if (!(m->m_flags & M_EXT) &&
	8217	(m->m_data != m->m_dat \|\| m->m_len > 0)) {
	8218	/*
	8219	* If there's no external cluster attached and the
	8220	* mbuf appears to contain user data, we cannot
	8221	* safely convert this to a packet header mbuf,
	8222	* as the packet header structure might overlap
	8223	* with the data.
	8224	*/
	8225	printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
	8226	"m_data %llx (expected %llx), "
	8227	"m_len %d (expected 0)\n",
	8228	__func__,
	8229	(uint64_t)VM_KERNEL_ADDRPERM(m),
	8230	(uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
	8231	(uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
	8232	ret = EBUSY;
	8233	} else {
	8234	VERIFY((m->m_flags & M_EXT) \|\| m->m_data == m->m_dat);
	8235	m->m_flags \|= M_PKTHDR;
	8236	MBUF_INIT_PKTHDR(m);
	8237	}
	8238	} else {
	8239	/* Check for scratch area overflow */
	8240	m_redzone_verify(m);
	8241	/* Free the aux data and tags if there is any */
	8242	m_tag_delete_chain(m, NULL);
	8243	m->m_flags &= ~M_PKTHDR;
	8244	}
	8245
	8246	return ret;
	8247	}
	8248
	8249	int
	8250	m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
	8251	{
	8252	ASSERT(m->m_flags & M_EXT);
	8253	return atomic_test_set_32(&MEXT_PRIV(m), o, n);
	8254	}
	8255
	8256	uint32_t
	8257	m_ext_get_prop(struct mbuf *m)
	8258	{
	8259	ASSERT(m->m_flags & M_EXT);
	8260	return MEXT_PRIV(m);
	8261	}
	8262
	8263	int
	8264	m_ext_paired_is_active(struct mbuf *m)
	8265	{
	8266	return MBUF_IS_PAIRED(m) ? (MEXT_PREF(m) > MEXT_MINREF(m)) : 1;
	8267	}
	8268
	8269	void
	8270	m_ext_paired_activate(struct mbuf *m)
	8271	{
	8272	struct ext_ref *rfa;
	8273	int hdr, type;
	8274	caddr_t extbuf;
	8275	m_ext_free_func_t extfree;
	8276	u_int extsize;
	8277
	8278	VERIFY(MBUF_IS_PAIRED(m));
	8279	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
	8280	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
	8281
	8282	hdr = (m->m_flags & M_PKTHDR);
	8283	type = m->m_type;
	8284	extbuf = m->m_ext.ext_buf;
	8285	extfree = m_get_ext_free(m);
	8286	extsize = m->m_ext.ext_size;
	8287	rfa = m_get_rfa(m);
	8288
	8289	VERIFY(extbuf != NULL && rfa != NULL);
	8290
	8291	/*
	8292	* Safe to reinitialize packet header tags, since it's
	8293	* already taken care of at m_free() time. Similar to
	8294	* what's done in m_clattach() for the cluster. Bump
	8295	* up MEXT_PREF to indicate activation.
	8296	*/
	8297	MBUF_INIT(m, hdr, type);
	8298	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	8299	1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
	8300	}
	8301
	8302	void
	8303	m_scratch_init(struct mbuf *m)
	8304	{
	8305	struct pkthdr *pkt = &m->m_pkthdr;
	8306
	8307	VERIFY(m->m_flags & M_PKTHDR);
	8308
	8309	/* See comments in <rdar://problem/14040693> */
	8310	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
	8311	panic_plain("Invalid attempt to modify guarded module-private "
	8312	"area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
	8313	/* NOTREACHED */
	8314	}
	8315
	8316	bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
	8317	}
	8318
	8319	/*
	8320	* This routine is reserved for mbuf_get_driver_scratch(); clients inside
	8321	* xnu that intend on utilizing the module-private area should directly
	8322	* refer to the pkt_mpriv structure in the pkthdr. They are also expected
	8323	* to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
	8324	* to handing it off to another module, respectively.
	8325	*/
	8326	u_int32_t
	8327	m_scratch_get(struct mbuf m, u_int8_t *p)
	8328	{
	8329	struct pkthdr *pkt = &m->m_pkthdr;
	8330
	8331	VERIFY(m->m_flags & M_PKTHDR);
	8332
	8333	/* See comments in <rdar://problem/14040693> */
	8334	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
	8335	panic_plain("Invalid attempt to access guarded module-private "
	8336	"area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
	8337	/* NOTREACHED */
	8338	}
	8339
	8340	if (mcltrace) {
	8341	mcache_audit_t *mca;
	8342
	8343	lck_mtx_lock(mbuf_mlock);
	8344	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
	8345	if (mca->mca_uflags & MB_SCVALID) {
	8346	mcl_audit_scratch(mca);
	8347	}
	8348	lck_mtx_unlock(mbuf_mlock);
	8349	}
	8350
	8351	p = (u_int8_t )&pkt->pkt_mpriv;
	8352	return sizeof(pkt->pkt_mpriv);
	8353	}
	8354
	8355	static void
	8356	m_redzone_init(struct mbuf *m)
	8357	{
	8358	VERIFY(m->m_flags & M_PKTHDR);
	8359	/*
	8360	* Each mbuf has a unique red zone pattern, which is a XOR
	8361	* of the red zone cookie and the address of the mbuf.
	8362	*/
	8363	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
	8364	}
	8365
	8366	static void
	8367	m_redzone_verify(struct mbuf *m)
	8368	{
	8369	u_int32_t mb_redzone;
	8370
	8371	VERIFY(m->m_flags & M_PKTHDR);
	8372
	8373	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
	8374	if (m->m_pkthdr.redzone != mb_redzone) {
	8375	panic("mbuf %p redzone violation with value 0x%x "
	8376	"(instead of 0x%x, using cookie 0x%x)\n",
	8377	m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
	8378	/* NOTREACHED */
	8379	}
	8380	}
	8381
	8382	__private_extern__ inline void
	8383	m_set_ext(struct mbuf m, struct ext_ref rfa, m_ext_free_func_t ext_free,
	8384	caddr_t ext_arg)
	8385	{
	8386	VERIFY(m->m_flags & M_EXT);
	8387	if (rfa != NULL) {
	8388	m->m_ext.ext_refflags =
	8389	(struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
	8390	if (ext_free != NULL) {
	8391	rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
	8392	mb_obscure_extfree;
	8393	uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
	8394	m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	8395	if (ext_arg != NULL) {
	8396	m->m_ext.ext_arg =
	8397	(caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
	8398	} else {
	8399	m->m_ext.ext_arg = NULL;
	8400	}
	8401	} else {
	8402	rfa->ext_token = 0;
	8403	m->m_ext.ext_free = NULL;
	8404	m->m_ext.ext_arg = NULL;
	8405	}
	8406	} else {
	8407	/*
	8408	* If we are going to loose the cookie in ext_token by
	8409	* resetting the rfa, we should use the global cookie
	8410	* to obscure the ext_free and ext_arg pointers.
	8411	*/
	8412	if (ext_free != NULL) {
	8413	uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
	8414	m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	8415	if (ext_arg != NULL) {
	8416	m->m_ext.ext_arg =
	8417	(caddr_t)((uintptr_t)ext_arg ^
	8418	mb_obscure_extfree);
	8419	} else {
	8420	m->m_ext.ext_arg = NULL;
	8421	}
	8422	} else {
	8423	m->m_ext.ext_free = NULL;
	8424	m->m_ext.ext_arg = NULL;
	8425	}
	8426	m->m_ext.ext_refflags = NULL;
	8427	}
	8428	}
	8429
	8430	__private_extern__ inline struct ext_ref *
	8431	m_get_rfa(struct mbuf *m)
	8432	{
	8433	if (m->m_ext.ext_refflags == NULL) {
	8434	return NULL;
	8435	} else {
	8436	return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
	8437	}
	8438	}
	8439
	8440	__private_extern__ inline m_ext_free_func_t
	8441	m_get_ext_free(struct mbuf *m)
	8442	{
	8443	struct ext_ref *rfa;
	8444	if (m->m_ext.ext_free == NULL) {
	8445	return NULL;
	8446	}
	8447
	8448	rfa = m_get_rfa(m);
	8449	if (rfa == NULL) {
	8450	uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
	8451	return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	8452	} else {
	8453	uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
	8454	return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	8455	}
	8456	}
	8457
	8458	__private_extern__ inline caddr_t
	8459	m_get_ext_arg(struct mbuf *m)
	8460	{
	8461	struct ext_ref *rfa;
	8462	if (m->m_ext.ext_arg == NULL) {
	8463	return NULL;
	8464	}
	8465
	8466	rfa = m_get_rfa(m);
	8467	if (rfa == NULL) {
	8468	return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
	8469	} else {
	8470	return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
	8471	rfa->ext_token);
	8472	}
	8473	}
	8474
	8475	/*
	8476	* Send a report of mbuf usage if the usage is at least 6% of max limit
	8477	* or if there has been at least 3% increase since the last report.
	8478	*
	8479	* The values 6% and 3% are chosen so that we can do simple arithmetic
	8480	* with shift operations.
	8481	*/
	8482	static boolean_t
	8483	mbuf_report_usage(mbuf_class_t cl)
	8484	{
	8485	/* if a report is already in progress, nothing to do */
	8486	if (mb_peak_newreport) {
	8487	return TRUE;
	8488	}
	8489
	8490	if (m_total(cl) > m_peak(cl) &&
	8491	m_total(cl) >= (m_maxlimit(cl) >> 4) &&
	8492	(m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5)) {
	8493	return TRUE;
	8494	}
	8495	return FALSE;
	8496	}
	8497
	8498	__private_extern__ void
	8499	mbuf_report_peak_usage(void)
	8500	{
	8501	int i = 0;
	8502	u_int64_t uptime;
	8503	struct nstat_sysinfo_data ns_data;
	8504	uint32_t memreleased = 0;
	8505	static uint32_t prevmemreleased;
	8506
	8507	uptime = net_uptime();
	8508	lck_mtx_lock(mbuf_mlock);
	8509
	8510	/* Generate an initial report after 1 week of uptime */
	8511	if (!mb_peak_firstreport &&
	8512	uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
	8513	mb_peak_newreport = TRUE;
	8514	mb_peak_firstreport = TRUE;
	8515	}
	8516
	8517	if (!mb_peak_newreport) {
	8518	lck_mtx_unlock(mbuf_mlock);
	8519	return;
	8520	}
	8521
	8522	/*
	8523	* Since a report is being generated before 1 week,
	8524	* we do not need to force another one later
	8525	*/
	8526	if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
	8527	mb_peak_firstreport = TRUE;
	8528	}
	8529
	8530	for (i = 0; i < NELEM(mbuf_table); i++) {
	8531	m_peak(m_class(i)) = m_total(m_class(i));
	8532	memreleased += m_release_cnt(i);
	8533	}
	8534	memreleased = memreleased - prevmemreleased;
	8535	prevmemreleased = memreleased;
	8536	mb_peak_newreport = FALSE;
	8537	lck_mtx_unlock(mbuf_mlock);
	8538
	8539	bzero(&ns_data, sizeof(ns_data));
	8540	ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
	8541	ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
	8542	ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
	8543	ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
	8544	ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
	8545	ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
	8546	ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
	8547	ns_data.u.mb_stats.draincnt = mbstat.m_drain;
	8548	ns_data.u.mb_stats.memreleased = memreleased;
	8549	ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
	8550
	8551	nstat_sysinfo_send_data(&ns_data);
	8552
	8553	/*
	8554	* Reset the floor whenever we report a new
	8555	* peak to track the trend (increase peek usage
	8556	* is not a leak if mbufs get released
	8557	* between reports and the floor stays low)
	8558	*/
	8559	total_sbmb_cnt_floor = total_sbmb_cnt_peak;
	8560	}
	8561
	8562	/*
	8563	* Simple routine to avoid taking the lock when we can't run the
	8564	* mbuf drain.
	8565	*/
	8566	static int
	8567	mbuf_drain_checks(boolean_t ignore_waiters)
	8568	{
	8569	if (mb_drain_maxint == 0) {
	8570	return 0;
	8571	}
	8572	if (!ignore_waiters && mb_waiters != 0) {
	8573	return 0;
	8574	}
	8575
	8576	return 1;
	8577	}
	8578
	8579	/*
	8580	* Called by the VM when there's memory pressure or when we exhausted
	8581	* the 4k/16k reserved space.
	8582	*/
	8583	static void
	8584	mbuf_drain_locked(boolean_t ignore_waiters)
	8585	{
	8586	mbuf_class_t mc;
	8587	mcl_slab_t sp, sp_tmp, *nsp;
	8588	unsigned int num, k, interval, released = 0;
	8589	unsigned long total_mem = 0, use_mem = 0;
	8590	boolean_t ret, purge_caches = FALSE;
	8591	ppnum_t offset;
	8592	mcache_obj_t *obj;
	8593	unsigned long per;
	8594	static unsigned char scratch[32];
	8595	static ppnum_t scratch_pa = 0;
	8596
	8597	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	8598	if (!mbuf_drain_checks(ignore_waiters)) {
	8599	return;
	8600	}
	8601	if (scratch_pa == 0) {
	8602	bzero(scratch, sizeof(scratch));
	8603	scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
	8604	VERIFY(scratch_pa);
	8605	} else if (mclverify) {
	8606	/*
	8607	* Panic if a driver wrote to our scratch memory.
	8608	*/
	8609	for (k = 0; k < sizeof(scratch); k++) {
	8610	if (scratch[k]) {
	8611	panic("suspect DMA to freed address");
	8612	}
	8613	}
	8614	}
	8615	/*
	8616	* Don't free memory too often as that could cause excessive
	8617	* waiting times for mbufs. Purge caches if we were asked to drain
	8618	* in the last 5 minutes.
	8619	*/
	8620	if (mbuf_drain_last_runtime != 0) {
	8621	interval = net_uptime() - mbuf_drain_last_runtime;
	8622	if (interval <= mb_drain_maxint) {
	8623	return;
	8624	}
	8625	if (interval <= mb_drain_maxint * 5) {
	8626	purge_caches = TRUE;
	8627	}
	8628	}
	8629	mbuf_drain_last_runtime = net_uptime();
	8630	/*
	8631	* Don't free any memory if we're using 60% or more.
	8632	*/
	8633	for (mc = 0; mc < NELEM(mbuf_table); mc++) {
	8634	total_mem += m_total(mc) * m_maxsize(mc);
	8635	use_mem += m_active(mc) * m_maxsize(mc);
	8636	}
	8637	per = (use_mem * 100) / total_mem;
	8638	if (per >= 60) {
	8639	return;
	8640	}
	8641	/*
	8642	* Purge all the caches. This effectively disables
	8643	* caching for a few seconds, but the mbuf worker thread will
	8644	* re-enable them again.
	8645	*/
	8646	if (purge_caches == TRUE) {
	8647	for (mc = 0; mc < NELEM(mbuf_table); mc++) {
	8648	if (m_total(mc) < m_avgtotal(mc)) {
	8649	continue;
	8650	}
	8651	lck_mtx_unlock(mbuf_mlock);
	8652	ret = mcache_purge_cache(m_cache(mc), FALSE);
	8653	lck_mtx_lock(mbuf_mlock);
	8654	if (ret == TRUE) {
	8655	m_purge_cnt(mc)++;
	8656	}
	8657	}
	8658	}
	8659	/*
	8660	* Move the objects from the composite class freelist to
	8661	* the rudimentary slabs list, but keep at least 10% of the average
	8662	* total in the freelist.
	8663	*/
	8664	for (mc = 0; mc < NELEM(mbuf_table); mc++) {
	8665	while (m_cobjlist(mc) &&
	8666	m_total(mc) < m_avgtotal(mc) &&
	8667	m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
	8668	obj = m_cobjlist(mc);
	8669	m_cobjlist(mc) = obj->obj_next;
	8670	obj->obj_next = NULL;
	8671	num = cslab_free(mc, obj, 1);
	8672	VERIFY(num == 1);
	8673	m_free_cnt(mc)++;
	8674	m_infree(mc)--;
	8675	/* cslab_free() handles m_total */
	8676	}
	8677	}
	8678	/*
	8679	* Free the buffers present in the slab list up to 10% of the total
	8680	* average per class.
	8681	*
	8682	* We walk the list backwards in an attempt to reduce fragmentation.
	8683	*/
	8684	for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
	8685	TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
	8686	/*
	8687	* Process only unused slabs occupying memory.
	8688	*/
	8689	if (sp->sl_refcnt != 0 \|\| sp->sl_len == 0 \|\|
	8690	sp->sl_base == NULL) {
	8691	continue;
	8692	}
	8693	if (m_total(mc) < m_avgtotal(mc) \|\|
	8694	m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
	8695	break;
	8696	}
	8697	slab_remove(sp, mc);
	8698	switch (mc) {
	8699	case MC_MBUF:
	8700	m_infree(mc) -= NMBPG;
	8701	m_total(mc) -= NMBPG;
	8702	if (mclaudit != NULL) {
	8703	mcl_audit_free(sp->sl_base, NMBPG);
	8704	}
	8705	break;
	8706	case MC_CL:
	8707	m_infree(mc) -= NCLPG;
	8708	m_total(mc) -= NCLPG;
	8709	if (mclaudit != NULL) {
	8710	mcl_audit_free(sp->sl_base, NMBPG);
	8711	}
	8712	break;
	8713	case MC_BIGCL:
	8714	{
	8715	m_infree(mc) -= NBCLPG;
	8716	m_total(mc) -= NBCLPG;
	8717	if (mclaudit != NULL) {
	8718	mcl_audit_free(sp->sl_base, NMBPG);
	8719	}
	8720	break;
	8721	}
	8722	case MC_16KCL:
	8723	m_infree(mc)--;
	8724	m_total(mc)--;
	8725	for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
	8726	nsp = nsp->sl_next;
	8727	VERIFY(nsp->sl_refcnt == 0 &&
	8728	nsp->sl_base != NULL &&
	8729	nsp->sl_len == 0);
	8730	slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
	8731	0);
	8732	nsp->sl_flags = 0;
	8733	}
	8734	if (mclaudit != NULL) {
	8735	if (sp->sl_len == PAGE_SIZE) {
	8736	mcl_audit_free(sp->sl_base,
	8737	NMBPG);
	8738	} else {
	8739	mcl_audit_free(sp->sl_base, 1);
	8740	}
	8741	}
	8742	break;
	8743	default:
	8744	/*
	8745	* The composite classes have their own
	8746	* freelist (m_cobjlist), so we only
	8747	* process rudimentary classes here.
	8748	*/
	8749	VERIFY(0);
	8750	}
	8751	m_release_cnt(mc) += m_size(mc);
	8752	released += m_size(mc);
	8753	VERIFY(sp->sl_base != NULL &&
	8754	sp->sl_len >= PAGE_SIZE);
	8755	offset = MTOPG(sp->sl_base);
	8756	/*
	8757	* Make sure the IOMapper points to a valid, but
	8758	* bogus, address. This should prevent further DMA
	8759	* accesses to freed memory.
	8760	*/
	8761	IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
	8762	mcl_paddr[offset] = 0;
	8763	kmem_free(mb_map, (vm_offset_t)sp->sl_base,
	8764	sp->sl_len);
	8765	slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
	8766	sp->sl_flags = 0;
	8767	}
	8768	}
	8769	mbstat.m_drain++;
	8770	mbstat.m_bigclusters = m_total(MC_BIGCL);
	8771	mbstat.m_clusters = m_total(MC_CL);
	8772	mbstat.m_mbufs = m_total(MC_MBUF);
	8773	mbuf_stat_sync();
	8774	mbuf_mtypes_sync(TRUE);
	8775	}
	8776
	8777	__private_extern__ void
	8778	mbuf_drain(boolean_t ignore_waiters)
	8779	{
	8780	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
	8781	if (!mbuf_drain_checks(ignore_waiters)) {
	8782	return;
	8783	}
	8784	lck_mtx_lock(mbuf_mlock);
	8785	mbuf_drain_locked(ignore_waiters);
	8786	lck_mtx_unlock(mbuf_mlock);
	8787	}
	8788
	8789
	8790	static int
	8791	m_drain_force_sysctl SYSCTL_HANDLER_ARGS
	8792	{
	8793	#pragma unused(arg1, arg2)
	8794	int val = 0, err;
	8795
	8796	err = sysctl_handle_int(oidp, &val, 0, req);
	8797	if (err != 0 \|\| req->newptr == USER_ADDR_NULL) {
	8798	return err;
	8799	}
	8800	if (val) {
	8801	mbuf_drain(TRUE);
	8802	}
	8803
	8804	return err;
	8805	}
	8806
	#if DEBUG || DEVELOPMENT
	/*
	 * Append one record to the mbuf watchdog log buffer (mbwdog_logging).
	 *
	 *   func, line - location of the caller, printed as "func:line".
	 *   fmt, ...   - printf-style message; formatted into a 256-byte
	 *                buffer, so longer messages are truncated.
	 *
	 * Each record starts with a newline followed by the uptime, the
	 * current pid and the permuted current thread address.  Must be called
	 * with mbuf_mlock held (asserted).  The log buffer is allocated on
	 * first use.  When a record does not fit, the older half of the log is
	 * dropped to make room.
	 */
	static void
	_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
	{
		va_list ap;
		struct timeval now;
		char str[384], p[256];
		int len;

		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		if (mbwdog_logging == NULL) {
			/*
			 * This might block under a mutex, which isn't really great,
			 * but this happens once, so we'll live.
			 */
			mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
			    ZALIGN_NONE);
		}
		va_start(ap, fmt);
		vsnprintf(p, sizeof(p), fmt, ap);
		va_end(ap);
		microuptime(&now);
		len = scnprintf(str, sizeof(str),
		    "\n%ld.%d (%d/%llx) %s:%d %s",
		    now.tv_sec, now.tv_usec,
		    current_proc()->p_pid,
		    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
		    func, line, p);
		if (len < 0) {
			return;
		}
		/*
		 * strlcat() needs room for the terminating NUL as well, so the
		 * record fits only if used + len is strictly below the size.
		 */
		if (mbwdog_logging_used + len >= mbwdog_logging_size) {
			/* Drop the older half and slide the remainder to the front. */
			mbwdog_logging_used = mbwdog_logging_used / 2;
			memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
			    mbwdog_logging_size - mbwdog_logging_used);
			mbwdog_logging[mbwdog_logging_used] = 0;
		}
		strlcat(mbwdog_logging, str, mbwdog_logging_size);
		mbwdog_logging_used += len;
	}

	/*
	 * Read-only sysctl handler: copies the first mbwdog_logging_used bytes
	 * of the watchdog log out to the requester.
	 */
	static int
	sysctl_mbwdog_log SYSCTL_HANDLER_ARGS
	{
	#pragma unused(oidp, arg1, arg2)
		return SYSCTL_OUT(req, mbwdog_logging, mbwdog_logging_used);
	}
	SYSCTL_DECL(_kern_ipc);
	SYSCTL_PROC(_kern_ipc, OID_AUTO, mbwdog_log,
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
	    0, 0, sysctl_mbwdog_log, "A", "");

	#endif // DEBUG || DEVELOPMENT
	8860
	8861	static void
	8862	mtracelarge_register(size_t size)
	8863	{
	8864	int i;
	8865	struct mtracelarge *trace;
	8866	uintptr_t bt[MLEAK_STACK_DEPTH];
	8867	unsigned int depth;
	8868
	8869	depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL);
	8870	/* Check if this entry is already on the list. */
	8871	for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
	8872	trace = &mtracelarge_table[i];
	8873	if (trace->size == size && trace->depth == depth &&
	8874	memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
	8875	return;
	8876	}
	8877	}
	8878	for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
	8879	trace = &mtracelarge_table[i];
	8880	if (size > trace->size) {
	8881	trace->depth = depth;
	8882	memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
	8883	trace->size = size;
	8884	break;
	8885	}
	8886	}
	8887	}
	8888
	8889	SYSCTL_DECL(_kern_ipc);
	8890	#if DEBUG \|\| DEVELOPMENT
	8891	#endif
	8892	SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
	8893	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
	8894	0, 0, mbstat_sysctl, "S,mbstat", "");
	8895	SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
	8896	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
	8897	0, 0, mb_stat_sysctl, "S,mb_stat", "");
	8898	SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
	8899	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
	8900	0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
	8901	SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
	8902	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
	8903	0, 0, mleak_table_sysctl, "S,mleak_table", "");
	8904	SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
	8905	CTLFLAG_RW \| CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
	8906	SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
	8907	CTLFLAG_RD \| CTLFLAG_LOCKED, &mb_normalized, 0, "");
	8908	SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
	8909	CTLFLAG_RW \| CTLFLAG_LOCKED, &mb_watchdog, 0, "");
	8910	SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
	8911	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, NULL, 0,
	8912	m_drain_force_sysctl, "I",
	8913	"Forces the mbuf garbage collection to run");
	8914	SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
	8915	CTLFLAG_RW \| CTLFLAG_LOCKED, &mb_drain_maxint, 0,
	8916	"Minimum time interval between garbage collection");
	8917	SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
	8918	CTLFLAG_RW \| CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
	8919	"Percentage of when we trigger memory-pressure for an mbuf-class");