git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2006-2013 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	/*
	30	* Memory allocator with per-CPU caching, derived from the kmem magazine
	31	* concept and implementation as described in the following paper:
	32	* http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
	33	* That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
	34	* reserved. Use is subject to license terms.
	35	*
	36	* There are several major differences between this and the original kmem
	37	* magazine: this derivative implementation allows for multiple objects to
	38	* be allocated and freed from/to the object cache in one call; in addition,
	39	* it provides for better flexibility where the user is allowed to define
	40	* its own slab allocator (instead of the default zone allocator). Finally,
	41	* no object construction/destruction takes place at the moment, although
	42	* this could be added in future to improve efficiency.
	43	*/
	44
	45	#include <sys/param.h>
	46	#include <sys/types.h>
	47	#include <sys/malloc.h>
	48	#include <sys/mbuf.h>
	49	#include <sys/queue.h>
	50	#include <sys/kernel.h>
	51	#include <sys/systm.h>
	52
	53	#include <kern/debug.h>
	54	#include <kern/zalloc.h>
	55	#include <kern/cpu_number.h>
	56	#include <kern/locks.h>
	57
	58	#include <libkern/libkern.h>
	59	#include <libkern/OSAtomic.h>
	60	#include <libkern/OSDebug.h>
	61
	62	#include <mach/vm_param.h>
	63	#include <machine/limits.h>
	64	#include <machine/machine_routines.h>
	65
	66	#include <string.h>
	67
	68	#include <sys/mcache.h>
	69
	/*
	 * Size in bytes of an mcache_t that carries n per-CPU slots, i.e. the
	 * offset of mc_cpu[n] from the start of the structure.
	 */
	#define	MCACHE_SIZE(n) \
		((size_t)(&((mcache_t *)0)->mc_cpu[n]))

	/*
	 * Allocate extra in case we need to manually align the pointer: one
	 * pointer-sized slot plus one cache line of slack on top of the
	 * structure itself.
	 */
	#define	MCACHE_ALLOC_SIZE \
		(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

	/*
	 * Per-CPU slot of cache c for the CPU we are currently running on.
	 * The whole expansion is parenthesized so that the macro can be used
	 * safely inside larger expressions (e.g. MCACHE_CPU(cp)->cc_objs).
	 */
	#define	MCACHE_CPU(c) \
		((mcache_cpu_t *)((void *)((char *)(c) + \
		    MCACHE_SIZE(cpu_number()))))
	79
	/*
	 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
	 * to serialize accesses to the global list of caches in the system.
	 * They also record the thread currently running in the critical
	 * section, so that we can avoid recursive requests to reap the
	 * caches when memory runs low.
	 *
	 * Both are wrapped in do { } while (0) so that each expands to exactly
	 * one statement; a bare { } block followed by the caller's semicolon
	 * would break an unbraced if/else.
	 */
	#define	MCACHE_LIST_LOCK() do {					\
		lck_mtx_lock(mcache_llock);				\
		mcache_llock_owner = current_thread();			\
	} while (0)

	#define	MCACHE_LIST_UNLOCK() do {				\
		mcache_llock_owner = NULL;				\
		lck_mtx_unlock(mcache_llock);				\
	} while (0)

	/* Thin wrappers around the mutex KPI, used for per-CPU locks */
	#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
	#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
	#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)
	100
	101	static int ncpu;
	102	static unsigned int cache_line_size;
	103	static lck_mtx_t *mcache_llock;
	104	static struct thread *mcache_llock_owner;
	105	static lck_attr_t *mcache_llock_attr;
	106	static lck_grp_t *mcache_llock_grp;
	107	static lck_grp_attr_t *mcache_llock_grp_attr;
	108	static struct zone *mcache_zone;
	109	static unsigned int mcache_reap_interval;
	110	static UInt32 mcache_reaping;
	111	static int mcache_ready;
	112	static int mcache_updating;
	113
	114	static int mcache_bkt_contention = 3;
	115	#if DEBUG
	116	static unsigned int mcache_flags = MCF_DEBUG;
	117	#else
	118	static unsigned int mcache_flags = 0;
	119	#endif
	120
	121	#define DUMP_MCA_BUF_SIZE 512
	122	static char *mca_dump_buf;
	123
	124	static mcache_bkttype_t mcache_bkttype[] = {
	125	{ 1, 4096, 32768, NULL },
	126	{ 3, 2048, 16384, NULL },
	127	{ 7, 1024, 12288, NULL },
	128	{ 15, 256, 8192, NULL },
	129	{ 31, 64, 4096, NULL },
	130	{ 47, 0, 2048, NULL },
	131	{ 63, 0, 1024, NULL },
	132	{ 95, 0, 512, NULL },
	133	{ 143, 0, 256, NULL },
	134	{ 165, 0, 0, NULL },
	135	};
	136
	137	static mcache_t mcache_create_common(const char , size_t, size_t,
	138	mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
	139	mcache_notifyfn_t, void *, u_int32_t, int, int);
	140	static unsigned int mcache_slab_alloc(void , mcache_obj_t **,
	141	unsigned int, int);
	142	static void mcache_slab_free(void , mcache_obj_t , boolean_t);
	143	static void mcache_slab_audit(void , mcache_obj_t , boolean_t);
	144	static void mcache_cpu_refill(mcache_cpu_t , mcache_bkt_t , int);
	145	static mcache_bkt_t mcache_bkt_alloc(mcache_t , mcache_bktlist_t *,
	146	mcache_bkttype_t **);
	147	static void mcache_bkt_free(mcache_t , mcache_bktlist_t , mcache_bkt_t *);
	148	static void mcache_cache_bkt_enable(mcache_t *);
	149	static void mcache_bkt_purge(mcache_t *);
	150	static void mcache_bkt_destroy(mcache_t , mcache_bkttype_t ,
	151	mcache_bkt_t *, int);
	152	static void mcache_bkt_ws_update(mcache_t *);
	153	static void mcache_bkt_ws_reap(mcache_t *);
	154	static void mcache_dispatch(void ()(void ), void *);
	155	static void mcache_cache_reap(mcache_t *);
	156	static void mcache_cache_update(mcache_t *);
	157	static void mcache_cache_bkt_resize(void *);
	158	static void mcache_cache_enable(void *);
	159	static void mcache_update(void *);
	160	static void mcache_update_timeout(void *);
	161	static void mcache_applyall(void ()(mcache_t ));
	162	static void mcache_reap_start(void *);
	163	static void mcache_reap_done(void *);
	164	static void mcache_reap_timeout(void *);
	165	static void mcache_notify(mcache_t *, u_int32_t);
	166	static void mcache_purge(void *);
	167
	168	static LIST_HEAD(, mcache) mcache_head;
	169	mcache_t *mcache_audit_cache;
	170
	171	/*
	172	* Initialize the framework; this is currently called as part of BSD init.
	173	*/
	174	__private_extern__ void
	175	mcache_init(void)
	176	{
	177	mcache_bkttype_t *btp;
	178	unsigned int i;
	179	char name[32];
	180
	181	ncpu = ml_get_max_cpus();
	182	(void) mcache_cache_line_size(); /* prime it */
	183
	184	mcache_llock_grp_attr = lck_grp_attr_alloc_init();
	185	mcache_llock_grp = lck_grp_alloc_init("mcache.list",
	186	mcache_llock_grp_attr);
	187	mcache_llock_attr = lck_attr_alloc_init();
	188	mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
	189
	190	mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
	191	PAGE_SIZE, "mcache");
	192	if (mcache_zone == NULL)
	193	panic("mcache_init: failed to allocate mcache zone\n");
	194	zone_change(mcache_zone, Z_CALLERACCT, FALSE);
	195
	196	LIST_INIT(&mcache_head);
	197
	198	for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
	199	btp = &mcache_bkttype[i];
	200	(void) snprintf(name, sizeof (name), "bkt_%d",
	201	btp->bt_bktsize);
	202	btp->bt_cache = mcache_create(name,
	203	(btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
	204	}
	205
	206	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof (mcache_flags));
	207	mcache_flags &= MCF_FLAGS_MASK;
	208
	209	mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
	210	0, 0, MCR_SLEEP);
	211
	212	mcache_reap_interval = 15 * hz;
	213	mcache_applyall(mcache_cache_bkt_enable);
	214	mcache_ready = 1;
	215
	216	printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
	217	ncpu, CPU_CACHE_LINE_SIZE);
	218	}
	219
	220	/*
	221	* Return the global mcache flags.
	222	*/
	223	__private_extern__ unsigned int
	224	mcache_getflags(void)
	225	{
	226	return (mcache_flags);
	227	}
	228
	229	/*
	230	* Return the CPU cache line size.
	231	*/
	232	__private_extern__ unsigned int
	233	mcache_cache_line_size(void)
	234	{
	235	if (cache_line_size == 0) {
	236	ml_cpu_info_t cpu_info;
	237	ml_cpu_get_info(&cpu_info);
	238	cache_line_size = cpu_info.cache_line_size;
	239	}
	240	return (cache_line_size);
	241	}
	242
	243	/*
	244	* Create a cache using the zone allocator as the backend slab allocator.
	245	* The caller may specify any alignment for the object; if it specifies 0
	246	* the default alignment (MCACHE_ALIGN) will be used.
	247	*/
	248	__private_extern__ mcache_t *
	249	mcache_create(const char *name, size_t bufsize, size_t align,
	250	u_int32_t flags, int wait)
	251	{
	252	return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
	253	mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
	254	wait));
	255	}
	256
	257	/*
	258	* Create a cache using a custom backend slab allocator. Since the caller
	259	* is responsible for allocation, no alignment guarantee will be provided
	260	* by this framework.
	261	*/
	262	__private_extern__ mcache_t *
	263	mcache_create_ext(const char *name, size_t bufsize,
	264	mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
	265	mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
	266	u_int32_t flags, int wait)
	267	{
	268	return (mcache_create_common(name, bufsize, 0, allocfn,
	269	freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
	270	}
	271
	272	/*
	273	* Common cache creation routine.
	274	*/
	275	static mcache_t *
	276	mcache_create_common(const char *name, size_t bufsize, size_t align,
	277	mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
	278	mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
	279	u_int32_t flags, int need_zone, int wait)
	280	{
	281	mcache_bkttype_t *btp;
	282	mcache_t *cp = NULL;
	283	size_t chunksize;
	284	void buf, *pbuf;
	285	int c;
	286	char lck_name[64];
	287
	288	/* If auditing is on and print buffer is NULL, allocate it now */
	289	if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
	290	int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
	291	MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
	292	malloc_wait \| M_ZERO);
	293	if (mca_dump_buf == NULL)
	294	return (NULL);
	295	}
	296
	297	if (!(wait & MCR_NOSLEEP))
	298	buf = zalloc(mcache_zone);
	299	else
	300	buf = zalloc_noblock(mcache_zone);
	301
	302	if (buf == NULL)
	303	goto fail;
	304
	305	bzero(buf, MCACHE_ALLOC_SIZE);
	306
	307	/*
	308	* In case we didn't get a cache-aligned memory, round it up
	309	* accordingly. This is needed in order to get the rest of
	310	* structure members aligned properly. It also means that
	311	* the memory span gets shifted due to the round up, but it
	312	* is okay since we've allocated extra space for this.
	313	*/
	314	cp = (mcache_t *)
	315	P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_LINE_SIZE);
	316	pbuf = (void *)((intptr_t)cp - sizeof (void ));
	317	*pbuf = buf;
	318
	319	/*
	320	* Guaranteed alignment is valid only when we use the internal
	321	* slab allocator (currently set to use the zone allocator).
	322	*/
	323	if (!need_zone)
	324	align = 1;
	325	else if (align == 0)
	326	align = MCACHE_ALIGN;
	327
	328	if ((align & (align - 1)) != 0)
	329	panic("mcache_create: bad alignment %lu", align);
	330
	331	cp->mc_align = align;
	332	cp->mc_slab_alloc = allocfn;
	333	cp->mc_slab_free = freefn;
	334	cp->mc_slab_audit = auditfn;
	335	cp->mc_slab_log = logfn;
	336	cp->mc_slab_notify = notifyfn;
	337	cp->mc_private = need_zone ? cp : arg;
	338	cp->mc_bufsize = bufsize;
	339	cp->mc_flags = (flags & MCF_FLAGS_MASK) \| mcache_flags;
	340
	341	(void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);
	342
	343	(void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
	344	cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
	345	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
	346	cp->mc_cpu_lock_grp_attr);
	347	cp->mc_cpu_lock_attr = lck_attr_alloc_init();
	348
	349	/*
	350	* Allocation chunk size is the object's size plus any extra size
	351	* needed to satisfy the object's alignment. It is enforced to be
	352	* at least the size of an LP64 pointer to simplify auditing and to
	353	* handle multiple-element allocation requests, where the elements
	354	* returned are linked together in a list.
	355	*/
	356	chunksize = MAX(bufsize, sizeof (u_int64_t));
	357	if (need_zone) {
	358	/* Enforce 64-bit minimum alignment for zone-based buffers */
	359	align = MAX(align, sizeof (u_int64_t));
	360	chunksize += sizeof (void *) + align;
	361	chunksize = P2ROUNDUP(chunksize, align);
	362	if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
	363	PAGE_SIZE, cp->mc_name)) == NULL)
	364	goto fail;
	365	zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
	366	}
	367	cp->mc_chunksize = chunksize;
	368
	369	/*
	370	* Initialize the bucket layer.
	371	*/
	372	(void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
	373	cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
	374	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	375	cp->mc_bkt_lock_grp_attr);
	376	cp->mc_bkt_lock_attr = lck_attr_alloc_init();
	377	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
	378	cp->mc_bkt_lock_attr);
	379
	380	(void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
	381	cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
	382	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	383	cp->mc_sync_lock_grp_attr);
	384	cp->mc_sync_lock_attr = lck_attr_alloc_init();
	385	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
	386	cp->mc_sync_lock_attr);
	387
	388	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
	389	continue;
	390
	391	cp->cache_bkttype = btp;
	392
	393	/*
	394	* Initialize the CPU layer. Each per-CPU structure is aligned
	395	* on the CPU cache line boundary to prevent false sharing.
	396	*/
	397	for (c = 0; c < ncpu; c++) {
	398	mcache_cpu_t *ccp = &cp->mc_cpu[c];
	399
	400	VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
	401	lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
	402	cp->mc_cpu_lock_attr);
	403	ccp->cc_objs = -1;
	404	ccp->cc_pobjs = -1;
	405	}
	406
	407	if (mcache_ready)
	408	mcache_cache_bkt_enable(cp);
	409
	410	/* TODO: dynamically create sysctl for stats */
	411
	412	MCACHE_LIST_LOCK();
	413	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	414	MCACHE_LIST_UNLOCK();
	415
	416	/*
	417	* If cache buckets are enabled and this is the first cache
	418	* created, start the periodic cache update.
	419	*/
	420	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
	421	mcache_updating = 1;
	422	mcache_update_timeout(NULL);
	423	}
	424	if (cp->mc_flags & MCF_DEBUG) {
	425	printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
	426	"chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
	427	arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	428	}
	429	return (cp);
	430
	431	fail:
	432	if (buf != NULL)
	433	zfree(mcache_zone, buf);
	434	return (NULL);
	435	}
	436
	437	/*
	438	* Allocate one or more objects from a cache.
	439	*/
	440	__private_extern__ unsigned int
	441	mcache_alloc_ext(mcache_t cp, mcache_obj_t *list, unsigned int num, int wait)
	442	{
	443	mcache_cpu_t *ccp;
	444	mcache_obj_t *top = &(list);
	445	mcache_bkt_t *bkt;
	446	unsigned int need = num;
	447	boolean_t nwretry = FALSE;
	448
	449	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	450	VERIFY((wait & (MCR_NOSLEEP\|MCR_FAILOK)) != (MCR_NOSLEEP\|MCR_FAILOK));
	451
	452	ASSERT(list != NULL);
	453	*list = NULL;
	454
	455	if (num == 0)
	456	return (0);
	457
	458	retry_alloc:
	459	/* We may not always be running in the same CPU in case of retries */
	460	ccp = MCACHE_CPU(cp);
	461
	462	MCACHE_LOCK(&ccp->cc_lock);
	463	for (;;) {
	464	/*
	465	* If we have an object in the current CPU's filled bucket,
	466	* chain the object to any previous objects and return if
	467	* we've satisfied the number of requested objects.
	468	*/
	469	if (ccp->cc_objs > 0) {
	470	mcache_obj_t *tail;
	471	int objs;
	472
	473	/*
	474	* Objects in the bucket are already linked together
	475	* with the most recently freed object at the head of
	476	* the list; grab as many objects as we can.
	477	*/
	478	objs = MIN((unsigned int)ccp->cc_objs, need);
	479	*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
	480	ccp->cc_objs -= objs;
	481	ccp->cc_alloc += objs;
	482
	483	tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
	484	list = &tail->obj_next;
	485	*list = NULL;
	486
	487	/* If we got them all, return to caller */
	488	if ((need -= objs) == 0) {
	489	MCACHE_UNLOCK(&ccp->cc_lock);
	490
	491	if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
	492	cp->mc_slab_log != NULL)
	493	(cp->mc_slab_log)(num, top, TRUE);
	494
	495	if (cp->mc_flags & MCF_DEBUG)
	496	goto debug_alloc;
	497
	498	return (num);
	499	}
	500	}
	501
	502	/*
	503	* The CPU's filled bucket is empty. If the previous filled
	504	* bucket was full, exchange and try again.
	505	*/
	506	if (ccp->cc_pobjs > 0) {
	507	mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
	508	continue;
	509	}
	510
	511	/*
	512	* If the bucket layer is disabled, allocate from slab. This
	513	* can happen either because MCF_NOCPUCACHE is set, or because
	514	* the bucket layer is currently being resized.
	515	*/
	516	if (ccp->cc_bktsize == 0)
	517	break;
	518
	519	/*
	520	* Both of the CPU's buckets are empty; try to get a full
	521	* bucket from the bucket layer. Upon success, refill this
	522	* CPU and place any empty bucket into the empty list.
	523	*/
	524	bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
	525	if (bkt != NULL) {
	526	if (ccp->cc_pfilled != NULL)
	527	mcache_bkt_free(cp, &cp->mc_empty,
	528	ccp->cc_pfilled);
	529	mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
	530	continue;
	531	}
	532
	533	/*
	534	* The bucket layer has no full buckets; allocate the
	535	* object(s) directly from the slab layer.
	536	*/
	537	break;
	538	}
	539	MCACHE_UNLOCK(&ccp->cc_lock);
	540
	541	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
	542
	543	/*
	544	* If this is a blocking allocation, or if it is non-blocking and
	545	* the cache's full bucket is non-empty, then retry the allocation.
	546	*/
	547	if (need > 0) {
	548	if (!(wait & MCR_NONBLOCKING)) {
	549	atomic_add_32(&cp->mc_wretry_cnt, 1);
	550	goto retry_alloc;
	551	} else if ((wait & (MCR_NOSLEEP \| MCR_TRYHARD)) &&
	552	!mcache_bkt_isempty(cp)) {
	553	if (!nwretry)
	554	nwretry = TRUE;
	555	atomic_add_32(&cp->mc_nwretry_cnt, 1);
	556	goto retry_alloc;
	557	} else if (nwretry) {
	558	atomic_add_32(&cp->mc_nwfail_cnt, 1);
	559	}
	560	}
	561
	562	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
	563	(cp->mc_slab_log)((num - need), top, TRUE);
	564
	565	if (!(cp->mc_flags & MCF_DEBUG))
	566	return (num - need);
	567
	568	debug_alloc:
	569	if (cp->mc_flags & MCF_DEBUG) {
	570	mcache_obj_t **o = top;
	571	unsigned int n;
	572
	573	n = 0;
	574	/*
	575	* Verify that the chain of objects have the same count as
	576	* what we are about to report to the caller. Any mismatch
	577	* here means that the object list is insanely broken and
	578	* therefore we must panic.
	579	*/
	580	while (*o != NULL) {
	581	o = &(*o)->obj_next;
	582	++n;
	583	}
	584	if (n != (num - need)) {
	585	panic("mcache_alloc_ext: %s cp %p corrupted list "
	586	"(got %d actual %d)\n", cp->mc_name,
	587	(void *)cp, num - need, n);
	588	}
	589	}
	590
	591	/* Invoke the slab layer audit callback if auditing is enabled */
	592	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
	593	(cp->mc_slab_audit)(cp->mc_private, top, TRUE);
	594
	595	return (num - need);
	596	}
	597
	598	/*
	599	* Allocate a single object from a cache.
	600	*/
	601	__private_extern__ void *
	602	mcache_alloc(mcache_t *cp, int wait)
	603	{
	604	mcache_obj_t *buf;
	605
	606	(void) mcache_alloc_ext(cp, &buf, 1, wait);
	607	return (buf);
	608	}
	609
	610	__private_extern__ void
	611	mcache_waiter_inc(mcache_t *cp)
	612	{
	613	atomic_add_32(&cp->mc_waiter_cnt, 1);
	614	}
	615
	616	__private_extern__ void
	617	mcache_waiter_dec(mcache_t *cp)
	618	{
	619	atomic_add_32(&cp->mc_waiter_cnt, -1);
	620	}
	621
	622	__private_extern__ boolean_t
	623	mcache_bkt_isempty(mcache_t *cp)
	624	{
	625	/*
	626	* This isn't meant to accurately tell whether there are
	627	* any full buckets in the cache; it is simply a way to
	628	* obtain "hints" about the state of the cache.
	629	*/
	630	return (cp->mc_full.bl_total == 0);
	631	}
	632
	633	/*
	634	* Notify the slab layer about an event.
	635	*/
	636	static void
	637	mcache_notify(mcache_t *cp, u_int32_t event)
	638	{
	639	if (cp->mc_slab_notify != NULL)
	640	(*cp->mc_slab_notify)(cp->mc_private, event);
	641	}
	642
	643	/*
	644	* Purge the cache and disable its buckets.
	645	*/
	646	static void
	647	mcache_purge(void *arg)
	648	{
	649	mcache_t *cp = arg;
	650
	651	mcache_bkt_purge(cp);
	652	/*
	653	* We cannot simply call mcache_cache_bkt_enable() from here as
	654	* a bucket resize may be in flight and we would cause the CPU
	655	* layers of the cache to point to different sizes. Therefore,
	656	* we simply increment the enable count so that during the next
	657	* periodic cache update the buckets can be reenabled.
	658	*/
	659	lck_mtx_lock_spin(&cp->mc_sync_lock);
	660	cp->mc_enable_cnt++;
	661	lck_mtx_unlock(&cp->mc_sync_lock);
	662
	663	}
	664
	665	__private_extern__ boolean_t
	666	mcache_purge_cache(mcache_t *cp)
	667	{
	668	/*
	669	* Purging a cache that has no per-CPU caches or is already
	670	* in the process of being purged is rather pointless.
	671	*/
	672	if (cp->mc_flags & MCF_NOCPUCACHE)
	673	return (FALSE);
	674
	675	lck_mtx_lock_spin(&cp->mc_sync_lock);
	676	if (cp->mc_purge_cnt > 0) {
	677	lck_mtx_unlock(&cp->mc_sync_lock);
	678	return (FALSE);
	679	}
	680	cp->mc_purge_cnt++;
	681	lck_mtx_unlock(&cp->mc_sync_lock);
	682
	683	mcache_dispatch(mcache_purge, cp);
	684
	685	return (TRUE);
	686	}
	687
	688	/*
	689	* Free a single object to a cache.
	690	*/
	691	__private_extern__ void
	692	mcache_free(mcache_t cp, void buf)
	693	{
	694	((mcache_obj_t *)buf)->obj_next = NULL;
	695	mcache_free_ext(cp, (mcache_obj_t *)buf);
	696	}
	697
	698	/*
	699	* Free one or more objects to a cache.
	700	*/
	701	__private_extern__ void
	702	mcache_free_ext(mcache_t cp, mcache_obj_t list)
	703	{
	704	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	705	mcache_bkttype_t *btp;
	706	mcache_obj_t *nlist;
	707	mcache_bkt_t *bkt;
	708
	709	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
	710	(*cp->mc_slab_log)(0, list, FALSE);
	711
	712	/* Invoke the slab layer audit callback if auditing is enabled */
	713	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
	714	(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
	715
	716	MCACHE_LOCK(&ccp->cc_lock);
	717	for (;;) {
	718	/*
	719	* If there is space in the current CPU's filled bucket, put
	720	* the object there and return once all objects are freed.
	721	* Note the cast to unsigned integer takes care of the case
	722	* where the bucket layer is disabled (when cc_objs is -1).
	723	*/
	724	if ((unsigned int)ccp->cc_objs <
	725	(unsigned int)ccp->cc_bktsize) {
	726	/*
	727	* Reverse the list while we place the object into the
	728	* bucket; this effectively causes the most recently
	729	* freed object(s) to be reused during allocation.
	730	*/
	731	nlist = list->obj_next;
	732	list->obj_next = (ccp->cc_objs == 0) ? NULL :
	733	ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
	734	ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
	735	ccp->cc_free++;
	736
	737	if ((list = nlist) != NULL)
	738	continue;
	739
	740	/* We are done; return to caller */
	741	MCACHE_UNLOCK(&ccp->cc_lock);
	742
	743	/* If there is a waiter below, notify it */
	744	if (cp->mc_waiter_cnt > 0)
	745	mcache_notify(cp, MCN_RETRYALLOC);
	746	return;
	747	}
	748
	749	/*
	750	* The CPU's filled bucket is full. If the previous filled
	751	* bucket was empty, exchange and try again.
	752	*/
	753	if (ccp->cc_pobjs == 0) {
	754	mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
	755	continue;
	756	}
	757
	758	/*
	759	* If the bucket layer is disabled, free to slab. This can
	760	* happen either because MCF_NOCPUCACHE is set, or because
	761	* the bucket layer is currently being resized.
	762	*/
	763	if (ccp->cc_bktsize == 0)
	764	break;
	765
	766	/*
	767	* Both of the CPU's buckets are full; try to get an empty
	768	* bucket from the bucket layer. Upon success, empty this
	769	* CPU and place any full bucket into the full list.
	770	*/
	771	bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
	772	if (bkt != NULL) {
	773	if (ccp->cc_pfilled != NULL)
	774	mcache_bkt_free(cp, &cp->mc_full,
	775	ccp->cc_pfilled);
	776	mcache_cpu_refill(ccp, bkt, 0);
	777	continue;
	778	}
	779
	780	/*
	781	* We need an empty bucket to put our freed objects into
	782	* but couldn't get an empty bucket from the bucket layer;
	783	* attempt to allocate one. We do not want to block for
	784	* allocation here, and if the bucket allocation fails
	785	* we will simply fall through to the slab layer.
	786	*/
	787	MCACHE_UNLOCK(&ccp->cc_lock);
	788	bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
	789	MCACHE_LOCK(&ccp->cc_lock);
	790
	791	if (bkt != NULL) {
	792	/*
	793	* We have an empty bucket, but since we drop the
	794	* CPU lock above, the cache's bucket size may have
	795	* changed. If so, free the bucket and try again.
	796	*/
	797	if (ccp->cc_bktsize != btp->bt_bktsize) {
	798	MCACHE_UNLOCK(&ccp->cc_lock);
	799	mcache_free(btp->bt_cache, bkt);
	800	MCACHE_LOCK(&ccp->cc_lock);
	801	continue;
	802	}
	803
	804	/*
	805	* We have an empty bucket of the right size;
	806	* add it to the bucket layer and try again.
	807	*/
	808	mcache_bkt_free(cp, &cp->mc_empty, bkt);
	809	continue;
	810	}
	811
	812	/*
	813	* The bucket layer has no empty buckets; free the
	814	* object(s) directly to the slab layer.
	815	*/
	816	break;
	817	}
	818	MCACHE_UNLOCK(&ccp->cc_lock);
	819
	820	/* If there is a waiter below, notify it */
	821	if (cp->mc_waiter_cnt > 0)
	822	mcache_notify(cp, MCN_RETRYALLOC);
	823
	824	/* Advise the slab layer to purge the object(s) */
	825	(*cp->mc_slab_free)(cp->mc_private, list,
	826	(cp->mc_flags & MCF_DEBUG) \|\| cp->mc_purge_cnt);
	827	}
	828
	829	/*
	830	* Cache destruction routine.
	831	*/
	832	__private_extern__ void
	833	mcache_destroy(mcache_t *cp)
	834	{
	835	void **pbuf;
	836
	837	MCACHE_LIST_LOCK();
	838	LIST_REMOVE(cp, mc_list);
	839	MCACHE_LIST_UNLOCK();
	840
	841	mcache_bkt_purge(cp);
	842
	843	/*
	844	* This cache is dead; there should be no further transaction.
	845	* If it's still invoked, make sure that it induces a fault.
	846	*/
	847	cp->mc_slab_alloc = NULL;
	848	cp->mc_slab_free = NULL;
	849	cp->mc_slab_audit = NULL;
	850
	851	lck_attr_free(cp->mc_bkt_lock_attr);
	852	lck_grp_free(cp->mc_bkt_lock_grp);
	853	lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
	854
	855	lck_attr_free(cp->mc_cpu_lock_attr);
	856	lck_grp_free(cp->mc_cpu_lock_grp);
	857	lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
	858
	859	lck_attr_free(cp->mc_sync_lock_attr);
	860	lck_grp_free(cp->mc_sync_lock_grp);
	861	lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
	862
	863	/*
	864	* TODO: We need to destroy the zone here, but cannot do it
	865	* because there is no such way to achieve that. Until then
	866	* the memory allocated for the zone structure is leaked.
	867	* Once it is achievable, uncomment these lines:
	868	*
	869	* if (cp->mc_slab_zone != NULL) {
	870	* zdestroy(cp->mc_slab_zone);
	871	* cp->mc_slab_zone = NULL;
	872	* }
	873	*/
	874
	875	/* Get the original address since we're about to free it */
	876	pbuf = (void *)((intptr_t)cp - sizeof (void ));
	877
	878	zfree(mcache_zone, *pbuf);
	879	}
	880
	881	/*
	882	* Internal slab allocator used as a backend for simple caches. The current
	883	* implementation uses the zone allocator for simplicity reasons.
	884	*/
	885	static unsigned int
	886	mcache_slab_alloc(void arg, mcache_obj_t **plist, unsigned int num, int wait)
	887	{
	888	mcache_t *cp = arg;
	889	unsigned int need = num;
	890	size_t offset = 0;
	891	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	892	u_int32_t flags = cp->mc_flags;
	893	void buf, base, **pbuf;
	894	mcache_obj_t *list = plist;
	895
	896	*list = NULL;
	897
	898	/*
	899	* The address of the object returned to the caller is an
	900	* offset from the 64-bit aligned base address only if the
	901	* cache's alignment requirement is neither 1 nor 8 bytes.
	902	*/
	903	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
	904	offset = cp->mc_align;
	905
	906	for (;;) {
	907	if (!(wait & MCR_NOSLEEP))
	908	buf = zalloc(cp->mc_slab_zone);
	909	else
	910	buf = zalloc_noblock(cp->mc_slab_zone);
	911
	912	if (buf == NULL)
	913	break;
	914
	915	/* Get the 64-bit aligned base address for this object */
	916	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
	917	sizeof (u_int64_t));
	918
	919	/*
	920	* Wind back a pointer size from the aligned base and
	921	* save the original address so we can free it later.
	922	*/
	923	pbuf = (void *)((intptr_t)base - sizeof (void ));
	924	*pbuf = buf;
	925
	926	/*
	927	* If auditing is enabled, patternize the contents of
	928	* the buffer starting from the 64-bit aligned base to
	929	* the end of the buffer; the length is rounded up to
	930	* the nearest 64-bit multiply; this is because we use
	931	* 64-bit memory access to set/check the pattern.
	932	*/
	933	if (flags & MCF_DEBUG) {
	934	VERIFY(((intptr_t)base + rsize) <=
	935	((intptr_t)buf + cp->mc_chunksize));
	936	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
	937	}
	938
	939	/*
	940	* Fix up the object's address to fulfill the cache's
	941	* alignment requirement (if needed) and return this
	942	* to the caller.
	943	*/
	944	VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
	945	((intptr_t)buf + cp->mc_chunksize));
	946	list = (mcache_obj_t )((intptr_t)base + offset);
	947
	948	(*list)->obj_next = NULL;
	949	list = plist = &(list)->obj_next;
	950
	951	/* If we got them all, return to mcache */
	952	if (--need == 0)
	953	break;
	954	}
	955
	956	return (num - need);
	957	}
	958
	959	/*
	960	* Internal slab deallocator used as a backend for simple caches.
	961	*/
	962	static void
	963	mcache_slab_free(void arg, mcache_obj_t list, __unused boolean_t purged)
	964	{
	965	mcache_t *cp = arg;
	966	mcache_obj_t *nlist;
	967	size_t offset = 0;
	968	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	969	u_int32_t flags = cp->mc_flags;
	970	void *base;
	971	void **pbuf;
	972
	973	/*
	974	* The address of the object is an offset from a 64-bit
	975	* aligned base address only if the cache's alignment
	976	* requirement is neither 1 nor 8 bytes.
	977	*/
	978	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
	979	offset = cp->mc_align;
	980
	981	for (;;) {
	982	nlist = list->obj_next;
	983	list->obj_next = NULL;
	984
	985	/* Get the 64-bit aligned base address of this object */
	986	base = (void *)((intptr_t)list - offset);
	987	VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
	988
	989	/* Get the original address since we're about to free it */
	990	pbuf = (void *)((intptr_t)base - sizeof (void ));
	991
	992	if (flags & MCF_DEBUG) {
	993	VERIFY(((intptr_t)base + rsize) <=
	994	((intptr_t)*pbuf + cp->mc_chunksize));
	995	mcache_audit_free_verify(NULL, base, offset, rsize);
	996	}
	997
	998	/* Free it to zone */
	999	VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
	1000	((intptr_t)*pbuf + cp->mc_chunksize));
	1001	zfree(cp->mc_slab_zone, *pbuf);
	1002
	1003	/* No more objects to free; return to mcache */
	1004	if ((list = nlist) == NULL)
	1005	break;
	1006	}
	1007	}
	1008
	1009	/*
	1010	* Internal slab auditor for simple caches.
	1011	*/
	1012	static void
	1013	mcache_slab_audit(void arg, mcache_obj_t list, boolean_t alloc)
	1014	{
	1015	mcache_t *cp = arg;
	1016	size_t offset = 0;
	1017	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	1018	void base, *pbuf;
	1019
	1020	/*
	1021	* The address of the object returned to the caller is an
	1022	* offset from the 64-bit aligned base address only if the
	1023	* cache's alignment requirement is neither 1 nor 8 bytes.
	1024	*/
	1025	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
	1026	offset = cp->mc_align;
	1027
	1028	while (list != NULL) {
	1029	mcache_obj_t *next = list->obj_next;
	1030
	1031	/* Get the 64-bit aligned base address of this object */
	1032	base = (void *)((intptr_t)list - offset);
	1033	VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
	1034
	1035	/* Get the original address */
	1036	pbuf = (void *)((intptr_t)base - sizeof (void ));
	1037
	1038	VERIFY(((intptr_t)base + rsize) <=
	1039	((intptr_t)*pbuf + cp->mc_chunksize));
	1040
	1041	if (!alloc)
	1042	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
	1043	else
	1044	mcache_audit_free_verify_set(NULL, base, offset, rsize);
	1045
	1046	list = list->obj_next = next;
	1047	}
	1048	}
	1049
	1050	/*
	1051	* Refill the CPU's filled bucket with bkt and save the previous one.
	1052	*/
	1053	static void
	1054	mcache_cpu_refill(mcache_cpu_t ccp, mcache_bkt_t bkt, int objs)
	1055	{
	1056	ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) \|\|
	1057	(ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
	1058	ASSERT(ccp->cc_bktsize > 0);
	1059
	1060	ccp->cc_pfilled = ccp->cc_filled;
	1061	ccp->cc_pobjs = ccp->cc_objs;
	1062	ccp->cc_filled = bkt;
	1063	ccp->cc_objs = objs;
	1064	}
	1065
	1066	/*
	1067	* Allocate a bucket from the bucket layer.
	1068	*/
	1069	static mcache_bkt_t *
	1070	mcache_bkt_alloc(mcache_t cp, mcache_bktlist_t blp, mcache_bkttype_t **btp)
	1071	{
	1072	mcache_bkt_t *bkt;
	1073
	1074	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
	1075	/*
	1076	* The bucket layer lock is held by another CPU; increase
	1077	* the contention count so that we can later resize the
	1078	* bucket size accordingly.
	1079	*/
	1080	MCACHE_LOCK(&cp->mc_bkt_lock);
	1081	cp->mc_bkt_contention++;
	1082	}
	1083
	1084	if ((bkt = blp->bl_list) != NULL) {
	1085	blp->bl_list = bkt->bkt_next;
	1086	if (--blp->bl_total < blp->bl_min)
	1087	blp->bl_min = blp->bl_total;
	1088	blp->bl_alloc++;
	1089	}
	1090
	1091	if (btp != NULL)
	1092	*btp = cp->cache_bkttype;
	1093
	1094	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1095
	1096	return (bkt);
	1097	}
	1098
	1099	/*
	1100	* Free a bucket to the bucket layer.
	1101	*/
	1102	static void
	1103	mcache_bkt_free(mcache_t cp, mcache_bktlist_t blp, mcache_bkt_t *bkt)
	1104	{
	1105	MCACHE_LOCK(&cp->mc_bkt_lock);
	1106
	1107	bkt->bkt_next = blp->bl_list;
	1108	blp->bl_list = bkt;
	1109	blp->bl_total++;
	1110
	1111	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1112	}
	1113
	1114	/*
	1115	* Enable the bucket layer of a cache.
	1116	*/
	1117	static void
	1118	mcache_cache_bkt_enable(mcache_t *cp)
	1119	{
	1120	mcache_cpu_t *ccp;
	1121	int cpu;
	1122
	1123	if (cp->mc_flags & MCF_NOCPUCACHE)
	1124	return;
	1125
	1126	for (cpu = 0; cpu < ncpu; cpu++) {
	1127	ccp = &cp->mc_cpu[cpu];
	1128	MCACHE_LOCK(&ccp->cc_lock);
	1129	ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
	1130	MCACHE_UNLOCK(&ccp->cc_lock);
	1131	}
	1132	}
	1133
	1134	/*
	1135	* Purge all buckets from a cache and disable its bucket layer.
	1136	*/
	1137	static void
	1138	mcache_bkt_purge(mcache_t *cp)
	1139	{
	1140	mcache_cpu_t *ccp;
	1141	mcache_bkt_t bp, pbp;
	1142	mcache_bkttype_t *btp;
	1143	int cpu, objs, pobjs;
	1144
	1145	for (cpu = 0; cpu < ncpu; cpu++) {
	1146	ccp = &cp->mc_cpu[cpu];
	1147
	1148	MCACHE_LOCK(&ccp->cc_lock);
	1149
	1150	btp = cp->cache_bkttype;
	1151	bp = ccp->cc_filled;
	1152	pbp = ccp->cc_pfilled;
	1153	objs = ccp->cc_objs;
	1154	pobjs = ccp->cc_pobjs;
	1155	ccp->cc_filled = NULL;
	1156	ccp->cc_pfilled = NULL;
	1157	ccp->cc_objs = -1;
	1158	ccp->cc_pobjs = -1;
	1159	ccp->cc_bktsize = 0;
	1160
	1161	MCACHE_UNLOCK(&ccp->cc_lock);
	1162
	1163	if (bp != NULL)
	1164	mcache_bkt_destroy(cp, btp, bp, objs);
	1165	if (pbp != NULL)
	1166	mcache_bkt_destroy(cp, btp, pbp, pobjs);
	1167	}
	1168
	1169	/*
	1170	* Updating the working set back to back essentially sets
	1171	* the working set size to zero, so everything is reapable.
	1172	*/
	1173	mcache_bkt_ws_update(cp);
	1174	mcache_bkt_ws_update(cp);
	1175
	1176	mcache_bkt_ws_reap(cp);
	1177	}
	1178
	1179	/*
	1180	* Free one or more objects in the bucket to the slab layer,
	1181	* and also free the bucket itself.
	1182	*/
	1183	static void
	1184	mcache_bkt_destroy(mcache_t cp, mcache_bkttype_t btp, mcache_bkt_t *bkt,
	1185	int nobjs)
	1186	{
	1187	if (nobjs > 0) {
	1188	mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
	1189
	1190	if (cp->mc_flags & MCF_DEBUG) {
	1191	mcache_obj_t *o = top;
	1192	int cnt = 0;
	1193
	1194	/*
	1195	* Verify that the chain of objects in the bucket is
	1196	* valid. Any mismatch here means a mistake when the
	1197	* object(s) were freed to the CPU layer, so we panic.
	1198	*/
	1199	while (o != NULL) {
	1200	o = o->obj_next;
	1201	++cnt;
	1202	}
	1203	if (cnt != nobjs) {
	1204	panic("mcache_bkt_destroy: %s cp %p corrupted "
	1205	"list in bkt %p (nobjs %d actual %d)\n",
	1206	cp->mc_name, (void )cp, (void )bkt,
	1207	nobjs, cnt);
	1208	}
	1209	}
	1210
	1211	/* Advise the slab layer to purge the object(s) */
	1212	(*cp->mc_slab_free)(cp->mc_private, top,
	1213	(cp->mc_flags & MCF_DEBUG) \|\| cp->mc_purge_cnt);
	1214	}
	1215	mcache_free(btp->bt_cache, bkt);
	1216	}
	1217
	1218	/*
	1219	* Update the bucket layer working set statistics.
	1220	*/
	1221	static void
	1222	mcache_bkt_ws_update(mcache_t *cp)
	1223	{
	1224	MCACHE_LOCK(&cp->mc_bkt_lock);
	1225
	1226	cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
	1227	cp->mc_full.bl_min = cp->mc_full.bl_total;
	1228	cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
	1229	cp->mc_empty.bl_min = cp->mc_empty.bl_total;
	1230
	1231	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1232	}
	1233
	1234	/*
	1235	* Reap all buckets that are beyond the working set.
	1236	*/
	1237	static void
	1238	mcache_bkt_ws_reap(mcache_t *cp)
	1239	{
	1240	long reap;
	1241	mcache_bkt_t *bkt;
	1242	mcache_bkttype_t *btp;
	1243
	1244	reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
	1245	while (reap-- &&
	1246	(bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
	1247	mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
	1248
	1249	reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
	1250	while (reap-- &&
	1251	(bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
	1252	mcache_bkt_destroy(cp, btp, bkt, 0);
	1253	}
	1254
	1255	static void
	1256	mcache_reap_timeout(void *arg)
	1257	{
	1258	volatile UInt32 *flag = arg;
	1259
	1260	ASSERT(flag == &mcache_reaping);
	1261
	1262	*flag = 0;
	1263	}
	1264
	1265	static void
	1266	mcache_reap_done(void *flag)
	1267	{
	1268	timeout(mcache_reap_timeout, flag, mcache_reap_interval);
	1269	}
	1270
	1271	static void
	1272	mcache_reap_start(void *arg)
	1273	{
	1274	UInt32 *flag = arg;
	1275
	1276	ASSERT(flag == &mcache_reaping);
	1277
	1278	mcache_applyall(mcache_cache_reap);
	1279	mcache_dispatch(mcache_reap_done, flag);
	1280	}
	1281
	1282	__private_extern__ void
	1283	mcache_reap(void)
	1284	{
	1285	UInt32 *flag = &mcache_reaping;
	1286
	1287	if (mcache_llock_owner == current_thread() \|\|
	1288	!OSCompareAndSwap(0, 1, flag))
	1289	return;
	1290
	1291	mcache_dispatch(mcache_reap_start, flag);
	1292	}
	1293
	1294	static void
	1295	mcache_cache_reap(mcache_t *cp)
	1296	{
	1297	mcache_bkt_ws_reap(cp);
	1298	}
	1299
	1300	/*
	1301	* Performs period maintenance on a cache.
	1302	*/
	1303	static void
	1304	mcache_cache_update(mcache_t *cp)
	1305	{
	1306	int need_bkt_resize = 0;
	1307	int need_bkt_reenable = 0;
	1308
	1309	lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
	1310
	1311	mcache_bkt_ws_update(cp);
	1312
	1313	/*
	1314	* Cache resize and post-purge reenable are mutually exclusive.
	1315	* If the cache was previously purged, there is no point of
	1316	* increasing the bucket size as there was an indication of
	1317	* memory pressure on the system.
	1318	*/
	1319	lck_mtx_lock_spin(&cp->mc_sync_lock);
	1320	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
	1321	need_bkt_reenable = 1;
	1322	lck_mtx_unlock(&cp->mc_sync_lock);
	1323
	1324	MCACHE_LOCK(&cp->mc_bkt_lock);
	1325	/*
	1326	* If the contention count is greater than the threshold, and if
	1327	* we are not already at the maximum bucket size, increase it.
	1328	* Otherwise, if this cache was previously purged by the user
	1329	* then we simply reenable it.
	1330	*/
	1331	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	1332	(int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	1333	mcache_bkt_contention && !need_bkt_reenable)
	1334	need_bkt_resize = 1;
	1335
	1336	cp ->mc_bkt_contention_prev = cp->mc_bkt_contention;
	1337	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1338
	1339	if (need_bkt_resize)
	1340	mcache_dispatch(mcache_cache_bkt_resize, cp);
	1341	else if (need_bkt_reenable)
	1342	mcache_dispatch(mcache_cache_enable, cp);
	1343	}
	1344
	1345	/*
	1346	* Recompute a cache's bucket size. This is an expensive operation
	1347	* and should not be done frequently; larger buckets provide for a
	1348	* higher transfer rate with the bucket while smaller buckets reduce
	1349	* the memory consumption.
	1350	*/
	1351	static void
	1352	mcache_cache_bkt_resize(void *arg)
	1353	{
	1354	mcache_t *cp = arg;
	1355	mcache_bkttype_t *btp = cp->cache_bkttype;
	1356
	1357	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
	1358	mcache_bkt_purge(cp);
	1359
	1360	/*
	1361	* Upgrade to the next bucket type with larger bucket size;
	1362	* temporarily set the previous contention snapshot to a
	1363	* negative number to prevent unnecessary resize request.
	1364	*/
	1365	MCACHE_LOCK(&cp->mc_bkt_lock);
	1366	cp->cache_bkttype = ++btp;
	1367	cp ->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
	1368	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1369
	1370	mcache_cache_enable(cp);
	1371	}
	1372	}
	1373
	1374	/*
	1375	* Reenable a previously disabled cache due to purge.
	1376	*/
	1377	static void
	1378	mcache_cache_enable(void *arg)
	1379	{
	1380	mcache_t *cp = arg;
	1381
	1382	lck_mtx_lock_spin(&cp->mc_sync_lock);
	1383	cp->mc_purge_cnt = 0;
	1384	cp->mc_enable_cnt = 0;
	1385	lck_mtx_unlock(&cp->mc_sync_lock);
	1386
	1387	mcache_cache_bkt_enable(cp);
	1388	}
	1389
	1390	static void
	1391	mcache_update_timeout(__unused void *arg)
	1392	{
	1393	timeout(mcache_update, NULL, mcache_reap_interval);
	1394	}
	1395
	1396	static void
	1397	mcache_update(__unused void *arg)
	1398	{
	1399	mcache_applyall(mcache_cache_update);
	1400	mcache_dispatch(mcache_update_timeout, NULL);
	1401	}
	1402
	1403	static void
	1404	mcache_applyall(void (func)(mcache_t ))
	1405	{
	1406	mcache_t *cp;
	1407
	1408	MCACHE_LIST_LOCK();
	1409	LIST_FOREACH(cp, &mcache_head, mc_list) {
	1410	func(cp);
	1411	}
	1412	MCACHE_LIST_UNLOCK();
	1413	}
	1414
	1415	static void
	1416	mcache_dispatch(void (func)(void ), void *arg)
	1417	{
	1418	ASSERT(func != NULL);
	1419	timeout(func, arg, hz/1000);
	1420	}
	1421
	1422	__private_extern__ void
	1423	mcache_buffer_log(mcache_audit_t mca, void addr, mcache_t *cp,
	1424	struct timeval *base_ts)
	1425	{
	1426	struct timeval now, base = { 0, 0 };
	1427	void *stack[MCACHE_STACK_DEPTH + 1];
	1428
	1429	mca->mca_addr = addr;
	1430	mca->mca_cache = cp;
	1431	mca->mca_pthread = mca->mca_thread;
	1432	mca->mca_thread = current_thread();
	1433	bcopy(mca->mca_stack, mca->mca_pstack, sizeof (mca->mca_pstack));
	1434	mca->mca_pdepth = mca->mca_depth;
	1435	bzero(stack, sizeof (stack));
	1436	mca->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	1437	bcopy(&stack[1], mca->mca_stack, sizeof (mca->mca_pstack));
	1438
	1439	mca->mca_ptstamp = mca->mca_tstamp;
	1440	microuptime(&now);
	1441	if (base_ts != NULL)
	1442	base = *base_ts;
	1443	/* tstamp is in ms relative to base_ts */
	1444	mca->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
	1445	if ((now.tv_sec - base.tv_sec) > 0)
	1446	mca->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
	1447	}
	1448
	1449	__private_extern__ void
	1450	mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
	1451	{
	1452	u_int64_t buf_end = (u_int64_t )((void )((char )buf_arg + size));
	1453	u_int64_t buf = (u_int64_t )buf_arg;
	1454
	1455	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	1456	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
	1457
	1458	while (buf < buf_end)
	1459	*buf++ = pattern;
	1460	}
	1461
	1462	__private_extern__ void *
	1463	mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
	1464	{
	1465	u_int64_t buf_end = (u_int64_t )((void )((char )buf_arg + size));
	1466	u_int64_t *buf;
	1467
	1468	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	1469	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
	1470
	1471	for (buf = buf_arg; buf < buf_end; buf++) {
	1472	if (*buf != pattern)
	1473	return (buf);
	1474	}
	1475	return (NULL);
	1476	}
	1477
	1478	__private_extern__ void *
	1479	mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
	1480	size_t size)
	1481	{
	1482	u_int64_t buf_end = (u_int64_t )((void )((char )buf_arg + size));
	1483	u_int64_t *buf;
	1484
	1485	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	1486	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
	1487
	1488	for (buf = buf_arg; buf < buf_end; buf++) {
	1489	if (*buf != old) {
	1490	mcache_set_pattern(old, buf_arg,
	1491	(uintptr_t)buf - (uintptr_t)buf_arg);
	1492	return (buf);
	1493	}
	1494	*buf = new;
	1495	}
	1496	return (NULL);
	1497	}
	1498
	1499	__private_extern__ void
	1500	mcache_audit_free_verify(mcache_audit_t mca, void base, size_t offset,
	1501	size_t size)
	1502	{
	1503	void *addr;
	1504	u_int64_t *oaddr64;
	1505	mcache_obj_t *next;
	1506
	1507	addr = (void *)((uintptr_t)base + offset);
	1508	next = ((mcache_obj_t *)addr)->obj_next;
	1509
	1510	/* For the "obj_next" pointer in the buffer */
	1511	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	1512	*oaddr64 = MCACHE_FREE_PATTERN;
	1513
	1514	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	1515	(caddr_t)base, size)) != NULL) {
	1516	mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
	1517	(int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
	1518	/* NOTREACHED */
	1519	}
	1520	((mcache_obj_t *)addr)->obj_next = next;
	1521	}
	1522
	1523	__private_extern__ void
	1524	mcache_audit_free_verify_set(mcache_audit_t mca, void base, size_t offset,
	1525	size_t size)
	1526	{
	1527	void *addr;
	1528	u_int64_t *oaddr64;
	1529	mcache_obj_t *next;
	1530
	1531	addr = (void *)((uintptr_t)base + offset);
	1532	next = ((mcache_obj_t *)addr)->obj_next;
	1533
	1534	/* For the "obj_next" pointer in the buffer */
	1535	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	1536	*oaddr64 = MCACHE_FREE_PATTERN;
	1537
	1538	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	1539	MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
	1540	mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
	1541	(int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
	1542	/* NOTREACHED */
	1543	}
	1544	((mcache_obj_t *)addr)->obj_next = next;
	1545	}
	1546
	1547	#undef panic
	1548
	1549	__private_extern__ char *
	1550	mcache_dump_mca(mcache_audit_t *mca)
	1551	{
	1552	if (mca_dump_buf == NULL)
	1553	return (NULL);
	1554
	1555	snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
	1556	"mca %p: addr %p, cache %p (%s)\n"
	1557	"last transaction; thread %p, saved PC stack (%d deep):\n"
	1558	"\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	1559	"\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	1560	"previous transaction; thread %p, saved PC stack (%d deep):\n"
	1561	"\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	1562	"\t%p, %p, %p, %p, %p, %p, %p, %p\n",
	1563	mca, mca->mca_addr, mca->mca_cache,
	1564	mca->mca_cache ? mca->mca_cache->mc_name : "?",
	1565	mca->mca_thread, mca->mca_depth,
	1566	mca->mca_stack[0], mca->mca_stack[1], mca->mca_stack[2],
	1567	mca->mca_stack[3], mca->mca_stack[4], mca->mca_stack[5],
	1568	mca->mca_stack[6], mca->mca_stack[7], mca->mca_stack[8],
	1569	mca->mca_stack[9], mca->mca_stack[10], mca->mca_stack[11],
	1570	mca->mca_stack[12], mca->mca_stack[13], mca->mca_stack[14],
	1571	mca->mca_stack[15],
	1572	mca->mca_pthread, mca->mca_pdepth,
	1573	mca->mca_pstack[0], mca->mca_pstack[1], mca->mca_pstack[2],
	1574	mca->mca_pstack[3], mca->mca_pstack[4], mca->mca_pstack[5],
	1575	mca->mca_pstack[6], mca->mca_pstack[7], mca->mca_pstack[8],
	1576	mca->mca_pstack[9], mca->mca_pstack[10], mca->mca_pstack[11],
	1577	mca->mca_pstack[12], mca->mca_pstack[13], mca->mca_pstack[14],
	1578	mca->mca_pstack[15]);
	1579
	1580	return (mca_dump_buf);
	1581	}
	1582
	1583	__private_extern__ void
	1584	mcache_audit_panic(mcache_audit_t mca, void addr, size_t offset,
	1585	int64_t expected, int64_t got)
	1586	{
	1587	if (mca == NULL) {
	1588	panic("mcache_audit: buffer %p modified after free at "
	1589	"offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
	1590	offset, got, expected);
	1591	/* NOTREACHED */
	1592	}
	1593
	1594	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	1595	"(0x%llx instead of 0x%llx)\n%s\n",
	1596	addr, offset, got, expected, mcache_dump_mca(mca));
	1597	/* NOTREACHED */
	1598	}
	1599
	1600	__private_extern__ int
	1601	assfail(const char a, const char f, int l)
	1602	{
	1603	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	1604	return (0);
	1605	}