git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2006-2014 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	/*
	30	* Memory allocator with per-CPU caching, derived from the kmem magazine
	31	* concept and implementation as described in the following paper:
	32	* http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
	33	* That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
	34	* reserved. Use is subject to license terms.
	35	*
	36	* There are several major differences between this and the original kmem
	37	* magazine: this derivative implementation allows for multiple objects to
	38	* be allocated and freed from/to the object cache in one call; in addition,
	39	* it provides for better flexibility where the user is allowed to define
	40	* its own slab allocator (instead of the default zone allocator). Finally,
	41	* no object construction/destruction takes place at the moment, although
	42	* this could be added in future to improve efficiency.
	43	*/
	44
	45	#include <sys/param.h>
	46	#include <sys/types.h>
	47	#include <sys/malloc.h>
	48	#include <sys/mbuf.h>
	49	#include <sys/queue.h>
	50	#include <sys/kernel.h>
	51	#include <sys/systm.h>
	52
	53	#include <kern/debug.h>
	54	#include <kern/zalloc.h>
	55	#include <kern/cpu_number.h>
	56	#include <kern/locks.h>
	57	#include <kern/thread_call.h>
	58
	59	#include <libkern/libkern.h>
	60	#include <libkern/OSAtomic.h>
	61	#include <libkern/OSDebug.h>
	62
	63	#include <mach/vm_param.h>
	64	#include <machine/limits.h>
	65	#include <machine/machine_routines.h>
	66
	67	#include <string.h>
	68
	69	#include <sys/mcache.h>
	70
	/*
	 * Size in bytes of an mcache_t whose trailing per-CPU array holds `n'
	 * entries; computed as the byte offset of mc_cpu[n] from the start of
	 * the structure.
	 */
	#define	MCACHE_SIZE(n) \
		((size_t)(&((mcache_t *)0)->mc_cpu[n]))

	/* Allocate extra in case we need to manually align the pointer */
	#define	MCACHE_ALLOC_SIZE \
		(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

	/*
	 * Locate the per-CPU structure of cache `c' that belongs to the CPU
	 * returned by cpu_number() at the time of evaluation.
	 */
	#define	MCACHE_CPU(c) \
		((mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number()))))
	80
	/*
	 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
	 * to serialize accesses to the global list of caches in the system.
	 * They also record the thread currently running in the critical
	 * section, so that we can avoid recursive requests to reap the
	 * caches when memory runs low.
	 *
	 * Both are wrapped in do { } while (0) so that they expand to a single
	 * statement and remain safe inside unbraced if/else bodies.
	 */
	#define	MCACHE_LIST_LOCK() do {					\
		lck_mtx_lock(mcache_llock);				\
		mcache_llock_owner = current_thread();			\
	} while (0)

	#define	MCACHE_LIST_UNLOCK() do {				\
		mcache_llock_owner = NULL;				\
		lck_mtx_unlock(mcache_llock);				\
	} while (0)

	/* Thin wrappers around the mutex primitives used for per-cache locks */
	#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
	#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
	#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)
	101
	102	static int ncpu;
	103	static unsigned int cache_line_size;
	104	static lck_mtx_t *mcache_llock;
	105	static struct thread *mcache_llock_owner;
	106	static lck_attr_t *mcache_llock_attr;
	107	static lck_grp_t *mcache_llock_grp;
	108	static lck_grp_attr_t *mcache_llock_grp_attr;
	109	static struct zone *mcache_zone;
	110	static const uint32_t mcache_reap_interval = 15;
	111	static const uint32_t mcache_reap_interval_leeway = 2;
	112	static UInt32 mcache_reaping;
	113	static int mcache_ready;
	114	static int mcache_updating;
	115
	116	static int mcache_bkt_contention = 3;
	117	#if DEBUG
	118	static unsigned int mcache_flags = MCF_DEBUG;
	119	#else
	120	static unsigned int mcache_flags = 0;
	121	#endif
	122
	123	int mca_trn_max = MCA_TRN_MAX;
	124
	125	#define DUMP_MCA_BUF_SIZE 512
	126	static char *mca_dump_buf;
	127
	128	static mcache_bkttype_t mcache_bkttype[] = {
	129	{ 1, 4096, 32768, NULL },
	130	{ 3, 2048, 16384, NULL },
	131	{ 7, 1024, 12288, NULL },
	132	{ 15, 256, 8192, NULL },
	133	{ 31, 64, 4096, NULL },
	134	{ 47, 0, 2048, NULL },
	135	{ 63, 0, 1024, NULL },
	136	{ 95, 0, 512, NULL },
	137	{ 143, 0, 256, NULL },
	138	{ 165, 0, 0, NULL },
	139	};
	140
	141	static mcache_t mcache_create_common(const char , size_t, size_t,
	142	mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
	143	mcache_notifyfn_t, void *, u_int32_t, int, int);
	144	static unsigned int mcache_slab_alloc(void , mcache_obj_t **,
	145	unsigned int, int);
	146	static void mcache_slab_free(void , mcache_obj_t , boolean_t);
	147	static void mcache_slab_audit(void , mcache_obj_t , boolean_t);
	148	static void mcache_cpu_refill(mcache_cpu_t , mcache_bkt_t , int);
	149	static mcache_bkt_t mcache_bkt_alloc(mcache_t , mcache_bktlist_t *,
	150	mcache_bkttype_t **);
	151	static void mcache_bkt_free(mcache_t , mcache_bktlist_t , mcache_bkt_t *);
	152	static void mcache_cache_bkt_enable(mcache_t *);
	153	static void mcache_bkt_purge(mcache_t *);
	154	static void mcache_bkt_destroy(mcache_t , mcache_bkttype_t ,
	155	mcache_bkt_t *, int);
	156	static void mcache_bkt_ws_update(mcache_t *);
	157	static void mcache_bkt_ws_reap(mcache_t *);
	158	static void mcache_dispatch(void ()(void ), void *);
	159	static void mcache_cache_reap(mcache_t *);
	160	static void mcache_cache_update(mcache_t *);
	161	static void mcache_cache_bkt_resize(void *);
	162	static void mcache_cache_enable(void *);
	163	static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
	164	static void mcache_update_timeout(void *);
	165	static void mcache_applyall(void ()(mcache_t ));
	166	static void mcache_reap_start(void *);
	167	static void mcache_reap_done(void *);
	168	static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
	169	static void mcache_notify(mcache_t *, u_int32_t);
	170	static void mcache_purge(void *);
	171
	172	static LIST_HEAD(, mcache) mcache_head;
	173	mcache_t *mcache_audit_cache;
	174
	175	static thread_call_t mcache_reap_tcall;
	176	static thread_call_t mcache_update_tcall;
	177
	178	/*
	179	* Initialize the framework; this is currently called as part of BSD init.
	180	*/
	181	__private_extern__ void
	182	mcache_init(void)
	183	{
	184	mcache_bkttype_t *btp;
	185	unsigned int i;
	186	char name[32];
	187
	188	VERIFY(mca_trn_max >= 2);
	189
	190	ncpu = ml_get_max_cpus();
	191	(void) mcache_cache_line_size(); /* prime it */
	192
	193	mcache_llock_grp_attr = lck_grp_attr_alloc_init();
	194	mcache_llock_grp = lck_grp_alloc_init("mcache.list",
	195	mcache_llock_grp_attr);
	196	mcache_llock_attr = lck_attr_alloc_init();
	197	mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
	198
	199	mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
	200	mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
	201	if (mcache_reap_tcall == NULL \|\| mcache_update_tcall == NULL)
	202	panic("mcache_init: thread_call_allocate failed");
	203
	204	mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
	205	PAGE_SIZE, "mcache");
	206	if (mcache_zone == NULL)
	207	panic("mcache_init: failed to allocate mcache zone\n");
	208	zone_change(mcache_zone, Z_CALLERACCT, FALSE);
	209
	210	LIST_INIT(&mcache_head);
	211
	212	for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
	213	btp = &mcache_bkttype[i];
	214	(void) snprintf(name, sizeof (name), "bkt_%d",
	215	btp->bt_bktsize);
	216	btp->bt_cache = mcache_create(name,
	217	(btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
	218	}
	219
	220	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
	221	mcache_flags &= MCF_FLAGS_MASK;
	222
	223	mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
	224	0, 0, MCR_SLEEP);
	225
	226	mcache_applyall(mcache_cache_bkt_enable);
	227	mcache_ready = 1;
	228
	229	printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
	230	ncpu, CPU_CACHE_LINE_SIZE);
	231	}
	232
	233	/*
	234	* Return the global mcache flags.
	235	*/
	236	__private_extern__ unsigned int
	237	mcache_getflags(void)
	238	{
	239	return (mcache_flags);
	240	}
	241
	242	/*
	243	* Return the CPU cache line size.
	244	*/
	245	__private_extern__ unsigned int
	246	mcache_cache_line_size(void)
	247	{
	248	if (cache_line_size == 0) {
	249	ml_cpu_info_t cpu_info;
	250	ml_cpu_get_info(&cpu_info);
	251	cache_line_size = cpu_info.cache_line_size;
	252	}
	253	return (cache_line_size);
	254	}
	255
	256	/*
	257	* Create a cache using the zone allocator as the backend slab allocator.
	258	* The caller may specify any alignment for the object; if it specifies 0
	259	* the default alignment (MCACHE_ALIGN) will be used.
	260	*/
	261	__private_extern__ mcache_t *
	262	mcache_create(const char *name, size_t bufsize, size_t align,
	263	u_int32_t flags, int wait)
	264	{
	265	return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
	266	mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
	267	wait));
	268	}
	269
	270	/*
	271	* Create a cache using a custom backend slab allocator. Since the caller
	272	* is responsible for allocation, no alignment guarantee will be provided
	273	* by this framework.
	274	*/
	275	__private_extern__ mcache_t *
	276	mcache_create_ext(const char *name, size_t bufsize,
	277	mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
	278	mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
	279	u_int32_t flags, int wait)
	280	{
	281	return (mcache_create_common(name, bufsize, 0, allocfn,
	282	freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
	283	}
	284
	285	/*
	286	* Common cache creation routine.
	287	*/
	288	static mcache_t *
	289	mcache_create_common(const char *name, size_t bufsize, size_t align,
	290	mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
	291	mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
	292	u_int32_t flags, int need_zone, int wait)
	293	{
	294	mcache_bkttype_t *btp;
	295	mcache_t *cp = NULL;
	296	size_t chunksize;
	297	void buf, *pbuf;
	298	int c;
	299	char lck_name[64];
	300
	301	/* If auditing is on and print buffer is NULL, allocate it now */
	302	if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
	303	int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
	304	MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
	305	malloc_wait \| M_ZERO);
	306	if (mca_dump_buf == NULL)
	307	return (NULL);
	308	}
	309
	310	if (!(wait & MCR_NOSLEEP))
	311	buf = zalloc(mcache_zone);
	312	else
	313	buf = zalloc_noblock(mcache_zone);
	314
	315	if (buf == NULL)
	316	goto fail;
	317
	318	bzero(buf, MCACHE_ALLOC_SIZE);
	319
	320	/*
	321	* In case we didn't get a cache-aligned memory, round it up
	322	* accordingly. This is needed in order to get the rest of
	323	* structure members aligned properly. It also means that
	324	* the memory span gets shifted due to the round up, but it
	325	* is okay since we've allocated extra space for this.
	326	*/
	327	cp = (mcache_t *)
	328	P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_LINE_SIZE);
	329	pbuf = (void *)((intptr_t)cp - sizeof (void ));
	330	*pbuf = buf;
	331
	332	/*
	333	* Guaranteed alignment is valid only when we use the internal
	334	* slab allocator (currently set to use the zone allocator).
	335	*/
	336	if (!need_zone)
	337	align = 1;
	338	else if (align == 0)
	339	align = MCACHE_ALIGN;
	340
	341	if ((align & (align - 1)) != 0)
	342	panic("mcache_create: bad alignment %lu", align);
	343
	344	cp->mc_align = align;
	345	cp->mc_slab_alloc = allocfn;
	346	cp->mc_slab_free = freefn;
	347	cp->mc_slab_audit = auditfn;
	348	cp->mc_slab_log = logfn;
	349	cp->mc_slab_notify = notifyfn;
	350	cp->mc_private = need_zone ? cp : arg;
	351	cp->mc_bufsize = bufsize;
	352	cp->mc_flags = (flags & MCF_FLAGS_MASK) \| mcache_flags;
	353
	354	(void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);
	355
	356	(void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
	357	cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
	358	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
	359	cp->mc_cpu_lock_grp_attr);
	360	cp->mc_cpu_lock_attr = lck_attr_alloc_init();
	361
	362	/*
	363	* Allocation chunk size is the object's size plus any extra size
	364	* needed to satisfy the object's alignment. It is enforced to be
	365	* at least the size of an LP64 pointer to simplify auditing and to
	366	* handle multiple-element allocation requests, where the elements
	367	* returned are linked together in a list.
	368	*/
	369	chunksize = MAX(bufsize, sizeof (u_int64_t));
	370	if (need_zone) {
	371	/* Enforce 64-bit minimum alignment for zone-based buffers */
	372	align = MAX(align, sizeof (u_int64_t));
	373	chunksize += sizeof (void *) + align;
	374	chunksize = P2ROUNDUP(chunksize, align);
	375	if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
	376	PAGE_SIZE, cp->mc_name)) == NULL)
	377	goto fail;
	378	zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
	379	}
	380	cp->mc_chunksize = chunksize;
	381
	382	/*
	383	* Initialize the bucket layer.
	384	*/
	385	(void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
	386	cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
	387	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	388	cp->mc_bkt_lock_grp_attr);
	389	cp->mc_bkt_lock_attr = lck_attr_alloc_init();
	390	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
	391	cp->mc_bkt_lock_attr);
	392
	393	(void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
	394	cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
	395	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	396	cp->mc_sync_lock_grp_attr);
	397	cp->mc_sync_lock_attr = lck_attr_alloc_init();
	398	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
	399	cp->mc_sync_lock_attr);
	400
	401	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
	402	continue;
	403
	404	cp->cache_bkttype = btp;
	405
	406	/*
	407	* Initialize the CPU layer. Each per-CPU structure is aligned
	408	* on the CPU cache line boundary to prevent false sharing.
	409	*/
	410	for (c = 0; c < ncpu; c++) {
	411	mcache_cpu_t *ccp = &cp->mc_cpu[c];
	412
	413	VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
	414	lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
	415	cp->mc_cpu_lock_attr);
	416	ccp->cc_objs = -1;
	417	ccp->cc_pobjs = -1;
	418	}
	419
	420	if (mcache_ready)
	421	mcache_cache_bkt_enable(cp);
	422
	423	/* TODO: dynamically create sysctl for stats */
	424
	425	MCACHE_LIST_LOCK();
	426	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	427	MCACHE_LIST_UNLOCK();
	428
	429	/*
	430	* If cache buckets are enabled and this is the first cache
	431	* created, start the periodic cache update.
	432	*/
	433	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
	434	mcache_updating = 1;
	435	mcache_update_timeout(NULL);
	436	}
	437	if (cp->mc_flags & MCF_DEBUG) {
	438	printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
	439	"chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
	440	arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	441	}
	442	return (cp);
	443
	444	fail:
	445	if (buf != NULL)
	446	zfree(mcache_zone, buf);
	447	return (NULL);
	448	}
	449
	450	/*
	451	* Allocate one or more objects from a cache.
	452	*/
	453	__private_extern__ unsigned int
	454	mcache_alloc_ext(mcache_t cp, mcache_obj_t *list, unsigned int num, int wait)
	455	{
	456	mcache_cpu_t *ccp;
	457	mcache_obj_t *top = &(list);
	458	mcache_bkt_t *bkt;
	459	unsigned int need = num;
	460	boolean_t nwretry = FALSE;
	461
	462	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	463	VERIFY((wait & (MCR_NOSLEEP\|MCR_FAILOK)) != (MCR_NOSLEEP\|MCR_FAILOK));
	464
	465	ASSERT(list != NULL);
	466	*list = NULL;
	467
	468	if (num == 0)
	469	return (0);
	470
	471	retry_alloc:
	472	/* We may not always be running in the same CPU in case of retries */
	473	ccp = MCACHE_CPU(cp);
	474
	475	MCACHE_LOCK(&ccp->cc_lock);
	476	for (;;) {
	477	/*
	478	* If we have an object in the current CPU's filled bucket,
	479	* chain the object to any previous objects and return if
	480	* we've satisfied the number of requested objects.
	481	*/
	482	if (ccp->cc_objs > 0) {
	483	mcache_obj_t *tail;
	484	int objs;
	485
	486	/*
	487	* Objects in the bucket are already linked together
	488	* with the most recently freed object at the head of
	489	* the list; grab as many objects as we can.
	490	*/
	491	objs = MIN((unsigned int)ccp->cc_objs, need);
	492	*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
	493	ccp->cc_objs -= objs;
	494	ccp->cc_alloc += objs;
	495
	496	tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
	497	list = &tail->obj_next;
	498	*list = NULL;
	499
	500	/* If we got them all, return to caller */
	501	if ((need -= objs) == 0) {
	502	MCACHE_UNLOCK(&ccp->cc_lock);
	503
	504	if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
	505	cp->mc_slab_log != NULL)
	506	(cp->mc_slab_log)(num, top, TRUE);
	507
	508	if (cp->mc_flags & MCF_DEBUG)
	509	goto debug_alloc;
	510
	511	return (num);
	512	}
	513	}
	514
	515	/*
	516	* The CPU's filled bucket is empty. If the previous filled
	517	* bucket was full, exchange and try again.
	518	*/
	519	if (ccp->cc_pobjs > 0) {
	520	mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
	521	continue;
	522	}
	523
	524	/*
	525	* If the bucket layer is disabled, allocate from slab. This
	526	* can happen either because MCF_NOCPUCACHE is set, or because
	527	* the bucket layer is currently being resized.
	528	*/
	529	if (ccp->cc_bktsize == 0)
	530	break;
	531
	532	/*
	533	* Both of the CPU's buckets are empty; try to get a full
	534	* bucket from the bucket layer. Upon success, refill this
	535	* CPU and place any empty bucket into the empty list.
	536	*/
	537	bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
	538	if (bkt != NULL) {
	539	if (ccp->cc_pfilled != NULL)
	540	mcache_bkt_free(cp, &cp->mc_empty,
	541	ccp->cc_pfilled);
	542	mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
	543	continue;
	544	}
	545
	546	/*
	547	* The bucket layer has no full buckets; allocate the
	548	* object(s) directly from the slab layer.
	549	*/
	550	break;
	551	}
	552	MCACHE_UNLOCK(&ccp->cc_lock);
	553
	554	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
	555
	556	/*
	557	* If this is a blocking allocation, or if it is non-blocking and
	558	* the cache's full bucket is non-empty, then retry the allocation.
	559	*/
	560	if (need > 0) {
	561	if (!(wait & MCR_NONBLOCKING)) {
	562	atomic_add_32(&cp->mc_wretry_cnt, 1);
	563	goto retry_alloc;
	564	} else if ((wait & (MCR_NOSLEEP \| MCR_TRYHARD)) &&
	565	!mcache_bkt_isempty(cp)) {
	566	if (!nwretry)
	567	nwretry = TRUE;
	568	atomic_add_32(&cp->mc_nwretry_cnt, 1);
	569	goto retry_alloc;
	570	} else if (nwretry) {
	571	atomic_add_32(&cp->mc_nwfail_cnt, 1);
	572	}
	573	}
	574
	575	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
	576	(cp->mc_slab_log)((num - need), top, TRUE);
	577
	578	if (!(cp->mc_flags & MCF_DEBUG))
	579	return (num - need);
	580
	581	debug_alloc:
	582	if (cp->mc_flags & MCF_DEBUG) {
	583	mcache_obj_t **o = top;
	584	unsigned int n;
	585
	586	n = 0;
	587	/*
	588	* Verify that the chain of objects have the same count as
	589	* what we are about to report to the caller. Any mismatch
	590	* here means that the object list is insanely broken and
	591	* therefore we must panic.
	592	*/
	593	while (*o != NULL) {
	594	o = &(*o)->obj_next;
	595	++n;
	596	}
	597	if (n != (num - need)) {
	598	panic("mcache_alloc_ext: %s cp %p corrupted list "
	599	"(got %d actual %d)\n", cp->mc_name,
	600	(void *)cp, num - need, n);
	601	}
	602	}
	603
	604	/* Invoke the slab layer audit callback if auditing is enabled */
	605	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
	606	(cp->mc_slab_audit)(cp->mc_private, top, TRUE);
	607
	608	return (num - need);
	609	}
	610
	611	/*
	612	* Allocate a single object from a cache.
	613	*/
	614	__private_extern__ void *
	615	mcache_alloc(mcache_t *cp, int wait)
	616	{
	617	mcache_obj_t *buf;
	618
	619	(void) mcache_alloc_ext(cp, &buf, 1, wait);
	620	return (buf);
	621	}
	622
	623	__private_extern__ void
	624	mcache_waiter_inc(mcache_t *cp)
	625	{
	626	atomic_add_32(&cp->mc_waiter_cnt, 1);
	627	}
	628
	629	__private_extern__ void
	630	mcache_waiter_dec(mcache_t *cp)
	631	{
	632	atomic_add_32(&cp->mc_waiter_cnt, -1);
	633	}
	634
	635	__private_extern__ boolean_t
	636	mcache_bkt_isempty(mcache_t *cp)
	637	{
	638	/*
	639	* This isn't meant to accurately tell whether there are
	640	* any full buckets in the cache; it is simply a way to
	641	* obtain "hints" about the state of the cache.
	642	*/
	643	return (cp->mc_full.bl_total == 0);
	644	}
	645
	646	/*
	647	* Notify the slab layer about an event.
	648	*/
	649	static void
	650	mcache_notify(mcache_t *cp, u_int32_t event)
	651	{
	652	if (cp->mc_slab_notify != NULL)
	653	(*cp->mc_slab_notify)(cp->mc_private, event);
	654	}
	655
	656	/*
	657	* Purge the cache and disable its buckets.
	658	*/
	659	static void
	660	mcache_purge(void *arg)
	661	{
	662	mcache_t *cp = arg;
	663
	664	mcache_bkt_purge(cp);
	665	/*
	666	* We cannot simply call mcache_cache_bkt_enable() from here as
	667	* a bucket resize may be in flight and we would cause the CPU
	668	* layers of the cache to point to different sizes. Therefore,
	669	* we simply increment the enable count so that during the next
	670	* periodic cache update the buckets can be reenabled.
	671	*/
	672	lck_mtx_lock_spin(&cp->mc_sync_lock);
	673	cp->mc_enable_cnt++;
	674	lck_mtx_unlock(&cp->mc_sync_lock);
	675	}
	676
	677	__private_extern__ boolean_t
	678	mcache_purge_cache(mcache_t *cp, boolean_t async)
	679	{
	680	/*
	681	* Purging a cache that has no per-CPU caches or is already
	682	* in the process of being purged is rather pointless.
	683	*/
	684	if (cp->mc_flags & MCF_NOCPUCACHE)
	685	return (FALSE);
	686
	687	lck_mtx_lock_spin(&cp->mc_sync_lock);
	688	if (cp->mc_purge_cnt > 0) {
	689	lck_mtx_unlock(&cp->mc_sync_lock);
	690	return (FALSE);
	691	}
	692	cp->mc_purge_cnt++;
	693	lck_mtx_unlock(&cp->mc_sync_lock);
	694
	695	if (async)
	696	mcache_dispatch(mcache_purge, cp);
	697	else
	698	mcache_purge(cp);
	699
	700	return (TRUE);
	701	}
	702
	703	/*
	704	* Free a single object to a cache.
	705	*/
	706	__private_extern__ void
	707	mcache_free(mcache_t cp, void buf)
	708	{
	709	((mcache_obj_t *)buf)->obj_next = NULL;
	710	mcache_free_ext(cp, (mcache_obj_t *)buf);
	711	}
	712
	713	/*
	714	* Free one or more objects to a cache.
	715	*/
	716	__private_extern__ void
	717	mcache_free_ext(mcache_t cp, mcache_obj_t list)
	718	{
	719	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	720	mcache_bkttype_t *btp;
	721	mcache_obj_t *nlist;
	722	mcache_bkt_t *bkt;
	723
	724	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
	725	(*cp->mc_slab_log)(0, list, FALSE);
	726
	727	/* Invoke the slab layer audit callback if auditing is enabled */
	728	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
	729	(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
	730
	731	MCACHE_LOCK(&ccp->cc_lock);
	732	for (;;) {
	733	/*
	734	* If there is space in the current CPU's filled bucket, put
	735	* the object there and return once all objects are freed.
	736	* Note the cast to unsigned integer takes care of the case
	737	* where the bucket layer is disabled (when cc_objs is -1).
	738	*/
	739	if ((unsigned int)ccp->cc_objs <
	740	(unsigned int)ccp->cc_bktsize) {
	741	/*
	742	* Reverse the list while we place the object into the
	743	* bucket; this effectively causes the most recently
	744	* freed object(s) to be reused during allocation.
	745	*/
	746	nlist = list->obj_next;
	747	list->obj_next = (ccp->cc_objs == 0) ? NULL :
	748	ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
	749	ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
	750	ccp->cc_free++;
	751
	752	if ((list = nlist) != NULL)
	753	continue;
	754
	755	/* We are done; return to caller */
	756	MCACHE_UNLOCK(&ccp->cc_lock);
	757
	758	/* If there is a waiter below, notify it */
	759	if (cp->mc_waiter_cnt > 0)
	760	mcache_notify(cp, MCN_RETRYALLOC);
	761	return;
	762	}
	763
	764	/*
	765	* The CPU's filled bucket is full. If the previous filled
	766	* bucket was empty, exchange and try again.
	767	*/
	768	if (ccp->cc_pobjs == 0) {
	769	mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
	770	continue;
	771	}
	772
	773	/*
	774	* If the bucket layer is disabled, free to slab. This can
	775	* happen either because MCF_NOCPUCACHE is set, or because
	776	* the bucket layer is currently being resized.
	777	*/
	778	if (ccp->cc_bktsize == 0)
	779	break;
	780
	781	/*
	782	* Both of the CPU's buckets are full; try to get an empty
	783	* bucket from the bucket layer. Upon success, empty this
	784	* CPU and place any full bucket into the full list.
	785	*/
	786	bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
	787	if (bkt != NULL) {
	788	if (ccp->cc_pfilled != NULL)
	789	mcache_bkt_free(cp, &cp->mc_full,
	790	ccp->cc_pfilled);
	791	mcache_cpu_refill(ccp, bkt, 0);
	792	continue;
	793	}
	794
	795	/*
	796	* We need an empty bucket to put our freed objects into
	797	* but couldn't get an empty bucket from the bucket layer;
	798	* attempt to allocate one. We do not want to block for
	799	* allocation here, and if the bucket allocation fails
	800	* we will simply fall through to the slab layer.
	801	*/
	802	MCACHE_UNLOCK(&ccp->cc_lock);
	803	bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
	804	MCACHE_LOCK(&ccp->cc_lock);
	805
	806	if (bkt != NULL) {
	807	/*
	808	* We have an empty bucket, but since we drop the
	809	* CPU lock above, the cache's bucket size may have
	810	* changed. If so, free the bucket and try again.
	811	*/
	812	if (ccp->cc_bktsize != btp->bt_bktsize) {
	813	MCACHE_UNLOCK(&ccp->cc_lock);
	814	mcache_free(btp->bt_cache, bkt);
	815	MCACHE_LOCK(&ccp->cc_lock);
	816	continue;
	817	}
	818
	819	/*
	820	* We have an empty bucket of the right size;
	821	* add it to the bucket layer and try again.
	822	*/
	823	mcache_bkt_free(cp, &cp->mc_empty, bkt);
	824	continue;
	825	}
	826
	827	/*
	828	* The bucket layer has no empty buckets; free the
	829	* object(s) directly to the slab layer.
	830	*/
	831	break;
	832	}
	833	MCACHE_UNLOCK(&ccp->cc_lock);
	834
	835	/* If there is a waiter below, notify it */
	836	if (cp->mc_waiter_cnt > 0)
	837	mcache_notify(cp, MCN_RETRYALLOC);
	838
	839	/* Advise the slab layer to purge the object(s) */
	840	(*cp->mc_slab_free)(cp->mc_private, list,
	841	(cp->mc_flags & MCF_DEBUG) \|\| cp->mc_purge_cnt);
	842	}
	843
	844	/*
	845	* Cache destruction routine.
	846	*/
	847	__private_extern__ void
	848	mcache_destroy(mcache_t *cp)
	849	{
	850	void **pbuf;
	851
	852	MCACHE_LIST_LOCK();
	853	LIST_REMOVE(cp, mc_list);
	854	MCACHE_LIST_UNLOCK();
	855
	856	mcache_bkt_purge(cp);
	857
	858	/*
	859	* This cache is dead; there should be no further transaction.
	860	* If it's still invoked, make sure that it induces a fault.
	861	*/
	862	cp->mc_slab_alloc = NULL;
	863	cp->mc_slab_free = NULL;
	864	cp->mc_slab_audit = NULL;
	865
	866	lck_attr_free(cp->mc_bkt_lock_attr);
	867	lck_grp_free(cp->mc_bkt_lock_grp);
	868	lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
	869
	870	lck_attr_free(cp->mc_cpu_lock_attr);
	871	lck_grp_free(cp->mc_cpu_lock_grp);
	872	lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
	873
	874	lck_attr_free(cp->mc_sync_lock_attr);
	875	lck_grp_free(cp->mc_sync_lock_grp);
	876	lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
	877
	878	/*
	879	* TODO: We need to destroy the zone here, but cannot do it
	880	* because there is no such way to achieve that. Until then
	881	* the memory allocated for the zone structure is leaked.
	882	* Once it is achievable, uncomment these lines:
	883	*
	884	* if (cp->mc_slab_zone != NULL) {
	885	* zdestroy(cp->mc_slab_zone);
	886	* cp->mc_slab_zone = NULL;
	887	* }
	888	*/
	889
	890	/* Get the original address since we're about to free it */
	891	pbuf = (void *)((intptr_t)cp - sizeof (void ));
	892
	893	zfree(mcache_zone, *pbuf);
	894	}
	895
	896	/*
	897	* Internal slab allocator used as a backend for simple caches. The current
	898	* implementation uses the zone allocator for simplicity reasons.
	899	*/
	900	static unsigned int
	901	mcache_slab_alloc(void arg, mcache_obj_t **plist, unsigned int num, int wait)
	902	{
	903	mcache_t *cp = arg;
	904	unsigned int need = num;
	905	size_t offset = 0;
	906	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	907	u_int32_t flags = cp->mc_flags;
	908	void buf, base, **pbuf;
	909	mcache_obj_t *list = plist;
	910
	911	*list = NULL;
	912
	913	/*
	914	* The address of the object returned to the caller is an
	915	* offset from the 64-bit aligned base address only if the
	916	* cache's alignment requirement is neither 1 nor 8 bytes.
	917	*/
	918	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
	919	offset = cp->mc_align;
	920
	921	for (;;) {
	922	if (!(wait & MCR_NOSLEEP))
	923	buf = zalloc(cp->mc_slab_zone);
	924	else
	925	buf = zalloc_noblock(cp->mc_slab_zone);
	926
	927	if (buf == NULL)
	928	break;
	929
	930	/* Get the 64-bit aligned base address for this object */
	931	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
	932	sizeof (u_int64_t));
	933
	934	/*
	935	* Wind back a pointer size from the aligned base and
	936	* save the original address so we can free it later.
	937	*/
	938	pbuf = (void *)((intptr_t)base - sizeof (void ));
	939	*pbuf = buf;
	940
	941	/*
	942	* If auditing is enabled, patternize the contents of
	943	* the buffer starting from the 64-bit aligned base to
	944	* the end of the buffer; the length is rounded up to
	945	* the nearest 64-bit multiply; this is because we use
	946	* 64-bit memory access to set/check the pattern.
	947	*/
	948	if (flags & MCF_DEBUG) {
	949	VERIFY(((intptr_t)base + rsize) <=
	950	((intptr_t)buf + cp->mc_chunksize));
	951	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
	952	}
	953
	954	/*
	955	* Fix up the object's address to fulfill the cache's
	956	* alignment requirement (if needed) and return this
	957	* to the caller.
	958	*/
	959	VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
	960	((intptr_t)buf + cp->mc_chunksize));
	961	list = (mcache_obj_t )((intptr_t)base + offset);
	962
	963	(*list)->obj_next = NULL;
	964	list = plist = &(list)->obj_next;
	965
	966	/* If we got them all, return to mcache */
	967	if (--need == 0)
	968	break;
	969	}
	970
	971	return (num - need);
	972	}
	973
	974	/*
	975	* Internal slab deallocator used as a backend for simple caches.
	976	*/
	977	static void
	978	mcache_slab_free(void arg, mcache_obj_t list, __unused boolean_t purged)
	979	{
	980	mcache_t *cp = arg;
	981	mcache_obj_t *nlist;
	982	size_t offset = 0;
	983	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	984	u_int32_t flags = cp->mc_flags;
	985	void *base;
	986	void **pbuf;
	987
	988	/*
	989	* The address of the object is an offset from a 64-bit
	990	* aligned base address only if the cache's alignment
	991	* requirement is neither 1 nor 8 bytes.
	992	*/
	993	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
	994	offset = cp->mc_align;
	995
	996	for (;;) {
	997	nlist = list->obj_next;
	998	list->obj_next = NULL;
	999
	1000	/* Get the 64-bit aligned base address of this object */
	1001	base = (void *)((intptr_t)list - offset);
	1002	VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
	1003
	1004	/* Get the original address since we're about to free it */
	1005	pbuf = (void *)((intptr_t)base - sizeof (void ));
	1006
	1007	if (flags & MCF_DEBUG) {
	1008	VERIFY(((intptr_t)base + rsize) <=
	1009	((intptr_t)*pbuf + cp->mc_chunksize));
	1010	mcache_audit_free_verify(NULL, base, offset, rsize);
	1011	}
	1012
	1013	/* Free it to zone */
	1014	VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
	1015	((intptr_t)*pbuf + cp->mc_chunksize));
	1016	zfree(cp->mc_slab_zone, *pbuf);
	1017
	1018	/* No more objects to free; return to mcache */
	1019	if ((list = nlist) == NULL)
	1020	break;
	1021	}
	1022	}
	1023
	1024	/*
	1025	* Internal slab auditor for simple caches.
	1026	*/
	1027	static void
	1028	mcache_slab_audit(void arg, mcache_obj_t list, boolean_t alloc)
	1029	{
	1030	mcache_t *cp = arg;
	1031	size_t offset = 0;
	1032	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	1033	void base, *pbuf;
	1034
	1035	/*
	1036	* The address of the object returned to the caller is an
	1037	* offset from the 64-bit aligned base address only if the
	1038	* cache's alignment requirement is neither 1 nor 8 bytes.
	1039	*/
	1040	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
	1041	offset = cp->mc_align;
	1042
	1043	while (list != NULL) {
	1044	mcache_obj_t *next = list->obj_next;
	1045
	1046	/* Get the 64-bit aligned base address of this object */
	1047	base = (void *)((intptr_t)list - offset);
	1048	VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
	1049
	1050	/* Get the original address */
	1051	pbuf = (void *)((intptr_t)base - sizeof (void ));
	1052
	1053	VERIFY(((intptr_t)base + rsize) <=
	1054	((intptr_t)*pbuf + cp->mc_chunksize));
	1055
	1056	if (!alloc)
	1057	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
	1058	else
	1059	mcache_audit_free_verify_set(NULL, base, offset, rsize);
	1060
	1061	list = list->obj_next = next;
	1062	}
	1063	}
	1064
	1065	/*
	1066	* Refill the CPU's filled bucket with bkt and save the previous one.
	1067	*/
	1068	static void
	1069	mcache_cpu_refill(mcache_cpu_t ccp, mcache_bkt_t bkt, int objs)
	1070	{
	1071	ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) \|\|
	1072	(ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
	1073	ASSERT(ccp->cc_bktsize > 0);
	1074
	1075	ccp->cc_pfilled = ccp->cc_filled;
	1076	ccp->cc_pobjs = ccp->cc_objs;
	1077	ccp->cc_filled = bkt;
	1078	ccp->cc_objs = objs;
	1079	}
	1080
	1081	/*
	1082	* Allocate a bucket from the bucket layer.
	1083	*/
	1084	static mcache_bkt_t *
	1085	mcache_bkt_alloc(mcache_t cp, mcache_bktlist_t blp, mcache_bkttype_t **btp)
	1086	{
	1087	mcache_bkt_t *bkt;
	1088
	1089	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
	1090	/*
	1091	* The bucket layer lock is held by another CPU; increase
	1092	* the contention count so that we can later resize the
	1093	* bucket size accordingly.
	1094	*/
	1095	MCACHE_LOCK(&cp->mc_bkt_lock);
	1096	cp->mc_bkt_contention++;
	1097	}
	1098
	1099	if ((bkt = blp->bl_list) != NULL) {
	1100	blp->bl_list = bkt->bkt_next;
	1101	if (--blp->bl_total < blp->bl_min)
	1102	blp->bl_min = blp->bl_total;
	1103	blp->bl_alloc++;
	1104	}
	1105
	1106	if (btp != NULL)
	1107	*btp = cp->cache_bkttype;
	1108
	1109	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1110
	1111	return (bkt);
	1112	}
	1113
	1114	/*
	1115	* Free a bucket to the bucket layer.
	1116	*/
	1117	static void
	1118	mcache_bkt_free(mcache_t cp, mcache_bktlist_t blp, mcache_bkt_t *bkt)
	1119	{
	1120	MCACHE_LOCK(&cp->mc_bkt_lock);
	1121
	1122	bkt->bkt_next = blp->bl_list;
	1123	blp->bl_list = bkt;
	1124	blp->bl_total++;
	1125
	1126	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1127	}
	1128
	1129	/*
	1130	* Enable the bucket layer of a cache.
	1131	*/
	1132	static void
	1133	mcache_cache_bkt_enable(mcache_t *cp)
	1134	{
	1135	mcache_cpu_t *ccp;
	1136	int cpu;
	1137
	1138	if (cp->mc_flags & MCF_NOCPUCACHE)
	1139	return;
	1140
	1141	for (cpu = 0; cpu < ncpu; cpu++) {
	1142	ccp = &cp->mc_cpu[cpu];
	1143	MCACHE_LOCK(&ccp->cc_lock);
	1144	ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
	1145	MCACHE_UNLOCK(&ccp->cc_lock);
	1146	}
	1147	}
	1148
	1149	/*
	1150	* Purge all buckets from a cache and disable its bucket layer.
	1151	*/
	1152	static void
	1153	mcache_bkt_purge(mcache_t *cp)
	1154	{
	1155	mcache_cpu_t *ccp;
	1156	mcache_bkt_t bp, pbp;
	1157	mcache_bkttype_t *btp;
	1158	int cpu, objs, pobjs;
	1159
	1160	for (cpu = 0; cpu < ncpu; cpu++) {
	1161	ccp = &cp->mc_cpu[cpu];
	1162
	1163	MCACHE_LOCK(&ccp->cc_lock);
	1164
	1165	btp = cp->cache_bkttype;
	1166	bp = ccp->cc_filled;
	1167	pbp = ccp->cc_pfilled;
	1168	objs = ccp->cc_objs;
	1169	pobjs = ccp->cc_pobjs;
	1170	ccp->cc_filled = NULL;
	1171	ccp->cc_pfilled = NULL;
	1172	ccp->cc_objs = -1;
	1173	ccp->cc_pobjs = -1;
	1174	ccp->cc_bktsize = 0;
	1175
	1176	MCACHE_UNLOCK(&ccp->cc_lock);
	1177
	1178	if (bp != NULL)
	1179	mcache_bkt_destroy(cp, btp, bp, objs);
	1180	if (pbp != NULL)
	1181	mcache_bkt_destroy(cp, btp, pbp, pobjs);
	1182	}
	1183
	1184	/*
	1185	* Updating the working set back to back essentially sets
	1186	* the working set size to zero, so everything is reapable.
	1187	*/
	1188	mcache_bkt_ws_update(cp);
	1189	mcache_bkt_ws_update(cp);
	1190
	1191	mcache_bkt_ws_reap(cp);
	1192	}
	1193
	1194	/*
	1195	* Free one or more objects in the bucket to the slab layer,
	1196	* and also free the bucket itself.
	1197	*/
	1198	static void
	1199	mcache_bkt_destroy(mcache_t cp, mcache_bkttype_t btp, mcache_bkt_t *bkt,
	1200	int nobjs)
	1201	{
	1202	if (nobjs > 0) {
	1203	mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
	1204
	1205	if (cp->mc_flags & MCF_DEBUG) {
	1206	mcache_obj_t *o = top;
	1207	int cnt = 0;
	1208
	1209	/*
	1210	* Verify that the chain of objects in the bucket is
	1211	* valid. Any mismatch here means a mistake when the
	1212	* object(s) were freed to the CPU layer, so we panic.
	1213	*/
	1214	while (o != NULL) {
	1215	o = o->obj_next;
	1216	++cnt;
	1217	}
	1218	if (cnt != nobjs) {
	1219	panic("mcache_bkt_destroy: %s cp %p corrupted "
	1220	"list in bkt %p (nobjs %d actual %d)\n",
	1221	cp->mc_name, (void )cp, (void )bkt,
	1222	nobjs, cnt);
	1223	}
	1224	}
	1225
	1226	/* Advise the slab layer to purge the object(s) */
	1227	(*cp->mc_slab_free)(cp->mc_private, top,
	1228	(cp->mc_flags & MCF_DEBUG) \|\| cp->mc_purge_cnt);
	1229	}
	1230	mcache_free(btp->bt_cache, bkt);
	1231	}
	1232
	1233	/*
	1234	* Update the bucket layer working set statistics.
	1235	*/
	1236	static void
	1237	mcache_bkt_ws_update(mcache_t *cp)
	1238	{
	1239	MCACHE_LOCK(&cp->mc_bkt_lock);
	1240
	1241	cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
	1242	cp->mc_full.bl_min = cp->mc_full.bl_total;
	1243	cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
	1244	cp->mc_empty.bl_min = cp->mc_empty.bl_total;
	1245
	1246	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1247	}
	1248
	1249	/*
	1250	* Reap all buckets that are beyond the working set.
	1251	*/
	1252	static void
	1253	mcache_bkt_ws_reap(mcache_t *cp)
	1254	{
	1255	long reap;
	1256	mcache_bkt_t *bkt;
	1257	mcache_bkttype_t *btp;
	1258
	1259	reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
	1260	while (reap-- &&
	1261	(bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
	1262	mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
	1263
	1264	reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
	1265	while (reap-- &&
	1266	(bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
	1267	mcache_bkt_destroy(cp, btp, bkt, 0);
	1268	}
	1269
	1270	static void
	1271	mcache_reap_timeout(thread_call_param_t dummy __unused,
	1272	thread_call_param_t arg)
	1273	{
	1274	volatile UInt32 *flag = arg;
	1275
	1276	ASSERT(flag == &mcache_reaping);
	1277
	1278	*flag = 0;
	1279	}
	1280
	1281	static void
	1282	mcache_reap_done(void *flag)
	1283	{
	1284	uint64_t deadline, leeway;
	1285
	1286	clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
	1287	&deadline);
	1288	clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
	1289	NSEC_PER_SEC, &leeway);
	1290	thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
	1291	deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
	1292	}
	1293
	1294	static void
	1295	mcache_reap_start(void *arg)
	1296	{
	1297	UInt32 *flag = arg;
	1298
	1299	ASSERT(flag == &mcache_reaping);
	1300
	1301	mcache_applyall(mcache_cache_reap);
	1302	mcache_dispatch(mcache_reap_done, flag);
	1303	}
	1304
	1305	__private_extern__ void
	1306	mcache_reap(void)
	1307	{
	1308	UInt32 *flag = &mcache_reaping;
	1309
	1310	if (mcache_llock_owner == current_thread() \|\|
	1311	!OSCompareAndSwap(0, 1, flag))
	1312	return;
	1313
	1314	mcache_dispatch(mcache_reap_start, flag);
	1315	}
	1316
	1317	static void
	1318	mcache_cache_reap(mcache_t *cp)
	1319	{
	1320	mcache_bkt_ws_reap(cp);
	1321	}
	1322
	1323	/*
	1324	* Performs period maintenance on a cache.
	1325	*/
	1326	static void
	1327	mcache_cache_update(mcache_t *cp)
	1328	{
	1329	int need_bkt_resize = 0;
	1330	int need_bkt_reenable = 0;
	1331
	1332	lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
	1333
	1334	mcache_bkt_ws_update(cp);
	1335
	1336	/*
	1337	* Cache resize and post-purge reenable are mutually exclusive.
	1338	* If the cache was previously purged, there is no point of
	1339	* increasing the bucket size as there was an indication of
	1340	* memory pressure on the system.
	1341	*/
	1342	lck_mtx_lock_spin(&cp->mc_sync_lock);
	1343	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
	1344	need_bkt_reenable = 1;
	1345	lck_mtx_unlock(&cp->mc_sync_lock);
	1346
	1347	MCACHE_LOCK(&cp->mc_bkt_lock);
	1348	/*
	1349	* If the contention count is greater than the threshold, and if
	1350	* we are not already at the maximum bucket size, increase it.
	1351	* Otherwise, if this cache was previously purged by the user
	1352	* then we simply reenable it.
	1353	*/
	1354	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	1355	(int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	1356	mcache_bkt_contention && !need_bkt_reenable)
	1357	need_bkt_resize = 1;
	1358
	1359	cp ->mc_bkt_contention_prev = cp->mc_bkt_contention;
	1360	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1361
	1362	if (need_bkt_resize)
	1363	mcache_dispatch(mcache_cache_bkt_resize, cp);
	1364	else if (need_bkt_reenable)
	1365	mcache_dispatch(mcache_cache_enable, cp);
	1366	}
	1367
	1368	/*
	1369	* Recompute a cache's bucket size. This is an expensive operation
	1370	* and should not be done frequently; larger buckets provide for a
	1371	* higher transfer rate with the bucket while smaller buckets reduce
	1372	* the memory consumption.
	1373	*/
	1374	static void
	1375	mcache_cache_bkt_resize(void *arg)
	1376	{
	1377	mcache_t *cp = arg;
	1378	mcache_bkttype_t *btp = cp->cache_bkttype;
	1379
	1380	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
	1381	mcache_bkt_purge(cp);
	1382
	1383	/*
	1384	* Upgrade to the next bucket type with larger bucket size;
	1385	* temporarily set the previous contention snapshot to a
	1386	* negative number to prevent unnecessary resize request.
	1387	*/
	1388	MCACHE_LOCK(&cp->mc_bkt_lock);
	1389	cp->cache_bkttype = ++btp;
	1390	cp ->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
	1391	MCACHE_UNLOCK(&cp->mc_bkt_lock);
	1392
	1393	mcache_cache_enable(cp);
	1394	}
	1395	}
	1396
	1397	/*
	1398	* Reenable a previously disabled cache due to purge.
	1399	*/
	1400	static void
	1401	mcache_cache_enable(void *arg)
	1402	{
	1403	mcache_t *cp = arg;
	1404
	1405	lck_mtx_lock_spin(&cp->mc_sync_lock);
	1406	cp->mc_purge_cnt = 0;
	1407	cp->mc_enable_cnt = 0;
	1408	lck_mtx_unlock(&cp->mc_sync_lock);
	1409
	1410	mcache_cache_bkt_enable(cp);
	1411	}
	1412
	1413	static void
	1414	mcache_update_timeout(__unused void *arg)
	1415	{
	1416	uint64_t deadline, leeway;
	1417
	1418	clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
	1419	&deadline);
	1420	clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
	1421	NSEC_PER_SEC, &leeway);
	1422	thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
	1423	deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
	1424	}
	1425
	1426	static void
	1427	mcache_update(thread_call_param_t arg __unused,
	1428	thread_call_param_t dummy __unused)
	1429	{
	1430	mcache_applyall(mcache_cache_update);
	1431	mcache_update_timeout(NULL);
	1432	}
	1433
	1434	static void
	1435	mcache_applyall(void (func)(mcache_t ))
	1436	{
	1437	mcache_t *cp;
	1438
	1439	MCACHE_LIST_LOCK();
	1440	LIST_FOREACH(cp, &mcache_head, mc_list) {
	1441	func(cp);
	1442	}
	1443	MCACHE_LIST_UNLOCK();
	1444	}
	1445
	1446	static void
	1447	mcache_dispatch(void (func)(void ), void *arg)
	1448	{
	1449	ASSERT(func != NULL);
	1450	timeout(func, arg, hz/1000);
	1451	}
	1452
	1453	__private_extern__ void
	1454	mcache_buffer_log(mcache_audit_t mca, void addr, mcache_t *cp,
	1455	struct timeval *base_ts)
	1456	{
	1457	struct timeval now, base = { 0, 0 };
	1458	void *stack[MCACHE_STACK_DEPTH + 1];
	1459	struct mca_trn *transaction;
	1460
	1461	transaction = &mca->mca_trns[mca->mca_next_trn];
	1462
	1463	mca->mca_addr = addr;
	1464	mca->mca_cache = cp;
	1465
	1466	transaction->mca_thread = current_thread();
	1467
	1468	bzero(stack, sizeof (stack));
	1469	transaction->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	1470	bcopy(&stack[1], transaction->mca_stack,
	1471	sizeof (transaction->mca_stack));
	1472
	1473	microuptime(&now);
	1474	if (base_ts != NULL)
	1475	base = *base_ts;
	1476	/* tstamp is in ms relative to base_ts */
	1477	transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
	1478	if ((now.tv_sec - base.tv_sec) > 0)
	1479	transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
	1480
	1481	mca->mca_next_trn =
	1482	(mca->mca_next_trn + 1) % mca_trn_max;
	1483	}
	1484
	1485	__private_extern__ void
	1486	mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
	1487	{
	1488	u_int64_t buf_end = (u_int64_t )((void )((char )buf_arg + size));
	1489	u_int64_t buf = (u_int64_t )buf_arg;
	1490
	1491	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	1492	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
	1493
	1494	while (buf < buf_end)
	1495	*buf++ = pattern;
	1496	}
	1497
	1498	__private_extern__ void *
	1499	mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
	1500	{
	1501	u_int64_t buf_end = (u_int64_t )((void )((char )buf_arg + size));
	1502	u_int64_t *buf;
	1503
	1504	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	1505	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
	1506
	1507	for (buf = buf_arg; buf < buf_end; buf++) {
	1508	if (*buf != pattern)
	1509	return (buf);
	1510	}
	1511	return (NULL);
	1512	}
	1513
	1514	__private_extern__ void *
	1515	mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
	1516	size_t size)
	1517	{
	1518	u_int64_t buf_end = (u_int64_t )((void )((char )buf_arg + size));
	1519	u_int64_t *buf;
	1520
	1521	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	1522	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
	1523
	1524	for (buf = buf_arg; buf < buf_end; buf++) {
	1525	if (*buf != old) {
	1526	mcache_set_pattern(old, buf_arg,
	1527	(uintptr_t)buf - (uintptr_t)buf_arg);
	1528	return (buf);
	1529	}
	1530	*buf = new;
	1531	}
	1532	return (NULL);
	1533	}
	1534
	1535	__private_extern__ void
	1536	mcache_audit_free_verify(mcache_audit_t mca, void base, size_t offset,
	1537	size_t size)
	1538	{
	1539	void *addr;
	1540	u_int64_t *oaddr64;
	1541	mcache_obj_t *next;
	1542
	1543	addr = (void *)((uintptr_t)base + offset);
	1544	next = ((mcache_obj_t *)addr)->obj_next;
	1545
	1546	/* For the "obj_next" pointer in the buffer */
	1547	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	1548	*oaddr64 = MCACHE_FREE_PATTERN;
	1549
	1550	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	1551	(caddr_t)base, size)) != NULL) {
	1552	mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
	1553	(int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
	1554	/* NOTREACHED */
	1555	}
	1556	((mcache_obj_t *)addr)->obj_next = next;
	1557	}
	1558
	1559	__private_extern__ void
	1560	mcache_audit_free_verify_set(mcache_audit_t mca, void base, size_t offset,
	1561	size_t size)
	1562	{
	1563	void *addr;
	1564	u_int64_t *oaddr64;
	1565	mcache_obj_t *next;
	1566
	1567	addr = (void *)((uintptr_t)base + offset);
	1568	next = ((mcache_obj_t *)addr)->obj_next;
	1569
	1570	/* For the "obj_next" pointer in the buffer */
	1571	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	1572	*oaddr64 = MCACHE_FREE_PATTERN;
	1573
	1574	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	1575	MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
	1576	mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
	1577	(int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
	1578	/* NOTREACHED */
	1579	}
	1580	((mcache_obj_t *)addr)->obj_next = next;
	1581	}
	1582
	#undef panic

	/*
	 * Helpers for mcache_dump_mca().  They expect a variable named "mca"
	 * (the audit structure being dumped) to be in scope at the point of use.
	 */

	/* Format for one transaction: label, thread, depth and 16 saved PCs */
	#define	DUMP_TRN_FMT()						\
		"%s transaction thread %p saved PC stack (%d deep):\n"	\
		"\t%p, %p, %p, %p, %p, %p, %p, %p\n"			\
		"\t%p, %p, %p, %p, %p, %p, %p, %p\n"

	/* Saved PC number i of transaction slot x */
	#define	DUMP_TRN_PC(x, i)	mca->mca_trns[x].mca_stack[i]

	/* Arguments matching DUMP_TRN_FMT() for transaction slot x, labeled s */
	#define	DUMP_TRN_FIELDS(s, x)					\
		s,							\
		mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
		DUMP_TRN_PC(x, 0), DUMP_TRN_PC(x, 1),			\
		DUMP_TRN_PC(x, 2), DUMP_TRN_PC(x, 3),			\
		DUMP_TRN_PC(x, 4), DUMP_TRN_PC(x, 5),			\
		DUMP_TRN_PC(x, 6), DUMP_TRN_PC(x, 7),			\
		DUMP_TRN_PC(x, 8), DUMP_TRN_PC(x, 9),			\
		DUMP_TRN_PC(x, 10), DUMP_TRN_PC(x, 11),			\
		DUMP_TRN_PC(x, 12), DUMP_TRN_PC(x, 13),			\
		DUMP_TRN_PC(x, 14), DUMP_TRN_PC(x, 15)

	/*
	 * Transaction slots reported as "last" and "previous".
	 *
	 * NOTE(review): MCA_TRN_LAST evaluates to mca_next_trn itself, i.e. the
	 * slot that the next log call will overwrite, while MCA_TRN_PREV is the
	 * slot before it -- confirm the labels match the intended ordering.
	 */
	#define	MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
	#define	MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
	1604
	1605	__private_extern__ char *
	1606	mcache_dump_mca(mcache_audit_t *mca)
	1607	{
	1608	if (mca_dump_buf == NULL)
	1609	return (NULL);
	1610
	1611	snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
	1612	"mca %p: addr %p, cache %p (%s) nxttrn %d\n"
	1613	DUMP_TRN_FMT()
	1614	DUMP_TRN_FMT(),
	1615
	1616	mca, mca->mca_addr, mca->mca_cache,
	1617	mca->mca_cache ? mca->mca_cache->mc_name : "?",
	1618	mca->mca_next_trn,
	1619
	1620	DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
	1621	DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
	1622
	1623	return (mca_dump_buf);
	1624	}
	1625
	1626	__private_extern__ void
	1627	mcache_audit_panic(mcache_audit_t mca, void addr, size_t offset,
	1628	int64_t expected, int64_t got)
	1629	{
	1630	if (mca == NULL) {
	1631	panic("mcache_audit: buffer %p modified after free at "
	1632	"offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
	1633	offset, got, expected);
	1634	/* NOTREACHED */
	1635	}
	1636
	1637	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	1638	"(0x%llx instead of 0x%llx)\n%s\n",
	1639	addr, offset, got, expected, mcache_dump_mca(mca));
	1640	/* NOTREACHED */
	1641	}
	1642
	1643	__private_extern__ int
	1644	assfail(const char a, const char f, int l)
	1645	{
	1646	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	1647	return (0);
	1648	}