]> git.saurik.com Git - apple/xnu.git/blame_incremental - bsd/kern/mcache.c
xnu-3247.1.106.tar.gz
[apple/xnu.git] / bsd / kern / mcache.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2006-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides for better flexibility where the user is allowed to define
40 * its own slab allocator (instead of the default zone allocator). Finally,
41 * no object construction/destruction takes place at the moment, although
42 * this could be added in future to improve efficiency.
43 */
44
45#include <sys/param.h>
46#include <sys/types.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/queue.h>
50#include <sys/kernel.h>
51#include <sys/systm.h>
52
53#include <kern/debug.h>
54#include <kern/zalloc.h>
55#include <kern/cpu_number.h>
56#include <kern/locks.h>
57#include <kern/thread_call.h>
58
59#include <libkern/libkern.h>
60#include <libkern/OSAtomic.h>
61#include <libkern/OSDebug.h>
62
63#include <mach/vm_param.h>
64#include <machine/limits.h>
65#include <machine/machine_routines.h>
66
67#include <string.h>
68
69#include <sys/mcache.h>
70
/*
 * Byte offset of the n'th per-CPU slot within an mcache_t.  Evaluated with
 * n == ncpu it yields the size of a descriptor carrying one slot per CPU.
 */
#define	MCACHE_SIZE(n) \
	((size_t)(&((mcache_t *)0)->mc_cpu[n]))

/* Allocate extra in case we need to manually align the pointer */
#define	MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

/*
 * Per-CPU slot of cache (c) for the CPU we are currently running on.
 * The whole expansion is parenthesized so that the macro can be used as
 * an operand (e.g. MCACHE_CPU(cp)->cc_lock) without the cast binding to
 * the wrong sub-expression.
 */
#define	MCACHE_CPU(c) \
	((mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number()))))

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 *
 * Both are wrapped in do { } while (0) so that each behaves as a single
 * statement and remains safe inside an unbraced if/else.
 */
#define	MCACHE_LIST_LOCK() do {						\
	lck_mtx_lock(mcache_llock);					\
	mcache_llock_owner = current_thread();				\
} while (0)

#define	MCACHE_LIST_UNLOCK() do {					\
	mcache_llock_owner = NULL;					\
	lck_mtx_unlock(mcache_llock);					\
} while (0)

/* Thin wrappers over the mutex primitives used for per-cache locks. */
#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)
101
102static int ncpu;
103static unsigned int cache_line_size;
104static lck_mtx_t *mcache_llock;
105static struct thread *mcache_llock_owner;
106static lck_attr_t *mcache_llock_attr;
107static lck_grp_t *mcache_llock_grp;
108static lck_grp_attr_t *mcache_llock_grp_attr;
109static struct zone *mcache_zone;
110static const uint32_t mcache_reap_interval = 15;
111static const uint32_t mcache_reap_interval_leeway = 2;
112static UInt32 mcache_reaping;
113static int mcache_ready;
114static int mcache_updating;
115
116static int mcache_bkt_contention = 3;
117#if DEBUG
118static unsigned int mcache_flags = MCF_DEBUG;
119#else
120static unsigned int mcache_flags = 0;
121#endif
122
123int mca_trn_max = MCA_TRN_MAX;
124
125#define DUMP_MCA_BUF_SIZE 512
126static char *mca_dump_buf;
127
128static mcache_bkttype_t mcache_bkttype[] = {
129 { 1, 4096, 32768, NULL },
130 { 3, 2048, 16384, NULL },
131 { 7, 1024, 12288, NULL },
132 { 15, 256, 8192, NULL },
133 { 31, 64, 4096, NULL },
134 { 47, 0, 2048, NULL },
135 { 63, 0, 1024, NULL },
136 { 95, 0, 512, NULL },
137 { 143, 0, 256, NULL },
138 { 165, 0, 0, NULL },
139};
140
141static mcache_t *mcache_create_common(const char *, size_t, size_t,
142 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
143 mcache_notifyfn_t, void *, u_int32_t, int, int);
144static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
145 unsigned int, int);
146static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
147static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
148static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
149static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
150 mcache_bkttype_t **);
151static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
152static void mcache_cache_bkt_enable(mcache_t *);
153static void mcache_bkt_purge(mcache_t *);
154static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
155 mcache_bkt_t *, int);
156static void mcache_bkt_ws_update(mcache_t *);
157static void mcache_bkt_ws_reap(mcache_t *);
158static void mcache_dispatch(void (*)(void *), void *);
159static void mcache_cache_reap(mcache_t *);
160static void mcache_cache_update(mcache_t *);
161static void mcache_cache_bkt_resize(void *);
162static void mcache_cache_enable(void *);
163static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
164static void mcache_update_timeout(void *);
165static void mcache_applyall(void (*)(mcache_t *));
166static void mcache_reap_start(void *);
167static void mcache_reap_done(void *);
168static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
169static void mcache_notify(mcache_t *, u_int32_t);
170static void mcache_purge(void *);
171
172static LIST_HEAD(, mcache) mcache_head;
173mcache_t *mcache_audit_cache;
174
175static thread_call_t mcache_reap_tcall;
176static thread_call_t mcache_update_tcall;
177
178/*
179 * Initialize the framework; this is currently called as part of BSD init.
180 */
181__private_extern__ void
182mcache_init(void)
183{
184 mcache_bkttype_t *btp;
185 unsigned int i;
186 char name[32];
187
188 VERIFY(mca_trn_max >= 2);
189
190 ncpu = ml_get_max_cpus();
191 (void) mcache_cache_line_size(); /* prime it */
192
193 mcache_llock_grp_attr = lck_grp_attr_alloc_init();
194 mcache_llock_grp = lck_grp_alloc_init("mcache.list",
195 mcache_llock_grp_attr);
196 mcache_llock_attr = lck_attr_alloc_init();
197 mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
198
199 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
200 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
201 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL)
202 panic("mcache_init: thread_call_allocate failed");
203
204 mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
205 PAGE_SIZE, "mcache");
206 if (mcache_zone == NULL)
207 panic("mcache_init: failed to allocate mcache zone\n");
208 zone_change(mcache_zone, Z_CALLERACCT, FALSE);
209
210 LIST_INIT(&mcache_head);
211
212 for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
213 btp = &mcache_bkttype[i];
214 (void) snprintf(name, sizeof (name), "bkt_%d",
215 btp->bt_bktsize);
216 btp->bt_cache = mcache_create(name,
217 (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
218 }
219
220 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
221 mcache_flags &= MCF_FLAGS_MASK;
222
223 mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
224 0, 0, MCR_SLEEP);
225
226 mcache_applyall(mcache_cache_bkt_enable);
227 mcache_ready = 1;
228
229 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
230 ncpu, CPU_CACHE_LINE_SIZE);
231}
232
233/*
234 * Return the global mcache flags.
235 */
236__private_extern__ unsigned int
237mcache_getflags(void)
238{
239 return (mcache_flags);
240}
241
242/*
243 * Return the CPU cache line size.
244 */
245__private_extern__ unsigned int
246mcache_cache_line_size(void)
247{
248 if (cache_line_size == 0) {
249 ml_cpu_info_t cpu_info;
250 ml_cpu_get_info(&cpu_info);
251 cache_line_size = cpu_info.cache_line_size;
252 }
253 return (cache_line_size);
254}
255
256/*
257 * Create a cache using the zone allocator as the backend slab allocator.
258 * The caller may specify any alignment for the object; if it specifies 0
259 * the default alignment (MCACHE_ALIGN) will be used.
260 */
261__private_extern__ mcache_t *
262mcache_create(const char *name, size_t bufsize, size_t align,
263 u_int32_t flags, int wait)
264{
265 return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
266 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
267 wait));
268}
269
270/*
271 * Create a cache using a custom backend slab allocator. Since the caller
272 * is responsible for allocation, no alignment guarantee will be provided
273 * by this framework.
274 */
275__private_extern__ mcache_t *
276mcache_create_ext(const char *name, size_t bufsize,
277 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
278 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
279 u_int32_t flags, int wait)
280{
281 return (mcache_create_common(name, bufsize, 0, allocfn,
282 freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
283}
284
285/*
286 * Common cache creation routine.
287 */
288static mcache_t *
289mcache_create_common(const char *name, size_t bufsize, size_t align,
290 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
291 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
292 u_int32_t flags, int need_zone, int wait)
293{
294 mcache_bkttype_t *btp;
295 mcache_t *cp = NULL;
296 size_t chunksize;
297 void *buf, **pbuf;
298 int c;
299 char lck_name[64];
300
301 /* If auditing is on and print buffer is NULL, allocate it now */
302 if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
303 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
304 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
305 malloc_wait | M_ZERO);
306 if (mca_dump_buf == NULL)
307 return (NULL);
308 }
309
310 if (!(wait & MCR_NOSLEEP))
311 buf = zalloc(mcache_zone);
312 else
313 buf = zalloc_noblock(mcache_zone);
314
315 if (buf == NULL)
316 goto fail;
317
318 bzero(buf, MCACHE_ALLOC_SIZE);
319
320 /*
321 * In case we didn't get a cache-aligned memory, round it up
322 * accordingly. This is needed in order to get the rest of
323 * structure members aligned properly. It also means that
324 * the memory span gets shifted due to the round up, but it
325 * is okay since we've allocated extra space for this.
326 */
327 cp = (mcache_t *)
328 P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_LINE_SIZE);
329 pbuf = (void **)((intptr_t)cp - sizeof (void *));
330 *pbuf = buf;
331
332 /*
333 * Guaranteed alignment is valid only when we use the internal
334 * slab allocator (currently set to use the zone allocator).
335 */
336 if (!need_zone)
337 align = 1;
338 else if (align == 0)
339 align = MCACHE_ALIGN;
340
341 if ((align & (align - 1)) != 0)
342 panic("mcache_create: bad alignment %lu", align);
343
344 cp->mc_align = align;
345 cp->mc_slab_alloc = allocfn;
346 cp->mc_slab_free = freefn;
347 cp->mc_slab_audit = auditfn;
348 cp->mc_slab_log = logfn;
349 cp->mc_slab_notify = notifyfn;
350 cp->mc_private = need_zone ? cp : arg;
351 cp->mc_bufsize = bufsize;
352 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
353
354 (void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);
355
356 (void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
357 cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
358 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
359 cp->mc_cpu_lock_grp_attr);
360 cp->mc_cpu_lock_attr = lck_attr_alloc_init();
361
362 /*
363 * Allocation chunk size is the object's size plus any extra size
364 * needed to satisfy the object's alignment. It is enforced to be
365 * at least the size of an LP64 pointer to simplify auditing and to
366 * handle multiple-element allocation requests, where the elements
367 * returned are linked together in a list.
368 */
369 chunksize = MAX(bufsize, sizeof (u_int64_t));
370 if (need_zone) {
371 /* Enforce 64-bit minimum alignment for zone-based buffers */
372 align = MAX(align, sizeof (u_int64_t));
373 chunksize += sizeof (void *) + align;
374 chunksize = P2ROUNDUP(chunksize, align);
375 if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
376 PAGE_SIZE, cp->mc_name)) == NULL)
377 goto fail;
378 zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
379 }
380 cp->mc_chunksize = chunksize;
381
382 /*
383 * Initialize the bucket layer.
384 */
385 (void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
386 cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
387 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
388 cp->mc_bkt_lock_grp_attr);
389 cp->mc_bkt_lock_attr = lck_attr_alloc_init();
390 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
391 cp->mc_bkt_lock_attr);
392
393 (void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
394 cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
395 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
396 cp->mc_sync_lock_grp_attr);
397 cp->mc_sync_lock_attr = lck_attr_alloc_init();
398 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
399 cp->mc_sync_lock_attr);
400
401 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
402 continue;
403
404 cp->cache_bkttype = btp;
405
406 /*
407 * Initialize the CPU layer. Each per-CPU structure is aligned
408 * on the CPU cache line boundary to prevent false sharing.
409 */
410 for (c = 0; c < ncpu; c++) {
411 mcache_cpu_t *ccp = &cp->mc_cpu[c];
412
413 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
414 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
415 cp->mc_cpu_lock_attr);
416 ccp->cc_objs = -1;
417 ccp->cc_pobjs = -1;
418 }
419
420 if (mcache_ready)
421 mcache_cache_bkt_enable(cp);
422
423 /* TODO: dynamically create sysctl for stats */
424
425 MCACHE_LIST_LOCK();
426 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
427 MCACHE_LIST_UNLOCK();
428
429 /*
430 * If cache buckets are enabled and this is the first cache
431 * created, start the periodic cache update.
432 */
433 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
434 mcache_updating = 1;
435 mcache_update_timeout(NULL);
436 }
437 if (cp->mc_flags & MCF_DEBUG) {
438 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
439 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
440 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
441 }
442 return (cp);
443
444fail:
445 if (buf != NULL)
446 zfree(mcache_zone, buf);
447 return (NULL);
448}
449
450/*
451 * Allocate one or more objects from a cache.
452 */
453__private_extern__ unsigned int
454mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
455{
456 mcache_cpu_t *ccp;
457 mcache_obj_t **top = &(*list);
458 mcache_bkt_t *bkt;
459 unsigned int need = num;
460 boolean_t nwretry = FALSE;
461
462 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
463 VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));
464
465 ASSERT(list != NULL);
466 *list = NULL;
467
468 if (num == 0)
469 return (0);
470
471retry_alloc:
472 /* We may not always be running in the same CPU in case of retries */
473 ccp = MCACHE_CPU(cp);
474
475 MCACHE_LOCK(&ccp->cc_lock);
476 for (;;) {
477 /*
478 * If we have an object in the current CPU's filled bucket,
479 * chain the object to any previous objects and return if
480 * we've satisfied the number of requested objects.
481 */
482 if (ccp->cc_objs > 0) {
483 mcache_obj_t *tail;
484 int objs;
485
486 /*
487 * Objects in the bucket are already linked together
488 * with the most recently freed object at the head of
489 * the list; grab as many objects as we can.
490 */
491 objs = MIN((unsigned int)ccp->cc_objs, need);
492 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
493 ccp->cc_objs -= objs;
494 ccp->cc_alloc += objs;
495
496 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
497 list = &tail->obj_next;
498 *list = NULL;
499
500 /* If we got them all, return to caller */
501 if ((need -= objs) == 0) {
502 MCACHE_UNLOCK(&ccp->cc_lock);
503
504 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
505 cp->mc_slab_log != NULL)
506 (*cp->mc_slab_log)(num, *top, TRUE);
507
508 if (cp->mc_flags & MCF_DEBUG)
509 goto debug_alloc;
510
511 return (num);
512 }
513 }
514
515 /*
516 * The CPU's filled bucket is empty. If the previous filled
517 * bucket was full, exchange and try again.
518 */
519 if (ccp->cc_pobjs > 0) {
520 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
521 continue;
522 }
523
524 /*
525 * If the bucket layer is disabled, allocate from slab. This
526 * can happen either because MCF_NOCPUCACHE is set, or because
527 * the bucket layer is currently being resized.
528 */
529 if (ccp->cc_bktsize == 0)
530 break;
531
532 /*
533 * Both of the CPU's buckets are empty; try to get a full
534 * bucket from the bucket layer. Upon success, refill this
535 * CPU and place any empty bucket into the empty list.
536 */
537 bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
538 if (bkt != NULL) {
539 if (ccp->cc_pfilled != NULL)
540 mcache_bkt_free(cp, &cp->mc_empty,
541 ccp->cc_pfilled);
542 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
543 continue;
544 }
545
546 /*
547 * The bucket layer has no full buckets; allocate the
548 * object(s) directly from the slab layer.
549 */
550 break;
551 }
552 MCACHE_UNLOCK(&ccp->cc_lock);
553
554 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
555
556 /*
557 * If this is a blocking allocation, or if it is non-blocking and
558 * the cache's full bucket is non-empty, then retry the allocation.
559 */
560 if (need > 0) {
561 if (!(wait & MCR_NONBLOCKING)) {
562 atomic_add_32(&cp->mc_wretry_cnt, 1);
563 goto retry_alloc;
564 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
565 !mcache_bkt_isempty(cp)) {
566 if (!nwretry)
567 nwretry = TRUE;
568 atomic_add_32(&cp->mc_nwretry_cnt, 1);
569 goto retry_alloc;
570 } else if (nwretry) {
571 atomic_add_32(&cp->mc_nwfail_cnt, 1);
572 }
573 }
574
575 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
576 (*cp->mc_slab_log)((num - need), *top, TRUE);
577
578 if (!(cp->mc_flags & MCF_DEBUG))
579 return (num - need);
580
581debug_alloc:
582 if (cp->mc_flags & MCF_DEBUG) {
583 mcache_obj_t **o = top;
584 unsigned int n;
585
586 n = 0;
587 /*
588 * Verify that the chain of objects have the same count as
589 * what we are about to report to the caller. Any mismatch
590 * here means that the object list is insanely broken and
591 * therefore we must panic.
592 */
593 while (*o != NULL) {
594 o = &(*o)->obj_next;
595 ++n;
596 }
597 if (n != (num - need)) {
598 panic("mcache_alloc_ext: %s cp %p corrupted list "
599 "(got %d actual %d)\n", cp->mc_name,
600 (void *)cp, num - need, n);
601 }
602 }
603
604 /* Invoke the slab layer audit callback if auditing is enabled */
605 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
606 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
607
608 return (num - need);
609}
610
611/*
612 * Allocate a single object from a cache.
613 */
614__private_extern__ void *
615mcache_alloc(mcache_t *cp, int wait)
616{
617 mcache_obj_t *buf;
618
619 (void) mcache_alloc_ext(cp, &buf, 1, wait);
620 return (buf);
621}
622
623__private_extern__ void
624mcache_waiter_inc(mcache_t *cp)
625{
626 atomic_add_32(&cp->mc_waiter_cnt, 1);
627}
628
629__private_extern__ void
630mcache_waiter_dec(mcache_t *cp)
631{
632 atomic_add_32(&cp->mc_waiter_cnt, -1);
633}
634
635__private_extern__ boolean_t
636mcache_bkt_isempty(mcache_t *cp)
637{
638 /*
639 * This isn't meant to accurately tell whether there are
640 * any full buckets in the cache; it is simply a way to
641 * obtain "hints" about the state of the cache.
642 */
643 return (cp->mc_full.bl_total == 0);
644}
645
646/*
647 * Notify the slab layer about an event.
648 */
649static void
650mcache_notify(mcache_t *cp, u_int32_t event)
651{
652 if (cp->mc_slab_notify != NULL)
653 (*cp->mc_slab_notify)(cp->mc_private, event);
654}
655
656/*
657 * Purge the cache and disable its buckets.
658 */
659static void
660mcache_purge(void *arg)
661{
662 mcache_t *cp = arg;
663
664 mcache_bkt_purge(cp);
665 /*
666 * We cannot simply call mcache_cache_bkt_enable() from here as
667 * a bucket resize may be in flight and we would cause the CPU
668 * layers of the cache to point to different sizes. Therefore,
669 * we simply increment the enable count so that during the next
670 * periodic cache update the buckets can be reenabled.
671 */
672 lck_mtx_lock_spin(&cp->mc_sync_lock);
673 cp->mc_enable_cnt++;
674 lck_mtx_unlock(&cp->mc_sync_lock);
675}
676
677__private_extern__ boolean_t
678mcache_purge_cache(mcache_t *cp, boolean_t async)
679{
680 /*
681 * Purging a cache that has no per-CPU caches or is already
682 * in the process of being purged is rather pointless.
683 */
684 if (cp->mc_flags & MCF_NOCPUCACHE)
685 return (FALSE);
686
687 lck_mtx_lock_spin(&cp->mc_sync_lock);
688 if (cp->mc_purge_cnt > 0) {
689 lck_mtx_unlock(&cp->mc_sync_lock);
690 return (FALSE);
691 }
692 cp->mc_purge_cnt++;
693 lck_mtx_unlock(&cp->mc_sync_lock);
694
695 if (async)
696 mcache_dispatch(mcache_purge, cp);
697 else
698 mcache_purge(cp);
699
700 return (TRUE);
701}
702
703/*
704 * Free a single object to a cache.
705 */
706__private_extern__ void
707mcache_free(mcache_t *cp, void *buf)
708{
709 ((mcache_obj_t *)buf)->obj_next = NULL;
710 mcache_free_ext(cp, (mcache_obj_t *)buf);
711}
712
713/*
714 * Free one or more objects to a cache.
715 */
716__private_extern__ void
717mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
718{
719 mcache_cpu_t *ccp = MCACHE_CPU(cp);
720 mcache_bkttype_t *btp;
721 mcache_obj_t *nlist;
722 mcache_bkt_t *bkt;
723
724 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
725 (*cp->mc_slab_log)(0, list, FALSE);
726
727 /* Invoke the slab layer audit callback if auditing is enabled */
728 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
729 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
730
731 MCACHE_LOCK(&ccp->cc_lock);
732 for (;;) {
733 /*
734 * If there is space in the current CPU's filled bucket, put
735 * the object there and return once all objects are freed.
736 * Note the cast to unsigned integer takes care of the case
737 * where the bucket layer is disabled (when cc_objs is -1).
738 */
739 if ((unsigned int)ccp->cc_objs <
740 (unsigned int)ccp->cc_bktsize) {
741 /*
742 * Reverse the list while we place the object into the
743 * bucket; this effectively causes the most recently
744 * freed object(s) to be reused during allocation.
745 */
746 nlist = list->obj_next;
747 list->obj_next = (ccp->cc_objs == 0) ? NULL :
748 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
749 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
750 ccp->cc_free++;
751
752 if ((list = nlist) != NULL)
753 continue;
754
755 /* We are done; return to caller */
756 MCACHE_UNLOCK(&ccp->cc_lock);
757
758 /* If there is a waiter below, notify it */
759 if (cp->mc_waiter_cnt > 0)
760 mcache_notify(cp, MCN_RETRYALLOC);
761 return;
762 }
763
764 /*
765 * The CPU's filled bucket is full. If the previous filled
766 * bucket was empty, exchange and try again.
767 */
768 if (ccp->cc_pobjs == 0) {
769 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
770 continue;
771 }
772
773 /*
774 * If the bucket layer is disabled, free to slab. This can
775 * happen either because MCF_NOCPUCACHE is set, or because
776 * the bucket layer is currently being resized.
777 */
778 if (ccp->cc_bktsize == 0)
779 break;
780
781 /*
782 * Both of the CPU's buckets are full; try to get an empty
783 * bucket from the bucket layer. Upon success, empty this
784 * CPU and place any full bucket into the full list.
785 */
786 bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
787 if (bkt != NULL) {
788 if (ccp->cc_pfilled != NULL)
789 mcache_bkt_free(cp, &cp->mc_full,
790 ccp->cc_pfilled);
791 mcache_cpu_refill(ccp, bkt, 0);
792 continue;
793 }
794
795 /*
796 * We need an empty bucket to put our freed objects into
797 * but couldn't get an empty bucket from the bucket layer;
798 * attempt to allocate one. We do not want to block for
799 * allocation here, and if the bucket allocation fails
800 * we will simply fall through to the slab layer.
801 */
802 MCACHE_UNLOCK(&ccp->cc_lock);
803 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
804 MCACHE_LOCK(&ccp->cc_lock);
805
806 if (bkt != NULL) {
807 /*
808 * We have an empty bucket, but since we drop the
809 * CPU lock above, the cache's bucket size may have
810 * changed. If so, free the bucket and try again.
811 */
812 if (ccp->cc_bktsize != btp->bt_bktsize) {
813 MCACHE_UNLOCK(&ccp->cc_lock);
814 mcache_free(btp->bt_cache, bkt);
815 MCACHE_LOCK(&ccp->cc_lock);
816 continue;
817 }
818
819 /*
820 * We have an empty bucket of the right size;
821 * add it to the bucket layer and try again.
822 */
823 mcache_bkt_free(cp, &cp->mc_empty, bkt);
824 continue;
825 }
826
827 /*
828 * The bucket layer has no empty buckets; free the
829 * object(s) directly to the slab layer.
830 */
831 break;
832 }
833 MCACHE_UNLOCK(&ccp->cc_lock);
834
835 /* If there is a waiter below, notify it */
836 if (cp->mc_waiter_cnt > 0)
837 mcache_notify(cp, MCN_RETRYALLOC);
838
839 /* Advise the slab layer to purge the object(s) */
840 (*cp->mc_slab_free)(cp->mc_private, list,
841 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
842}
843
844/*
845 * Cache destruction routine.
846 */
847__private_extern__ void
848mcache_destroy(mcache_t *cp)
849{
850 void **pbuf;
851
852 MCACHE_LIST_LOCK();
853 LIST_REMOVE(cp, mc_list);
854 MCACHE_LIST_UNLOCK();
855
856 mcache_bkt_purge(cp);
857
858 /*
859 * This cache is dead; there should be no further transaction.
860 * If it's still invoked, make sure that it induces a fault.
861 */
862 cp->mc_slab_alloc = NULL;
863 cp->mc_slab_free = NULL;
864 cp->mc_slab_audit = NULL;
865
866 lck_attr_free(cp->mc_bkt_lock_attr);
867 lck_grp_free(cp->mc_bkt_lock_grp);
868 lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
869
870 lck_attr_free(cp->mc_cpu_lock_attr);
871 lck_grp_free(cp->mc_cpu_lock_grp);
872 lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
873
874 lck_attr_free(cp->mc_sync_lock_attr);
875 lck_grp_free(cp->mc_sync_lock_grp);
876 lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
877
878 /*
879 * TODO: We need to destroy the zone here, but cannot do it
880 * because there is no such way to achieve that. Until then
881 * the memory allocated for the zone structure is leaked.
882 * Once it is achievable, uncomment these lines:
883 *
884 * if (cp->mc_slab_zone != NULL) {
885 * zdestroy(cp->mc_slab_zone);
886 * cp->mc_slab_zone = NULL;
887 * }
888 */
889
890 /* Get the original address since we're about to free it */
891 pbuf = (void **)((intptr_t)cp - sizeof (void *));
892
893 zfree(mcache_zone, *pbuf);
894}
895
896/*
897 * Internal slab allocator used as a backend for simple caches. The current
898 * implementation uses the zone allocator for simplicity reasons.
899 */
900static unsigned int
901mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
902{
903 mcache_t *cp = arg;
904 unsigned int need = num;
905 size_t offset = 0;
906 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
907 u_int32_t flags = cp->mc_flags;
908 void *buf, *base, **pbuf;
909 mcache_obj_t **list = *plist;
910
911 *list = NULL;
912
913 /*
914 * The address of the object returned to the caller is an
915 * offset from the 64-bit aligned base address only if the
916 * cache's alignment requirement is neither 1 nor 8 bytes.
917 */
918 if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
919 offset = cp->mc_align;
920
921 for (;;) {
922 if (!(wait & MCR_NOSLEEP))
923 buf = zalloc(cp->mc_slab_zone);
924 else
925 buf = zalloc_noblock(cp->mc_slab_zone);
926
927 if (buf == NULL)
928 break;
929
930 /* Get the 64-bit aligned base address for this object */
931 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
932 sizeof (u_int64_t));
933
934 /*
935 * Wind back a pointer size from the aligned base and
936 * save the original address so we can free it later.
937 */
938 pbuf = (void **)((intptr_t)base - sizeof (void *));
939 *pbuf = buf;
940
941 /*
942 * If auditing is enabled, patternize the contents of
943 * the buffer starting from the 64-bit aligned base to
944 * the end of the buffer; the length is rounded up to
945 * the nearest 64-bit multiply; this is because we use
946 * 64-bit memory access to set/check the pattern.
947 */
948 if (flags & MCF_DEBUG) {
949 VERIFY(((intptr_t)base + rsize) <=
950 ((intptr_t)buf + cp->mc_chunksize));
951 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
952 }
953
954 /*
955 * Fix up the object's address to fulfill the cache's
956 * alignment requirement (if needed) and return this
957 * to the caller.
958 */
959 VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
960 ((intptr_t)buf + cp->mc_chunksize));
961 *list = (mcache_obj_t *)((intptr_t)base + offset);
962
963 (*list)->obj_next = NULL;
964 list = *plist = &(*list)->obj_next;
965
966 /* If we got them all, return to mcache */
967 if (--need == 0)
968 break;
969 }
970
971 return (num - need);
972}
973
974/*
975 * Internal slab deallocator used as a backend for simple caches.
976 */
977static void
978mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
979{
980 mcache_t *cp = arg;
981 mcache_obj_t *nlist;
982 size_t offset = 0;
983 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
984 u_int32_t flags = cp->mc_flags;
985 void *base;
986 void **pbuf;
987
988 /*
989 * The address of the object is an offset from a 64-bit
990 * aligned base address only if the cache's alignment
991 * requirement is neither 1 nor 8 bytes.
992 */
993 if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
994 offset = cp->mc_align;
995
996 for (;;) {
997 nlist = list->obj_next;
998 list->obj_next = NULL;
999
1000 /* Get the 64-bit aligned base address of this object */
1001 base = (void *)((intptr_t)list - offset);
1002 VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1003
1004 /* Get the original address since we're about to free it */
1005 pbuf = (void **)((intptr_t)base - sizeof (void *));
1006
1007 if (flags & MCF_DEBUG) {
1008 VERIFY(((intptr_t)base + rsize) <=
1009 ((intptr_t)*pbuf + cp->mc_chunksize));
1010 mcache_audit_free_verify(NULL, base, offset, rsize);
1011 }
1012
1013 /* Free it to zone */
1014 VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
1015 ((intptr_t)*pbuf + cp->mc_chunksize));
1016 zfree(cp->mc_slab_zone, *pbuf);
1017
1018 /* No more objects to free; return to mcache */
1019 if ((list = nlist) == NULL)
1020 break;
1021 }
1022}
1023
1024/*
1025 * Internal slab auditor for simple caches.
1026 */
1027static void
1028mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1029{
1030 mcache_t *cp = arg;
1031 size_t offset = 0;
1032 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
1033 void *base, **pbuf;
1034
1035 /*
1036 * The address of the object returned to the caller is an
1037 * offset from the 64-bit aligned base address only if the
1038 * cache's alignment requirement is neither 1 nor 8 bytes.
1039 */
1040 if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
1041 offset = cp->mc_align;
1042
1043 while (list != NULL) {
1044 mcache_obj_t *next = list->obj_next;
1045
1046 /* Get the 64-bit aligned base address of this object */
1047 base = (void *)((intptr_t)list - offset);
1048 VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1049
1050 /* Get the original address */
1051 pbuf = (void **)((intptr_t)base - sizeof (void *));
1052
1053 VERIFY(((intptr_t)base + rsize) <=
1054 ((intptr_t)*pbuf + cp->mc_chunksize));
1055
1056 if (!alloc)
1057 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1058 else
1059 mcache_audit_free_verify_set(NULL, base, offset, rsize);
1060
1061 list = list->obj_next = next;
1062 }
1063}
1064
1065/*
1066 * Refill the CPU's filled bucket with bkt and save the previous one.
1067 */
1068static void
1069mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1070{
1071 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1072 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1073 ASSERT(ccp->cc_bktsize > 0);
1074
1075 ccp->cc_pfilled = ccp->cc_filled;
1076 ccp->cc_pobjs = ccp->cc_objs;
1077 ccp->cc_filled = bkt;
1078 ccp->cc_objs = objs;
1079}
1080
1081/*
1082 * Allocate a bucket from the bucket layer.
1083 */
1084static mcache_bkt_t *
1085mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
1086{
1087 mcache_bkt_t *bkt;
1088
1089 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1090 /*
1091 * The bucket layer lock is held by another CPU; increase
1092 * the contention count so that we can later resize the
1093 * bucket size accordingly.
1094 */
1095 MCACHE_LOCK(&cp->mc_bkt_lock);
1096 cp->mc_bkt_contention++;
1097 }
1098
1099 if ((bkt = blp->bl_list) != NULL) {
1100 blp->bl_list = bkt->bkt_next;
1101 if (--blp->bl_total < blp->bl_min)
1102 blp->bl_min = blp->bl_total;
1103 blp->bl_alloc++;
1104 }
1105
1106 if (btp != NULL)
1107 *btp = cp->cache_bkttype;
1108
1109 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1110
1111 return (bkt);
1112}
1113
1114/*
1115 * Free a bucket to the bucket layer.
1116 */
1117static void
1118mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1119{
1120 MCACHE_LOCK(&cp->mc_bkt_lock);
1121
1122 bkt->bkt_next = blp->bl_list;
1123 blp->bl_list = bkt;
1124 blp->bl_total++;
1125
1126 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1127}
1128
1129/*
1130 * Enable the bucket layer of a cache.
1131 */
1132static void
1133mcache_cache_bkt_enable(mcache_t *cp)
1134{
1135 mcache_cpu_t *ccp;
1136 int cpu;
1137
1138 if (cp->mc_flags & MCF_NOCPUCACHE)
1139 return;
1140
1141 for (cpu = 0; cpu < ncpu; cpu++) {
1142 ccp = &cp->mc_cpu[cpu];
1143 MCACHE_LOCK(&ccp->cc_lock);
1144 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1145 MCACHE_UNLOCK(&ccp->cc_lock);
1146 }
1147}
1148
1149/*
1150 * Purge all buckets from a cache and disable its bucket layer.
1151 */
1152static void
1153mcache_bkt_purge(mcache_t *cp)
1154{
1155 mcache_cpu_t *ccp;
1156 mcache_bkt_t *bp, *pbp;
1157 mcache_bkttype_t *btp;
1158 int cpu, objs, pobjs;
1159
1160 for (cpu = 0; cpu < ncpu; cpu++) {
1161 ccp = &cp->mc_cpu[cpu];
1162
1163 MCACHE_LOCK(&ccp->cc_lock);
1164
1165 btp = cp->cache_bkttype;
1166 bp = ccp->cc_filled;
1167 pbp = ccp->cc_pfilled;
1168 objs = ccp->cc_objs;
1169 pobjs = ccp->cc_pobjs;
1170 ccp->cc_filled = NULL;
1171 ccp->cc_pfilled = NULL;
1172 ccp->cc_objs = -1;
1173 ccp->cc_pobjs = -1;
1174 ccp->cc_bktsize = 0;
1175
1176 MCACHE_UNLOCK(&ccp->cc_lock);
1177
1178 if (bp != NULL)
1179 mcache_bkt_destroy(cp, btp, bp, objs);
1180 if (pbp != NULL)
1181 mcache_bkt_destroy(cp, btp, pbp, pobjs);
1182 }
1183
1184 /*
1185 * Updating the working set back to back essentially sets
1186 * the working set size to zero, so everything is reapable.
1187 */
1188 mcache_bkt_ws_update(cp);
1189 mcache_bkt_ws_update(cp);
1190
1191 mcache_bkt_ws_reap(cp);
1192}
1193
1194/*
1195 * Free one or more objects in the bucket to the slab layer,
1196 * and also free the bucket itself.
1197 */
1198static void
1199mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
1200 int nobjs)
1201{
1202 if (nobjs > 0) {
1203 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1204
1205 if (cp->mc_flags & MCF_DEBUG) {
1206 mcache_obj_t *o = top;
1207 int cnt = 0;
1208
1209 /*
1210 * Verify that the chain of objects in the bucket is
1211 * valid. Any mismatch here means a mistake when the
1212 * object(s) were freed to the CPU layer, so we panic.
1213 */
1214 while (o != NULL) {
1215 o = o->obj_next;
1216 ++cnt;
1217 }
1218 if (cnt != nobjs) {
1219 panic("mcache_bkt_destroy: %s cp %p corrupted "
1220 "list in bkt %p (nobjs %d actual %d)\n",
1221 cp->mc_name, (void *)cp, (void *)bkt,
1222 nobjs, cnt);
1223 }
1224 }
1225
1226 /* Advise the slab layer to purge the object(s) */
1227 (*cp->mc_slab_free)(cp->mc_private, top,
1228 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1229 }
1230 mcache_free(btp->bt_cache, bkt);
1231}
1232
1233/*
1234 * Update the bucket layer working set statistics.
1235 */
1236static void
1237mcache_bkt_ws_update(mcache_t *cp)
1238{
1239 MCACHE_LOCK(&cp->mc_bkt_lock);
1240
1241 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1242 cp->mc_full.bl_min = cp->mc_full.bl_total;
1243 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1244 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1245
1246 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1247}
1248
1249/*
1250 * Reap all buckets that are beyond the working set.
1251 */
1252static void
1253mcache_bkt_ws_reap(mcache_t *cp)
1254{
1255 long reap;
1256 mcache_bkt_t *bkt;
1257 mcache_bkttype_t *btp;
1258
1259 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1260 while (reap-- &&
1261 (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
1262 mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
1263
1264 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1265 while (reap-- &&
1266 (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
1267 mcache_bkt_destroy(cp, btp, bkt, 0);
1268}
1269
1270static void
1271mcache_reap_timeout(thread_call_param_t dummy __unused,
1272 thread_call_param_t arg)
1273{
1274 volatile UInt32 *flag = arg;
1275
1276 ASSERT(flag == &mcache_reaping);
1277
1278 *flag = 0;
1279}
1280
1281static void
1282mcache_reap_done(void *flag)
1283{
1284 uint64_t deadline, leeway;
1285
1286 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1287 &deadline);
1288 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1289 NSEC_PER_SEC, &leeway);
1290 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1291 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1292}
1293
1294static void
1295mcache_reap_start(void *arg)
1296{
1297 UInt32 *flag = arg;
1298
1299 ASSERT(flag == &mcache_reaping);
1300
1301 mcache_applyall(mcache_cache_reap);
1302 mcache_dispatch(mcache_reap_done, flag);
1303}
1304
1305__private_extern__ void
1306mcache_reap(void)
1307{
1308 UInt32 *flag = &mcache_reaping;
1309
1310 if (mcache_llock_owner == current_thread() ||
1311 !OSCompareAndSwap(0, 1, flag))
1312 return;
1313
1314 mcache_dispatch(mcache_reap_start, flag);
1315}
1316
1317static void
1318mcache_cache_reap(mcache_t *cp)
1319{
1320 mcache_bkt_ws_reap(cp);
1321}
1322
1323/*
1324 * Performs period maintenance on a cache.
1325 */
1326static void
1327mcache_cache_update(mcache_t *cp)
1328{
1329 int need_bkt_resize = 0;
1330 int need_bkt_reenable = 0;
1331
1332 lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1333
1334 mcache_bkt_ws_update(cp);
1335
1336 /*
1337 * Cache resize and post-purge reenable are mutually exclusive.
1338 * If the cache was previously purged, there is no point of
1339 * increasing the bucket size as there was an indication of
1340 * memory pressure on the system.
1341 */
1342 lck_mtx_lock_spin(&cp->mc_sync_lock);
1343 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
1344 need_bkt_reenable = 1;
1345 lck_mtx_unlock(&cp->mc_sync_lock);
1346
1347 MCACHE_LOCK(&cp->mc_bkt_lock);
1348 /*
1349 * If the contention count is greater than the threshold, and if
1350 * we are not already at the maximum bucket size, increase it.
1351 * Otherwise, if this cache was previously purged by the user
1352 * then we simply reenable it.
1353 */
1354 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1355 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1356 mcache_bkt_contention && !need_bkt_reenable)
1357 need_bkt_resize = 1;
1358
1359 cp ->mc_bkt_contention_prev = cp->mc_bkt_contention;
1360 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1361
1362 if (need_bkt_resize)
1363 mcache_dispatch(mcache_cache_bkt_resize, cp);
1364 else if (need_bkt_reenable)
1365 mcache_dispatch(mcache_cache_enable, cp);
1366}
1367
1368/*
1369 * Recompute a cache's bucket size. This is an expensive operation
1370 * and should not be done frequently; larger buckets provide for a
1371 * higher transfer rate with the bucket while smaller buckets reduce
1372 * the memory consumption.
1373 */
1374static void
1375mcache_cache_bkt_resize(void *arg)
1376{
1377 mcache_t *cp = arg;
1378 mcache_bkttype_t *btp = cp->cache_bkttype;
1379
1380 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1381 mcache_bkt_purge(cp);
1382
1383 /*
1384 * Upgrade to the next bucket type with larger bucket size;
1385 * temporarily set the previous contention snapshot to a
1386 * negative number to prevent unnecessary resize request.
1387 */
1388 MCACHE_LOCK(&cp->mc_bkt_lock);
1389 cp->cache_bkttype = ++btp;
1390 cp ->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1391 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1392
1393 mcache_cache_enable(cp);
1394 }
1395}
1396
1397/*
1398 * Reenable a previously disabled cache due to purge.
1399 */
1400static void
1401mcache_cache_enable(void *arg)
1402{
1403 mcache_t *cp = arg;
1404
1405 lck_mtx_lock_spin(&cp->mc_sync_lock);
1406 cp->mc_purge_cnt = 0;
1407 cp->mc_enable_cnt = 0;
1408 lck_mtx_unlock(&cp->mc_sync_lock);
1409
1410 mcache_cache_bkt_enable(cp);
1411}
1412
1413static void
1414mcache_update_timeout(__unused void *arg)
1415{
1416 uint64_t deadline, leeway;
1417
1418 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1419 &deadline);
1420 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1421 NSEC_PER_SEC, &leeway);
1422 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1423 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1424}
1425
1426static void
1427mcache_update(thread_call_param_t arg __unused,
1428 thread_call_param_t dummy __unused)
1429{
1430 mcache_applyall(mcache_cache_update);
1431 mcache_update_timeout(NULL);
1432}
1433
1434static void
1435mcache_applyall(void (*func)(mcache_t *))
1436{
1437 mcache_t *cp;
1438
1439 MCACHE_LIST_LOCK();
1440 LIST_FOREACH(cp, &mcache_head, mc_list) {
1441 func(cp);
1442 }
1443 MCACHE_LIST_UNLOCK();
1444}
1445
1446static void
1447mcache_dispatch(void (*func)(void *), void *arg)
1448{
1449 ASSERT(func != NULL);
1450 timeout(func, arg, hz/1000);
1451}
1452
1453__private_extern__ void
1454mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1455 struct timeval *base_ts)
1456{
1457 struct timeval now, base = { 0, 0 };
1458 void *stack[MCACHE_STACK_DEPTH + 1];
1459 struct mca_trn *transaction;
1460
1461 transaction = &mca->mca_trns[mca->mca_next_trn];
1462
1463 mca->mca_addr = addr;
1464 mca->mca_cache = cp;
1465
1466 transaction->mca_thread = current_thread();
1467
1468 bzero(stack, sizeof (stack));
1469 transaction->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1470 bcopy(&stack[1], transaction->mca_stack,
1471 sizeof (transaction->mca_stack));
1472
1473 microuptime(&now);
1474 if (base_ts != NULL)
1475 base = *base_ts;
1476 /* tstamp is in ms relative to base_ts */
1477 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1478 if ((now.tv_sec - base.tv_sec) > 0)
1479 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1480
1481 mca->mca_next_trn =
1482 (mca->mca_next_trn + 1) % mca_trn_max;
1483}
1484
1485__private_extern__ void
1486mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1487{
1488 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1489 u_int64_t *buf = (u_int64_t *)buf_arg;
1490
1491 VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1492 VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1493
1494 while (buf < buf_end)
1495 *buf++ = pattern;
1496}
1497
1498__private_extern__ void *
1499mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1500{
1501 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1502 u_int64_t *buf;
1503
1504 VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1505 VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1506
1507 for (buf = buf_arg; buf < buf_end; buf++) {
1508 if (*buf != pattern)
1509 return (buf);
1510 }
1511 return (NULL);
1512}
1513
1514__private_extern__ void *
1515mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1516 size_t size)
1517{
1518 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1519 u_int64_t *buf;
1520
1521 VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1522 VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1523
1524 for (buf = buf_arg; buf < buf_end; buf++) {
1525 if (*buf != old) {
1526 mcache_set_pattern(old, buf_arg,
1527 (uintptr_t)buf - (uintptr_t)buf_arg);
1528 return (buf);
1529 }
1530 *buf = new;
1531 }
1532 return (NULL);
1533}
1534
1535__private_extern__ void
1536mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1537 size_t size)
1538{
1539 void *addr;
1540 u_int64_t *oaddr64;
1541 mcache_obj_t *next;
1542
1543 addr = (void *)((uintptr_t)base + offset);
1544 next = ((mcache_obj_t *)addr)->obj_next;
1545
1546 /* For the "obj_next" pointer in the buffer */
1547 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1548 *oaddr64 = MCACHE_FREE_PATTERN;
1549
1550 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1551 (caddr_t)base, size)) != NULL) {
1552 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1553 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1554 /* NOTREACHED */
1555 }
1556 ((mcache_obj_t *)addr)->obj_next = next;
1557}
1558
1559__private_extern__ void
1560mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1561 size_t size)
1562{
1563 void *addr;
1564 u_int64_t *oaddr64;
1565 mcache_obj_t *next;
1566
1567 addr = (void *)((uintptr_t)base + offset);
1568 next = ((mcache_obj_t *)addr)->obj_next;
1569
1570 /* For the "obj_next" pointer in the buffer */
1571 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1572 *oaddr64 = MCACHE_FREE_PATTERN;
1573
1574 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1575 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1576 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1577 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1578 /* NOTREACHED */
1579 }
1580 ((mcache_obj_t *)addr)->obj_next = next;
1581}
1582
/*
 * NOTE(review): presumably "panic" is a macro wrapper at this point and
 * is removed so that the calls below reach the underlying function --
 * confirm against the headers.
 */
#undef panic

/*
 * Format string for one audit transaction: a label, the thread, the
 * stack depth, and sixteen saved PCs printed eight to a line.
 */
#define DUMP_TRN_FMT() \
	"%s transaction thread %p saved PC stack (%d deep):\n" \
	"\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
	"\t%p, %p, %p, %p, %p, %p, %p, %p\n"

/*
 * Argument list matching DUMP_TRN_FMT() for transaction slot "idx" of
 * the audit structure named "mca" in the expanding scope; "label" is
 * the string printed in front.
 */
#define DUMP_TRN_FIELDS(label, idx) \
	label, \
	mca->mca_trns[idx].mca_thread, \
	mca->mca_trns[idx].mca_depth, \
	mca->mca_trns[idx].mca_stack[0], \
	mca->mca_trns[idx].mca_stack[1], \
	mca->mca_trns[idx].mca_stack[2], \
	mca->mca_trns[idx].mca_stack[3], \
	mca->mca_trns[idx].mca_stack[4], \
	mca->mca_trns[idx].mca_stack[5], \
	mca->mca_trns[idx].mca_stack[6], \
	mca->mca_trns[idx].mca_stack[7], \
	mca->mca_trns[idx].mca_stack[8], \
	mca->mca_trns[idx].mca_stack[9], \
	mca->mca_trns[idx].mca_stack[10], \
	mca->mca_trns[idx].mca_stack[11], \
	mca->mca_trns[idx].mca_stack[12], \
	mca->mca_trns[idx].mca_stack[13], \
	mca->mca_trns[idx].mca_stack[14], \
	mca->mca_trns[idx].mca_stack[15]

/*
 * Transaction slot indices derived from "mca->mca_next_trn": the slot
 * at mca_next_trn itself, and the slot one before it (with wraparound).
 */
#define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
#define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1604
1605__private_extern__ char *
1606mcache_dump_mca(mcache_audit_t *mca)
1607{
1608 if (mca_dump_buf == NULL)
1609 return (NULL);
1610
1611 snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1612 "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1613 DUMP_TRN_FMT()
1614 DUMP_TRN_FMT(),
1615
1616 mca, mca->mca_addr, mca->mca_cache,
1617 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1618 mca->mca_next_trn,
1619
1620 DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1621 DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1622
1623 return (mca_dump_buf);
1624}
1625
1626__private_extern__ void
1627mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1628 int64_t expected, int64_t got)
1629{
1630 if (mca == NULL) {
1631 panic("mcache_audit: buffer %p modified after free at "
1632 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1633 offset, got, expected);
1634 /* NOTREACHED */
1635 }
1636
1637 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1638 "(0x%llx instead of 0x%llx)\n%s\n",
1639 addr, offset, got, expected, mcache_dump_mca(mca));
1640 /* NOTREACHED */
1641}
1642
1643__private_extern__ int
1644assfail(const char *a, const char *f, int l)
1645{
1646 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1647 return (0);
1648}