1 /*
2 * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39  * it provides greater flexibility by allowing the caller to supply its
40  * own slab allocator (instead of the default zone allocator). Finally,
41  * no object construction/destruction takes place at the moment, although
42  * this could be added in the future to improve efficiency.
43 */
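
/*
 * A minimal usage sketch (illustrative only; "struct foo" and "foo_cache"
 * are hypothetical names, not part of this file).  A client creates a
 * cache for a fixed-size object once and then allocates/frees through it:
 *
 *	static mcache_t *foo_cache;
 *
 *	foo_cache = mcache_create("foo", sizeof (struct foo), 0, 0,
 *	    MCR_SLEEP);			// align 0 selects MCACHE_ALIGN
 *
 *	struct foo *fp = mcache_alloc(foo_cache, MCR_SLEEP);
 *	...
 *	mcache_free(foo_cache, fp);
 *
 * Batched transfers are done with mcache_alloc_ext()/mcache_free_ext();
 * see the sketch following mcache_alloc_ext() below.
 */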
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
71 #define MCACHE_SIZE(n) \
72 __builtin_offsetof(mcache_t, mc_cpu[n])
73
74 /* Allocate extra in case we need to manually align the pointer */
75 #define MCACHE_ALLOC_SIZE \
76 (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)
77
78 #define MCACHE_CPU(c) \
79 (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
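
/*
 * In effect, MCACHE_SIZE(n) is the size of an mcache_t carrying n trailing
 * per-CPU structures (mc_cpu[]), and MCACHE_CPU(c) evaluates to
 * &(c)->mc_cpu[cpu_number()], i.e. the per-CPU structure of the CPU the
 * caller is currently running on.
 */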
80
81 /*
82 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
83 * to serialize accesses to the global list of caches in the system.
84 * They also record the thread currently running in the critical
85 * section, so that we can avoid recursive requests to reap the
86 * caches when memory runs low.
87 */
88 #define MCACHE_LIST_LOCK() { \
89 lck_mtx_lock(mcache_llock); \
90 mcache_llock_owner = current_thread(); \
91 }
92
93 #define MCACHE_LIST_UNLOCK() { \
94 mcache_llock_owner = NULL; \
95 lck_mtx_unlock(mcache_llock); \
96 }
97
98 #define MCACHE_LOCK(l) lck_mtx_lock(l)
99 #define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
100 #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
102 static int ncpu;
103 static unsigned int cache_line_size;
104 static lck_mtx_t *mcache_llock;
105 static struct thread *mcache_llock_owner;
106 static lck_attr_t *mcache_llock_attr;
107 static lck_grp_t *mcache_llock_grp;
108 static lck_grp_attr_t *mcache_llock_grp_attr;
109 static struct zone *mcache_zone;
110 static const uint32_t mcache_reap_interval = 15;
111 static const uint32_t mcache_reap_interval_leeway = 2;
112 static UInt32 mcache_reaping;
113 static int mcache_ready;
114 static int mcache_updating;
115
116 static int mcache_bkt_contention = 3;
117 #if DEBUG
118 static unsigned int mcache_flags = MCF_DEBUG;
119 #else
120 static unsigned int mcache_flags = 0;
121 #endif
122
123 int mca_trn_max = MCA_TRN_MAX;
124
125 #define DUMP_MCA_BUF_SIZE 512
126 static char *mca_dump_buf;
127
128 static mcache_bkttype_t mcache_bkttype[] = {
129 { 1, 4096, 32768, NULL },
130 { 3, 2048, 16384, NULL },
131 { 7, 1024, 12288, NULL },
132 { 15, 256, 8192, NULL },
133 { 31, 64, 4096, NULL },
134 { 47, 0, 2048, NULL },
135 { 63, 0, 1024, NULL },
136 { 95, 0, 512, NULL },
137 { 143, 0, 256, NULL },
138 { 165, 0, 0, NULL },
139 };
140
141 static mcache_t *mcache_create_common(const char *, size_t, size_t,
142 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
143 mcache_notifyfn_t, void *, u_int32_t, int, int);
144 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
145 unsigned int, int);
146 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
147 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
148 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
149 static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *);
150 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
151 static void mcache_cache_bkt_enable(mcache_t *);
152 static void mcache_bkt_purge(mcache_t *);
153 static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
154 static void mcache_bkt_ws_update(mcache_t *);
155 static void mcache_bkt_ws_zero(mcache_t *);
156 static void mcache_bkt_ws_reap(mcache_t *);
157 static void mcache_dispatch(void (*)(void *), void *);
158 static void mcache_cache_reap(mcache_t *);
159 static void mcache_cache_update(mcache_t *);
160 static void mcache_cache_bkt_resize(void *);
161 static void mcache_cache_enable(void *);
162 static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
163 static void mcache_update_timeout(void *);
164 static void mcache_applyall(void (*)(mcache_t *));
165 static void mcache_reap_start(void *);
166 static void mcache_reap_done(void *);
167 static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
168 static void mcache_notify(mcache_t *, u_int32_t);
169 static void mcache_purge(void *);
170
171 static LIST_HEAD(, mcache) mcache_head;
172 mcache_t *mcache_audit_cache;
173
174 static thread_call_t mcache_reap_tcall;
175 static thread_call_t mcache_update_tcall;
176
177 /*
178 * Initialize the framework; this is currently called as part of BSD init.
179 */
180 __private_extern__ void
181 mcache_init(void)
182 {
183 mcache_bkttype_t *btp;
184 unsigned int i;
185 char name[32];
186
187 VERIFY(mca_trn_max >= 2);
188
189 ncpu = ml_get_max_cpus();
190 (void) mcache_cache_line_size(); /* prime it */
191
192 mcache_llock_grp_attr = lck_grp_attr_alloc_init();
193 mcache_llock_grp = lck_grp_alloc_init("mcache.list",
194 mcache_llock_grp_attr);
195 mcache_llock_attr = lck_attr_alloc_init();
196 mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
197
198 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
199 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
200 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
201 panic("mcache_init: thread_call_allocate failed");
202 /* NOTREACHED */
203 __builtin_unreachable();
204 }
205
206 mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
207 PAGE_SIZE, "mcache");
208 if (mcache_zone == NULL) {
209 panic("mcache_init: failed to allocate mcache zone\n");
210 /* NOTREACHED */
211 __builtin_unreachable();
212 }
213 zone_change(mcache_zone, Z_CALLERACCT, FALSE);
214
215 LIST_INIT(&mcache_head);
216
217 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
218 btp = &mcache_bkttype[i];
219 (void) snprintf(name, sizeof(name), "bkt_%d",
220 btp->bt_bktsize);
221 btp->bt_cache = mcache_create(name,
222 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
223 }
224
225 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
226 mcache_flags &= MCF_FLAGS_MASK;
227
228 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
229 0, 0, MCR_SLEEP);
230
231 mcache_applyall(mcache_cache_bkt_enable);
232 mcache_ready = 1;
233
234 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
235 ncpu, CPU_CACHE_LINE_SIZE);
236 }
237
238 /*
239 * Return the global mcache flags.
240 */
241 __private_extern__ unsigned int
242 mcache_getflags(void)
243 {
244 return mcache_flags;
245 }
246
247 /*
248 * Return the CPU cache line size.
249 */
250 __private_extern__ unsigned int
251 mcache_cache_line_size(void)
252 {
253 if (cache_line_size == 0) {
254 ml_cpu_info_t cpu_info;
255 ml_cpu_get_info(&cpu_info);
256 cache_line_size = cpu_info.cache_line_size;
257 }
258 return cache_line_size;
259 }
260
261 /*
262 * Create a cache using the zone allocator as the backend slab allocator.
263 * The caller may specify any alignment for the object; if it specifies 0
264 * the default alignment (MCACHE_ALIGN) will be used.
265 */
266 __private_extern__ mcache_t *
267 mcache_create(const char *name, size_t bufsize, size_t align,
268 u_int32_t flags, int wait)
269 {
270 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
271 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
272 wait);
273 }
274
275 /*
276 * Create a cache using a custom backend slab allocator. Since the caller
277 * is responsible for allocation, no alignment guarantee will be provided
278 * by this framework.
279 */
280 __private_extern__ mcache_t *
281 mcache_create_ext(const char *name, size_t bufsize,
282 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
283 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
284 u_int32_t flags, int wait)
285 {
286 return mcache_create_common(name, bufsize, 0, allocfn,
287 freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait);
288 }
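
/*
 * A minimal sketch of a custom backend (illustrative; my_alloc, my_free,
 * my_pool_get, my_pool_put, my_pool and MY_OBJ_SIZE are hypothetical).
 * The callback shapes are assumed to match those of mcache_slab_alloc()
 * and mcache_slab_free() below, which are registered through the same
 * typedefs:
 *
 *	static unsigned int
 *	my_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
 *	{
 *		mcache_obj_t **list = *plist;
 *		unsigned int allocated = 0;
 *
 *		while (allocated < num) {
 *			void *buf = my_pool_get(arg, wait);
 *			if (buf == NULL)
 *				break;
 *			*list = (mcache_obj_t *)buf;
 *			(*list)->obj_next = NULL;
 *			list = *plist = &(*list)->obj_next;
 *			allocated++;
 *		}
 *		return (allocated);
 *	}
 *
 *	static void
 *	my_free(void *arg, mcache_obj_t *list, boolean_t purged)
 *	{
 *		mcache_obj_t *next;
 *
 *		for (; list != NULL; list = next) {
 *			next = list->obj_next;
 *			my_pool_put(arg, list);
 *		}
 *	}
 *
 *	cp = mcache_create_ext("my_objs", MY_OBJ_SIZE, my_alloc, my_free,
 *	    NULL, NULL, NULL, my_pool, 0, MCR_SLEEP);
 */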
289
290 /*
291 * Common cache creation routine.
292 */
293 static mcache_t *
294 mcache_create_common(const char *name, size_t bufsize, size_t align,
295 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
296 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
297 u_int32_t flags, int need_zone, int wait)
298 {
299 mcache_bkttype_t *btp;
300 mcache_t *cp = NULL;
301 size_t chunksize;
302 void *buf, **pbuf;
303 int c;
304 char lck_name[64];
305
306 /* If auditing is on and print buffer is NULL, allocate it now */
307 if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
308 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
309 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
310 malloc_wait | M_ZERO);
311 if (mca_dump_buf == NULL) {
312 return NULL;
313 }
314 }
315
316 buf = zalloc(mcache_zone);
317 if (buf == NULL) {
318 goto fail;
319 }
320
321 bzero(buf, MCACHE_ALLOC_SIZE);
322
323 /*
324  * In case we didn't get cache-aligned memory, round it up
325 * accordingly. This is needed in order to get the rest of
326 * structure members aligned properly. It also means that
327 * the memory span gets shifted due to the round up, but it
328 * is okay since we've allocated extra space for this.
329 */
330 cp = (mcache_t *)
331 P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
332 pbuf = (void **)((intptr_t)cp - sizeof(void *));
333 *pbuf = buf;
334
335 /*
336 * Guaranteed alignment is valid only when we use the internal
337 * slab allocator (currently set to use the zone allocator).
338 */
339 if (!need_zone) {
340 align = 1;
341 } else {
342 /* Enforce 64-bit minimum alignment for zone-based buffers */
343 if (align == 0) {
344 align = MCACHE_ALIGN;
345 }
346 align = P2ROUNDUP(align, MCACHE_ALIGN);
347 }
348
349 if ((align & (align - 1)) != 0) {
350 panic("mcache_create: bad alignment %lu", align);
351 /* NOTREACHED */
352 __builtin_unreachable();
353 }
354
355 cp->mc_align = align;
356 cp->mc_slab_alloc = allocfn;
357 cp->mc_slab_free = freefn;
358 cp->mc_slab_audit = auditfn;
359 cp->mc_slab_log = logfn;
360 cp->mc_slab_notify = notifyfn;
361 cp->mc_private = need_zone ? cp : arg;
362 cp->mc_bufsize = bufsize;
363 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
364
365 (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
366
367 (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
368 cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
369 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
370 cp->mc_cpu_lock_grp_attr);
371 cp->mc_cpu_lock_attr = lck_attr_alloc_init();
372
373 /*
374 * Allocation chunk size is the object's size plus any extra size
375 * needed to satisfy the object's alignment. It is enforced to be
376 * at least the size of an LP64 pointer to simplify auditing and to
377 * handle multiple-element allocation requests, where the elements
378 * returned are linked together in a list.
379 */
380 chunksize = MAX(bufsize, sizeof(u_int64_t));
381 if (need_zone) {
382 VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
383 chunksize += sizeof(uint64_t) + align;
384 chunksize = P2ROUNDUP(chunksize, align);
385 if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
386 PAGE_SIZE, cp->mc_name)) == NULL) {
387 goto fail;
388 }
389 zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
390 }
391 cp->mc_chunksize = chunksize;
392
393 /*
394 * Initialize the bucket layer.
395 */
396 (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
397 cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
398 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
399 cp->mc_bkt_lock_grp_attr);
400 cp->mc_bkt_lock_attr = lck_attr_alloc_init();
401 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
402 cp->mc_bkt_lock_attr);
403
404 (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
405 cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
406 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
407 cp->mc_sync_lock_grp_attr);
408 cp->mc_sync_lock_attr = lck_attr_alloc_init();
409 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
410 cp->mc_sync_lock_attr);
411
412 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
413 continue;
414 }
415
416 cp->cache_bkttype = btp;
417
418 /*
419 * Initialize the CPU layer. Each per-CPU structure is aligned
420 * on the CPU cache line boundary to prevent false sharing.
421 */
422 for (c = 0; c < ncpu; c++) {
423 mcache_cpu_t *ccp = &cp->mc_cpu[c];
424
425 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
426 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
427 cp->mc_cpu_lock_attr);
428 ccp->cc_objs = -1;
429 ccp->cc_pobjs = -1;
430 }
431
432 if (mcache_ready) {
433 mcache_cache_bkt_enable(cp);
434 }
435
436 /* TODO: dynamically create sysctl for stats */
437
438 MCACHE_LIST_LOCK();
439 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
440 MCACHE_LIST_UNLOCK();
441
442 /*
443 * If cache buckets are enabled and this is the first cache
444 * created, start the periodic cache update.
445 */
446 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
447 mcache_updating = 1;
448 mcache_update_timeout(NULL);
449 }
450 if (cp->mc_flags & MCF_DEBUG) {
451 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
452 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
453 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
454 }
455 return cp;
456
457 fail:
458 if (buf != NULL) {
459 zfree(mcache_zone, buf);
460 }
461 return NULL;
462 }
463
464 /*
465 * Allocate one or more objects from a cache.
466 */
467 __private_extern__ unsigned int
468 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
469 {
470 mcache_cpu_t *ccp;
471 mcache_obj_t **top = &(*list);
472 mcache_bkt_t *bkt;
473 unsigned int need = num;
474 boolean_t nwretry = FALSE;
475
476 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
477 VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));
478
479 ASSERT(list != NULL);
480 *list = NULL;
481
482 if (num == 0) {
483 return 0;
484 }
485
486 retry_alloc:
487 /* We may not always be running on the same CPU in case of retries */
488 ccp = MCACHE_CPU(cp);
489
490 MCACHE_LOCK(&ccp->cc_lock);
491 for (;;) {
492 /*
493 * If we have an object in the current CPU's filled bucket,
494 * chain the object to any previous objects and return if
495 * we've satisfied the number of requested objects.
496 */
497 if (ccp->cc_objs > 0) {
498 mcache_obj_t *tail;
499 int objs;
500
501 /*
502 * Objects in the bucket are already linked together
503 * with the most recently freed object at the head of
504 * the list; grab as many objects as we can.
505 */
506 objs = MIN((unsigned int)ccp->cc_objs, need);
507 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
508 ccp->cc_objs -= objs;
509 ccp->cc_alloc += objs;
510
511 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
512 list = &tail->obj_next;
513 *list = NULL;
514
515 /* If we got them all, return to caller */
516 if ((need -= objs) == 0) {
517 MCACHE_UNLOCK(&ccp->cc_lock);
518
519 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
520 cp->mc_slab_log != NULL) {
521 (*cp->mc_slab_log)(num, *top, TRUE);
522 }
523
524 if (cp->mc_flags & MCF_DEBUG) {
525 goto debug_alloc;
526 }
527
528 return num;
529 }
530 }
531
532 /*
533 * The CPU's filled bucket is empty. If the previous filled
534 * bucket was full, exchange and try again.
535 */
536 if (ccp->cc_pobjs > 0) {
537 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
538 continue;
539 }
540
541 /*
542 * If the bucket layer is disabled, allocate from slab. This
543 * can happen either because MCF_NOCPUCACHE is set, or because
544 * the bucket layer is currently being resized.
545 */
546 if (ccp->cc_bktsize == 0) {
547 break;
548 }
549
550 /*
551 * Both of the CPU's buckets are empty; try to get a full
552 * bucket from the bucket layer. Upon success, refill this
553 * CPU and place any empty bucket into the empty list.
554 */
555 bkt = mcache_bkt_alloc(cp, &cp->mc_full);
556 if (bkt != NULL) {
557 if (ccp->cc_pfilled != NULL) {
558 mcache_bkt_free(cp, &cp->mc_empty,
559 ccp->cc_pfilled);
560 }
561 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
562 continue;
563 }
564
565 /*
566 * The bucket layer has no full buckets; allocate the
567 * object(s) directly from the slab layer.
568 */
569 break;
570 }
571 MCACHE_UNLOCK(&ccp->cc_lock);
572
573 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
574
575 /*
576 * If this is a blocking allocation, or if it is non-blocking and
577 * the cache's full bucket is non-empty, then retry the allocation.
578 */
579 if (need > 0) {
580 if (!(wait & MCR_NONBLOCKING)) {
581 atomic_add_32(&cp->mc_wretry_cnt, 1);
582 goto retry_alloc;
583 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
584 !mcache_bkt_isempty(cp)) {
585 if (!nwretry) {
586 nwretry = TRUE;
587 }
588 atomic_add_32(&cp->mc_nwretry_cnt, 1);
589 goto retry_alloc;
590 } else if (nwretry) {
591 atomic_add_32(&cp->mc_nwfail_cnt, 1);
592 }
593 }
594
595 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
596 (*cp->mc_slab_log)((num - need), *top, TRUE);
597 }
598
599 if (!(cp->mc_flags & MCF_DEBUG)) {
600 return num - need;
601 }
602
603 debug_alloc:
604 if (cp->mc_flags & MCF_DEBUG) {
605 mcache_obj_t **o = top;
606 unsigned int n;
607
608 n = 0;
609 /*
610  * Verify that the chain of objects has the same count as
611 * what we are about to report to the caller. Any mismatch
612 * here means that the object list is insanely broken and
613 * therefore we must panic.
614 */
615 while (*o != NULL) {
616 o = &(*o)->obj_next;
617 ++n;
618 }
619 if (n != (num - need)) {
620 panic("mcache_alloc_ext: %s cp %p corrupted list "
621 "(got %d actual %d)\n", cp->mc_name,
622 (void *)cp, num - need, n);
623 /* NOTREACHED */
624 __builtin_unreachable();
625 }
626 }
627
628 /* Invoke the slab layer audit callback if auditing is enabled */
629 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
630 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
631 }
632
633 return num - need;
634 }
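
/*
 * Illustrative caller-side sketch (not part of this file): a batch of
 * objects is requested in one call, walked via obj_next, and later
 * returned as a chain in one call.  The function may deliver fewer
 * objects than requested, so the return value must be honored:
 *
 *	mcache_obj_t *list, *o;
 *	unsigned int got;
 *
 *	got = mcache_alloc_ext(cp, &list, 32, MCR_NOSLEEP);
 *	for (o = list; o != NULL; o = o->obj_next) {
 *		// exactly "got" objects are chained here
 *	}
 *	if (list != NULL)
 *		mcache_free_ext(cp, list);	// return the whole chain
 */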
635
636 /*
637 * Allocate a single object from a cache.
638 */
639 __private_extern__ void *
640 mcache_alloc(mcache_t *cp, int wait)
641 {
642 mcache_obj_t *buf;
643
644 (void) mcache_alloc_ext(cp, &buf, 1, wait);
645 return buf;
646 }
647
648 __private_extern__ void
649 mcache_waiter_inc(mcache_t *cp)
650 {
651 atomic_add_32(&cp->mc_waiter_cnt, 1);
652 }
653
654 __private_extern__ void
655 mcache_waiter_dec(mcache_t *cp)
656 {
657 atomic_add_32(&cp->mc_waiter_cnt, -1);
658 }
659
660 __private_extern__ boolean_t
661 mcache_bkt_isempty(mcache_t *cp)
662 {
663 /*
664 * This isn't meant to accurately tell whether there are
665 * any full buckets in the cache; it is simply a way to
666 * obtain "hints" about the state of the cache.
667 */
668 return cp->mc_full.bl_total == 0;
669 }
670
671 /*
672 * Notify the slab layer about an event.
673 */
674 static void
675 mcache_notify(mcache_t *cp, u_int32_t event)
676 {
677 if (cp->mc_slab_notify != NULL) {
678 (*cp->mc_slab_notify)(cp->mc_private, event);
679 }
680 }
681
682 /*
683 * Purge the cache and disable its buckets.
684 */
685 static void
686 mcache_purge(void *arg)
687 {
688 mcache_t *cp = arg;
689
690 mcache_bkt_purge(cp);
691 /*
692 * We cannot simply call mcache_cache_bkt_enable() from here as
693 * a bucket resize may be in flight and we would cause the CPU
694 * layers of the cache to point to different sizes. Therefore,
695 * we simply increment the enable count so that during the next
696 * periodic cache update the buckets can be reenabled.
697 */
698 lck_mtx_lock_spin(&cp->mc_sync_lock);
699 cp->mc_enable_cnt++;
700 lck_mtx_unlock(&cp->mc_sync_lock);
701 }
702
703 __private_extern__ boolean_t
704 mcache_purge_cache(mcache_t *cp, boolean_t async)
705 {
706 /*
707 * Purging a cache that has no per-CPU caches or is already
708 * in the process of being purged is rather pointless.
709 */
710 if (cp->mc_flags & MCF_NOCPUCACHE) {
711 return FALSE;
712 }
713
714 lck_mtx_lock_spin(&cp->mc_sync_lock);
715 if (cp->mc_purge_cnt > 0) {
716 lck_mtx_unlock(&cp->mc_sync_lock);
717 return FALSE;
718 }
719 cp->mc_purge_cnt++;
720 lck_mtx_unlock(&cp->mc_sync_lock);
721
722 if (async) {
723 mcache_dispatch(mcache_purge, cp);
724 } else {
725 mcache_purge(cp);
726 }
727
728 return TRUE;
729 }
730
731 /*
732 * Free a single object to a cache.
733 */
734 __private_extern__ void
735 mcache_free(mcache_t *cp, void *buf)
736 {
737 ((mcache_obj_t *)buf)->obj_next = NULL;
738 mcache_free_ext(cp, (mcache_obj_t *)buf);
739 }
740
741 /*
742 * Free one or more objects to a cache.
743 */
744 __private_extern__ void
745 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
746 {
747 mcache_cpu_t *ccp = MCACHE_CPU(cp);
748 mcache_bkttype_t *btp;
749 mcache_obj_t *nlist;
750 mcache_bkt_t *bkt;
751
752 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
753 (*cp->mc_slab_log)(0, list, FALSE);
754 }
755
756 /* Invoke the slab layer audit callback if auditing is enabled */
757 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
758 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
759 }
760
761 MCACHE_LOCK(&ccp->cc_lock);
762 for (;;) {
763 /*
764 * If there is space in the current CPU's filled bucket, put
765 * the object there and return once all objects are freed.
766 * Note the cast to unsigned integer takes care of the case
767 * where the bucket layer is disabled (when cc_objs is -1).
768 */
769 if ((unsigned int)ccp->cc_objs <
770 (unsigned int)ccp->cc_bktsize) {
771 /*
772 * Reverse the list while we place the object into the
773 * bucket; this effectively causes the most recently
774 * freed object(s) to be reused during allocation.
775 */
776 nlist = list->obj_next;
777 list->obj_next = (ccp->cc_objs == 0) ? NULL :
778 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
779 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
780 ccp->cc_free++;
781
782 if ((list = nlist) != NULL) {
783 continue;
784 }
785
786 /* We are done; return to caller */
787 MCACHE_UNLOCK(&ccp->cc_lock);
788
789 /* If there is a waiter below, notify it */
790 if (cp->mc_waiter_cnt > 0) {
791 mcache_notify(cp, MCN_RETRYALLOC);
792 }
793 return;
794 }
795
796 /*
797 * The CPU's filled bucket is full. If the previous filled
798 * bucket was empty, exchange and try again.
799 */
800 if (ccp->cc_pobjs == 0) {
801 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
802 continue;
803 }
804
805 /*
806 * If the bucket layer is disabled, free to slab. This can
807 * happen either because MCF_NOCPUCACHE is set, or because
808 * the bucket layer is currently being resized.
809 */
810 if (ccp->cc_bktsize == 0) {
811 break;
812 }
813
814 /*
815 * Both of the CPU's buckets are full; try to get an empty
816 * bucket from the bucket layer. Upon success, empty this
817 * CPU and place any full bucket into the full list.
818 */
819 bkt = mcache_bkt_alloc(cp, &cp->mc_empty);
820 if (bkt != NULL) {
821 if (ccp->cc_pfilled != NULL) {
822 mcache_bkt_free(cp, &cp->mc_full,
823 ccp->cc_pfilled);
824 }
825 mcache_cpu_refill(ccp, bkt, 0);
826 continue;
827 }
828 btp = cp->cache_bkttype;
829
830 /*
831 * We need an empty bucket to put our freed objects into
832 * but couldn't get an empty bucket from the bucket layer;
833 * attempt to allocate one. We do not want to block for
834 * allocation here, and if the bucket allocation fails
835 * we will simply fall through to the slab layer.
836 */
837 MCACHE_UNLOCK(&ccp->cc_lock);
838 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
839 MCACHE_LOCK(&ccp->cc_lock);
840
841 if (bkt != NULL) {
842 /*
843 * We have an empty bucket, but since we drop the
844 * CPU lock above, the cache's bucket size may have
845 * changed. If so, free the bucket and try again.
846 */
847 if (ccp->cc_bktsize != btp->bt_bktsize) {
848 MCACHE_UNLOCK(&ccp->cc_lock);
849 mcache_free(btp->bt_cache, bkt);
850 MCACHE_LOCK(&ccp->cc_lock);
851 continue;
852 }
853
854 /*
855 * Store it in the bucket object since we'll
856 * need to refer to it during bucket destroy;
857 * we can't safely refer to cache_bkttype as
858 * the bucket lock may not be acquired then.
859 */
860 bkt->bkt_type = btp;
861
862 /*
863 * We have an empty bucket of the right size;
864 * add it to the bucket layer and try again.
865 */
866 mcache_bkt_free(cp, &cp->mc_empty, bkt);
867 continue;
868 }
869
870 /*
871 * The bucket layer has no empty buckets; free the
872 * object(s) directly to the slab layer.
873 */
874 break;
875 }
876 MCACHE_UNLOCK(&ccp->cc_lock);
877
878 /* If there is a waiter below, notify it */
879 if (cp->mc_waiter_cnt > 0) {
880 mcache_notify(cp, MCN_RETRYALLOC);
881 }
882
883 /* Advise the slab layer to purge the object(s) */
884 (*cp->mc_slab_free)(cp->mc_private, list,
885 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
886 }
887
888 /*
889 * Cache destruction routine.
890 */
891 __private_extern__ void
892 mcache_destroy(mcache_t *cp)
893 {
894 void **pbuf;
895
896 MCACHE_LIST_LOCK();
897 LIST_REMOVE(cp, mc_list);
898 MCACHE_LIST_UNLOCK();
899
900 mcache_bkt_purge(cp);
901
902 /*
903  * This cache is dead; there should be no further transactions.
904 * If it's still invoked, make sure that it induces a fault.
905 */
906 cp->mc_slab_alloc = NULL;
907 cp->mc_slab_free = NULL;
908 cp->mc_slab_audit = NULL;
909
910 lck_attr_free(cp->mc_bkt_lock_attr);
911 lck_grp_free(cp->mc_bkt_lock_grp);
912 lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
913
914 lck_attr_free(cp->mc_cpu_lock_attr);
915 lck_grp_free(cp->mc_cpu_lock_grp);
916 lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
917
918 lck_attr_free(cp->mc_sync_lock_attr);
919 lck_grp_free(cp->mc_sync_lock_grp);
920 lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
921
922 /*
923  * TODO: We need to destroy the zone here, but there is currently
924  * no way to do so. Until then, the memory allocated for the
925  * zone structure is leaked. Once it becomes possible, uncomment
926  * these lines:
927 *
928 * if (cp->mc_slab_zone != NULL) {
929 * zdestroy(cp->mc_slab_zone);
930 * cp->mc_slab_zone = NULL;
931 * }
932 */
933
934 /* Get the original address since we're about to free it */
935 pbuf = (void **)((intptr_t)cp - sizeof(void *));
936
937 zfree(mcache_zone, *pbuf);
938 }
939
940 /*
941 * Internal slab allocator used as a backend for simple caches. The current
942 * implementation uses the zone allocator for simplicity reasons.
943 */
944 static unsigned int
945 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
946 int wait)
947 {
948 #pragma unused(wait)
949 mcache_t *cp = arg;
950 unsigned int need = num;
951 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
952 u_int32_t flags = cp->mc_flags;
953 void *buf, *base, **pbuf;
954 mcache_obj_t **list = *plist;
955
956 *list = NULL;
957
958 for (;;) {
959 buf = zalloc(cp->mc_slab_zone);
960 if (buf == NULL) {
961 break;
962 }
963
964 /* Get the aligned base address for this object */
965 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
966 cp->mc_align);
967
968 /*
969 * Wind back a pointer size from the aligned base and
970 * save the original address so we can free it later.
971 */
972 pbuf = (void **)((intptr_t)base - sizeof(void *));
973 *pbuf = buf;
974
975 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
976 ((intptr_t)buf + cp->mc_chunksize));
977
978 /*
979 * If auditing is enabled, patternize the contents of
980 * the buffer starting from the 64-bit aligned base to
981 * the end of the buffer; the length is rounded up to
982  * the nearest 64-bit multiple; this is because we use
983 * 64-bit memory access to set/check the pattern.
984 */
985 if (flags & MCF_DEBUG) {
986 VERIFY(((intptr_t)base + rsize) <=
987 ((intptr_t)buf + cp->mc_chunksize));
988 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
989 }
990
991 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
992 *list = (mcache_obj_t *)base;
993
994 (*list)->obj_next = NULL;
995 list = *plist = &(*list)->obj_next;
996
997 /* If we got them all, return to mcache */
998 if (--need == 0) {
999 break;
1000 }
1001 }
1002
1003 return num - need;
1004 }
1005
1006 /*
1007 * Internal slab deallocator used as a backend for simple caches.
1008 */
1009 static void
1010 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
1011 {
1012 mcache_t *cp = arg;
1013 mcache_obj_t *nlist;
1014 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1015 u_int32_t flags = cp->mc_flags;
1016 void *base;
1017 void **pbuf;
1018
1019 for (;;) {
1020 nlist = list->obj_next;
1021 list->obj_next = NULL;
1022
1023 base = list;
1024 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1025
1026 /* Get the original address since we're about to free it */
1027 pbuf = (void **)((intptr_t)base - sizeof(void *));
1028
1029 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1030 ((intptr_t)*pbuf + cp->mc_chunksize));
1031
1032 if (flags & MCF_DEBUG) {
1033 VERIFY(((intptr_t)base + rsize) <=
1034 ((intptr_t)*pbuf + cp->mc_chunksize));
1035 mcache_audit_free_verify(NULL, base, 0, rsize);
1036 }
1037
1038 /* Free it to zone */
1039 zfree(cp->mc_slab_zone, *pbuf);
1040
1041 /* No more objects to free; return to mcache */
1042 if ((list = nlist) == NULL) {
1043 break;
1044 }
1045 }
1046 }
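
/*
 * The allocation path above relies on a simple "stash the original pointer
 * just below the aligned base" pattern, also used by mcache_create_common()
 * and mcache_destroy().  A standalone sketch of the idea (raw_alloc and
 * raw_free are hypothetical stand-ins for the underlying allocator):
 *
 *	void *
 *	aligned_alloc_sketch(size_t size, size_t align)
 *	{
 *		// reserve room for the saved pointer plus worst-case padding
 *		void *buf = raw_alloc(size + sizeof (void *) + align);
 *		void *base;
 *		void **pbuf;
 *
 *		if (buf == NULL)
 *			return (NULL);
 *		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (void *), align);
 *		pbuf = (void **)((intptr_t)base - sizeof (void *));
 *		*pbuf = buf;		// remember where the chunk really starts
 *		return (base);
 *	}
 *
 *	void
 *	aligned_free_sketch(void *base)
 *	{
 *		void **pbuf = (void **)((intptr_t)base - sizeof (void *));
 *
 *		raw_free(*pbuf);	// free the original, unaligned chunk
 *	}
 */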
1047
1048 /*
1049 * Internal slab auditor for simple caches.
1050 */
1051 static void
1052 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1053 {
1054 mcache_t *cp = arg;
1055 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1056 void *base, **pbuf;
1057
1058 while (list != NULL) {
1059 mcache_obj_t *next = list->obj_next;
1060
1061 base = list;
1062 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1063
1064 /* Get the original address */
1065 pbuf = (void **)((intptr_t)base - sizeof(void *));
1066
1067 VERIFY(((intptr_t)base + rsize) <=
1068 ((intptr_t)*pbuf + cp->mc_chunksize));
1069
1070 if (!alloc) {
1071 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1072 } else {
1073 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1074 }
1075
1076 list = list->obj_next = next;
1077 }
1078 }
1079
1080 /*
1081 * Refill the CPU's filled bucket with bkt and save the previous one.
1082 */
1083 static void
1084 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1085 {
1086 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1087 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1088 ASSERT(ccp->cc_bktsize > 0);
1089
1090 ccp->cc_pfilled = ccp->cc_filled;
1091 ccp->cc_pobjs = ccp->cc_objs;
1092 ccp->cc_filled = bkt;
1093 ccp->cc_objs = objs;
1094 }
1095
1096 /*
1097 * Allocate a bucket from the bucket layer.
1098 */
1099 static mcache_bkt_t *
1100 mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp)
1101 {
1102 mcache_bkt_t *bkt;
1103
1104 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1105 /*
1106 * The bucket layer lock is held by another CPU; increase
1107  * the contention count so that we can later adjust the
1108  * bucket size accordingly.
1109 */
1110 MCACHE_LOCK(&cp->mc_bkt_lock);
1111 cp->mc_bkt_contention++;
1112 }
1113
1114 if ((bkt = blp->bl_list) != NULL) {
1115 blp->bl_list = bkt->bkt_next;
1116 if (--blp->bl_total < blp->bl_min) {
1117 blp->bl_min = blp->bl_total;
1118 }
1119 blp->bl_alloc++;
1120 }
1121
1122 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1123
1124 return bkt;
1125 }
1126
1127 /*
1128 * Free a bucket to the bucket layer.
1129 */
1130 static void
1131 mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1132 {
1133 MCACHE_LOCK(&cp->mc_bkt_lock);
1134
1135 bkt->bkt_next = blp->bl_list;
1136 blp->bl_list = bkt;
1137 blp->bl_total++;
1138
1139 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1140 }
1141
1142 /*
1143 * Enable the bucket layer of a cache.
1144 */
1145 static void
1146 mcache_cache_bkt_enable(mcache_t *cp)
1147 {
1148 mcache_cpu_t *ccp;
1149 int cpu;
1150
1151 if (cp->mc_flags & MCF_NOCPUCACHE) {
1152 return;
1153 }
1154
1155 for (cpu = 0; cpu < ncpu; cpu++) {
1156 ccp = &cp->mc_cpu[cpu];
1157 MCACHE_LOCK(&ccp->cc_lock);
1158 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1159 MCACHE_UNLOCK(&ccp->cc_lock);
1160 }
1161 }
1162
1163 /*
1164 * Purge all buckets from a cache and disable its bucket layer.
1165 */
1166 static void
1167 mcache_bkt_purge(mcache_t *cp)
1168 {
1169 mcache_cpu_t *ccp;
1170 mcache_bkt_t *bp, *pbp;
1171 int cpu, objs, pobjs;
1172
1173 for (cpu = 0; cpu < ncpu; cpu++) {
1174 ccp = &cp->mc_cpu[cpu];
1175
1176 MCACHE_LOCK(&ccp->cc_lock);
1177
1178 bp = ccp->cc_filled;
1179 pbp = ccp->cc_pfilled;
1180 objs = ccp->cc_objs;
1181 pobjs = ccp->cc_pobjs;
1182 ccp->cc_filled = NULL;
1183 ccp->cc_pfilled = NULL;
1184 ccp->cc_objs = -1;
1185 ccp->cc_pobjs = -1;
1186 ccp->cc_bktsize = 0;
1187
1188 MCACHE_UNLOCK(&ccp->cc_lock);
1189
1190 if (bp != NULL) {
1191 mcache_bkt_destroy(cp, bp, objs);
1192 }
1193 if (pbp != NULL) {
1194 mcache_bkt_destroy(cp, pbp, pobjs);
1195 }
1196 }
1197
1198 mcache_bkt_ws_zero(cp);
1199 mcache_bkt_ws_reap(cp);
1200 }
1201
1202 /*
1203 * Free one or more objects in the bucket to the slab layer,
1204 * and also free the bucket itself.
1205 */
1206 static void
1207 mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
1208 {
1209 if (nobjs > 0) {
1210 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1211
1212 if (cp->mc_flags & MCF_DEBUG) {
1213 mcache_obj_t *o = top;
1214 int cnt = 0;
1215
1216 /*
1217 * Verify that the chain of objects in the bucket is
1218 * valid. Any mismatch here means a mistake when the
1219 * object(s) were freed to the CPU layer, so we panic.
1220 */
1221 while (o != NULL) {
1222 o = o->obj_next;
1223 ++cnt;
1224 }
1225 if (cnt != nobjs) {
1226 panic("mcache_bkt_destroy: %s cp %p corrupted "
1227 "list in bkt %p (nobjs %d actual %d)\n",
1228 cp->mc_name, (void *)cp, (void *)bkt,
1229 nobjs, cnt);
1230 /* NOTREACHED */
1231 __builtin_unreachable();
1232 }
1233 }
1234
1235 /* Advise the slab layer to purge the object(s) */
1236 (*cp->mc_slab_free)(cp->mc_private, top,
1237 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1238 }
1239 mcache_free(bkt->bkt_type->bt_cache, bkt);
1240 }
1241
1242 /*
1243 * Update the bucket layer working set statistics.
1244 */
1245 static void
1246 mcache_bkt_ws_update(mcache_t *cp)
1247 {
1248 MCACHE_LOCK(&cp->mc_bkt_lock);
1249
1250 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1251 cp->mc_full.bl_min = cp->mc_full.bl_total;
1252 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1253 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1254
1255 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1256 }
1257
1258 /*
1259 * Mark everything as eligible for reaping (working set is zero).
1260 */
1261 static void
1262 mcache_bkt_ws_zero(mcache_t *cp)
1263 {
1264 MCACHE_LOCK(&cp->mc_bkt_lock);
1265
1266 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1267 cp->mc_full.bl_min = cp->mc_full.bl_total;
1268 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1269 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1270
1271 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1272 }
1273
1274 /*
1275 * Reap all buckets that are beyond the working set.
1276 */
1277 static void
1278 mcache_bkt_ws_reap(mcache_t *cp)
1279 {
1280 long reap;
1281 mcache_bkt_t *bkt;
1282
1283 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1284 while (reap-- &&
1285 (bkt = mcache_bkt_alloc(cp, &cp->mc_full)) != NULL) {
1286 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1287 }
1288
1289 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1290 while (reap-- &&
1291 (bkt = mcache_bkt_alloc(cp, &cp->mc_empty)) != NULL) {
1292 mcache_bkt_destroy(cp, bkt, 0);
1293 }
1294 }
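
/*
 * Worked example for mcache_bkt_ws_update()/mcache_bkt_ws_reap() above,
 * with illustrative numbers: suppose mc_full.bl_total is 10 when
 * mcache_bkt_ws_update() runs and the low-water mark over the preceding
 * interval (bl_min) was 4; bl_reaplimit then becomes 4 and bl_min is
 * reset to 10.  If allocations during the next interval only ever pull
 * bl_total (and hence bl_min) down to 7, a subsequent
 * mcache_bkt_ws_reap() destroys MIN(4, 7) = 4 full buckets, i.e. the
 * buckets that were not needed during either interval, and leaves the
 * recently used ones in place.
 */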
1295
1296 static void
1297 mcache_reap_timeout(thread_call_param_t dummy __unused,
1298 thread_call_param_t arg)
1299 {
1300 volatile UInt32 *flag = arg;
1301
1302 ASSERT(flag == &mcache_reaping);
1303
1304 *flag = 0;
1305 }
1306
1307 static void
1308 mcache_reap_done(void *flag)
1309 {
1310 uint64_t deadline, leeway;
1311
1312 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1313 &deadline);
1314 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1315 NSEC_PER_SEC, &leeway);
1316 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1317 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1318 }
1319
1320 static void
1321 mcache_reap_start(void *arg)
1322 {
1323 UInt32 *flag = arg;
1324
1325 ASSERT(flag == &mcache_reaping);
1326
1327 mcache_applyall(mcache_cache_reap);
1328 mcache_dispatch(mcache_reap_done, flag);
1329 }
1330
1331 __private_extern__ void
1332 mcache_reap(void)
1333 {
1334 UInt32 *flag = &mcache_reaping;
1335
1336 if (mcache_llock_owner == current_thread() ||
1337 !OSCompareAndSwap(0, 1, flag)) {
1338 return;
1339 }
1340
1341 mcache_dispatch(mcache_reap_start, flag);
1342 }
1343
1344 __private_extern__ void
1345 mcache_reap_now(mcache_t *cp, boolean_t purge)
1346 {
1347 if (purge) {
1348 mcache_bkt_purge(cp);
1349 mcache_cache_bkt_enable(cp);
1350 } else {
1351 mcache_bkt_ws_zero(cp);
1352 mcache_bkt_ws_reap(cp);
1353 }
1354 }
1355
1356 static void
1357 mcache_cache_reap(mcache_t *cp)
1358 {
1359 mcache_bkt_ws_reap(cp);
1360 }
1361
1362 /*
1363  * Performs periodic maintenance on a cache.
1364 */
1365 static void
1366 mcache_cache_update(mcache_t *cp)
1367 {
1368 int need_bkt_resize = 0;
1369 int need_bkt_reenable = 0;
1370
1371 lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1372
1373 mcache_bkt_ws_update(cp);
1374
1375 /*
1376 * Cache resize and post-purge reenable are mutually exclusive.
1377  * If the cache was previously purged, there is no point in
1378 * increasing the bucket size as there was an indication of
1379 * memory pressure on the system.
1380 */
1381 lck_mtx_lock_spin(&cp->mc_sync_lock);
1382 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
1383 need_bkt_reenable = 1;
1384 }
1385 lck_mtx_unlock(&cp->mc_sync_lock);
1386
1387 MCACHE_LOCK(&cp->mc_bkt_lock);
1388 /*
1389 * If the contention count is greater than the threshold, and if
1390 * we are not already at the maximum bucket size, increase it.
1391 * Otherwise, if this cache was previously purged by the user
1392 * then we simply reenable it.
1393 */
1394 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1395 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1396 mcache_bkt_contention && !need_bkt_reenable) {
1397 need_bkt_resize = 1;
1398 }
1399
1400 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1401 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1402
1403 if (need_bkt_resize) {
1404 mcache_dispatch(mcache_cache_bkt_resize, cp);
1405 } else if (need_bkt_reenable) {
1406 mcache_dispatch(mcache_cache_enable, cp);
1407 }
1408 }
1409
1410 /*
1411 * Recompute a cache's bucket size. This is an expensive operation
1412  * and should not be done frequently; larger buckets provide a higher
1413  * transfer rate between the CPU and bucket layers, while smaller
1414  * buckets reduce memory consumption.
1415 */
1416 static void
1417 mcache_cache_bkt_resize(void *arg)
1418 {
1419 mcache_t *cp = arg;
1420 mcache_bkttype_t *btp = cp->cache_bkttype;
1421
1422 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1423 mcache_bkt_purge(cp);
1424
1425 /*
1426 * Upgrade to the next bucket type with larger bucket size;
1427 * temporarily set the previous contention snapshot to a
1428  * negative number to prevent unnecessary resize requests.
1429 */
1430 MCACHE_LOCK(&cp->mc_bkt_lock);
1431 cp->cache_bkttype = ++btp;
1432 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1433 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1434
1435 mcache_cache_enable(cp);
1436 }
1437 }
1438
1439 /*
1440 * Reenable a previously disabled cache due to purge.
1441 */
1442 static void
1443 mcache_cache_enable(void *arg)
1444 {
1445 mcache_t *cp = arg;
1446
1447 lck_mtx_lock_spin(&cp->mc_sync_lock);
1448 cp->mc_purge_cnt = 0;
1449 cp->mc_enable_cnt = 0;
1450 lck_mtx_unlock(&cp->mc_sync_lock);
1451
1452 mcache_cache_bkt_enable(cp);
1453 }
1454
1455 static void
1456 mcache_update_timeout(__unused void *arg)
1457 {
1458 uint64_t deadline, leeway;
1459
1460 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1461 &deadline);
1462 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1463 NSEC_PER_SEC, &leeway);
1464 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1465 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1466 }
1467
1468 static void
1469 mcache_update(thread_call_param_t arg __unused,
1470 thread_call_param_t dummy __unused)
1471 {
1472 mcache_applyall(mcache_cache_update);
1473 mcache_update_timeout(NULL);
1474 }
1475
1476 static void
1477 mcache_applyall(void (*func)(mcache_t *))
1478 {
1479 mcache_t *cp;
1480
1481 MCACHE_LIST_LOCK();
1482 LIST_FOREACH(cp, &mcache_head, mc_list) {
1483 func(cp);
1484 }
1485 MCACHE_LIST_UNLOCK();
1486 }
1487
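/*
 * Run a function asynchronously via timeout(), hz / 1000 ticks from now.
 */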
1488 static void
1489 mcache_dispatch(void (*func)(void *), void *arg)
1490 {
1491 ASSERT(func != NULL);
1492 timeout(func, arg, hz / 1000);
1493 }
1494
1495 __private_extern__ void
1496 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1497 struct timeval *base_ts)
1498 {
1499 struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
1500 void *stack[MCACHE_STACK_DEPTH + 1];
1501 struct mca_trn *transaction;
1502
1503 transaction = &mca->mca_trns[mca->mca_next_trn];
1504
1505 mca->mca_addr = addr;
1506 mca->mca_cache = cp;
1507
1508 transaction->mca_thread = current_thread();
1509
1510 bzero(stack, sizeof(stack));
1511 transaction->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1512 bcopy(&stack[1], transaction->mca_stack,
1513 sizeof(transaction->mca_stack));
1514
1515 microuptime(&now);
1516 if (base_ts != NULL) {
1517 base = *base_ts;
1518 }
1519 /* tstamp is in ms relative to base_ts */
1520 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1521 if ((now.tv_sec - base.tv_sec) > 0) {
1522 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1523 }
1524
1525 mca->mca_next_trn =
1526 (mca->mca_next_trn + 1) % mca_trn_max;
1527 }
1528
1529 __private_extern__ void
1530 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1531 {
1532 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1533 u_int64_t *buf = (u_int64_t *)buf_arg;
1534
1535 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1536 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1537
1538 while (buf < buf_end) {
1539 *buf++ = pattern;
1540 }
1541 }
1542
1543 __private_extern__ void *
1544 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1545 {
1546 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1547 u_int64_t *buf;
1548
1549 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1550 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1551
1552 for (buf = buf_arg; buf < buf_end; buf++) {
1553 if (*buf != pattern) {
1554 return buf;
1555 }
1556 }
1557 return NULL;
1558 }
1559
1560 __private_extern__ void *
1561 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1562 size_t size)
1563 {
1564 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1565 u_int64_t *buf;
1566
1567 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1568 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1569
1570 for (buf = buf_arg; buf < buf_end; buf++) {
1571 if (*buf != old) {
1572 mcache_set_pattern(old, buf_arg,
1573 (uintptr_t)buf - (uintptr_t)buf_arg);
1574 return buf;
1575 }
1576 *buf = new;
1577 }
1578 return NULL;
1579 }
1580
1581 __private_extern__ void
1582 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1583 size_t size)
1584 {
1585 void *addr;
1586 u_int64_t *oaddr64;
1587 mcache_obj_t *next;
1588
1589 addr = (void *)((uintptr_t)base + offset);
1590 next = ((mcache_obj_t *)addr)->obj_next;
1591
1592 /* For the "obj_next" pointer in the buffer */
1593 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1594 *oaddr64 = MCACHE_FREE_PATTERN;
1595
1596 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1597 (caddr_t)base, size)) != NULL) {
1598 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1599 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1600 /* NOTREACHED */
1601 }
1602 ((mcache_obj_t *)addr)->obj_next = next;
1603 }
1604
1605 __private_extern__ void
1606 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1607 size_t size)
1608 {
1609 void *addr;
1610 u_int64_t *oaddr64;
1611 mcache_obj_t *next;
1612
1613 addr = (void *)((uintptr_t)base + offset);
1614 next = ((mcache_obj_t *)addr)->obj_next;
1615
1616 /* For the "obj_next" pointer in the buffer */
1617 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1618 *oaddr64 = MCACHE_FREE_PATTERN;
1619
1620 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1621 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1622 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1623 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1624 /* NOTREACHED */
1625 }
1626 ((mcache_obj_t *)addr)->obj_next = next;
1627 }
1628
1629 #undef panic
1630
1631 #define DUMP_TRN_FMT() \
1632 "%s transaction thread %p saved PC stack (%d deep):\n" \
1633 "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1634 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1635
1636 #define DUMP_TRN_FIELDS(s, x) \
1637 s, \
1638 mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1639 mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1640 mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1641 mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1642 mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1643 mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1644 mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1645 mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1646 mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1647
1648 #define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1649 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1650
1651 __private_extern__ char *
1652 mcache_dump_mca(mcache_audit_t *mca)
1653 {
1654 if (mca_dump_buf == NULL) {
1655 return NULL;
1656 }
1657
1658 snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1659 "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1660 DUMP_TRN_FMT()
1661 DUMP_TRN_FMT(),
1662
1663 mca, mca->mca_addr, mca->mca_cache,
1664 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1665 mca->mca_next_trn,
1666
1667 DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1668 DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1669
1670 return mca_dump_buf;
1671 }
1672
1673 __private_extern__ void
1674 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1675 int64_t expected, int64_t got)
1676 {
1677 if (mca == NULL) {
1678 panic("mcache_audit: buffer %p modified after free at "
1679 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1680 offset, got, expected);
1681 /* NOTREACHED */
1682 __builtin_unreachable();
1683 }
1684
1685 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1686 "(0x%llx instead of 0x%llx)\n%s\n",
1687 addr, offset, got, expected, mcache_dump_mca(mca));
1688 /* NOTREACHED */
1689 __builtin_unreachable();
1690 }
1691
1692 __attribute__((noinline, cold, not_tail_called, noreturn))
1693 __private_extern__ int
1694 assfail(const char *a, const char *f, int l)
1695 {
1696 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1697 /* NOTREACHED */
1698 __builtin_unreachable();
1699 }