1 /*
2 * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides greater flexibility by allowing the user to supply a custom
40 * slab allocator (instead of the default zone allocator). Finally, no
41 * object construction/destruction takes place at the moment, although
42 * this could be added in the future to improve efficiency.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
71 #define MCACHE_SIZE(n) \
72 __builtin_offsetof(mcache_t, mc_cpu[n])
73
74 /* Allocate extra in case we need to manually align the pointer */
75 #define MCACHE_ALLOC_SIZE \
76 (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)
77
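/*
 * MCACHE_CPU(c) resolves to the calling CPU's per-CPU cache structure,
 * i.e. &(c)->mc_cpu[cpu_number()], using the same offsetof-based sizing
 * as MCACHE_SIZE() above.
 */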
78 #define MCACHE_CPU(c) \
79 (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
80
81 /*
82 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
83 * to serialize accesses to the global list of caches in the system.
84 * They also record the thread currently running in the critical
85 * section, so that we can avoid recursive requests to reap the
86 * caches when memory runs low.
87 */
88 #define MCACHE_LIST_LOCK() { \
89 lck_mtx_lock(mcache_llock); \
90 mcache_llock_owner = current_thread(); \
91 }
92
93 #define MCACHE_LIST_UNLOCK() { \
94 mcache_llock_owner = NULL; \
95 lck_mtx_unlock(mcache_llock); \
96 }
97
98 #define MCACHE_LOCK(l) lck_mtx_lock(l)
99 #define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
100 #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
102 static unsigned int ncpu;
103 static unsigned int cache_line_size;
104 static lck_mtx_t *mcache_llock;
105 static struct thread *mcache_llock_owner;
106 static lck_attr_t *mcache_llock_attr;
107 static lck_grp_t *mcache_llock_grp;
108 static lck_grp_attr_t *mcache_llock_grp_attr;
109 static struct zone *mcache_zone;
110 static const uint32_t mcache_reap_interval = 15;
111 static const uint32_t mcache_reap_interval_leeway = 2;
112 static UInt32 mcache_reaping;
113 static int mcache_ready;
114 static int mcache_updating;
115
116 static int mcache_bkt_contention = 3;
117 #if DEBUG
118 static unsigned int mcache_flags = MCF_DEBUG;
119 #else
120 static unsigned int mcache_flags = 0;
121 #endif
122
123 int mca_trn_max = MCA_TRN_MAX;
124
125 #define DUMP_MCA_BUF_SIZE 512
126 static char *mca_dump_buf;
127
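/*
 * Bucket size classes.  Each entry gives the number of object slots per
 * bucket (bt_bktsize), the chunk-size thresholds used to pick a class at
 * cache creation time and to bound bucket resizing (bt_minbuf/bt_maxbuf),
 * and the cache from which buckets of that size are allocated (bt_cache,
 * populated in mcache_init()).
 */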
128 static mcache_bkttype_t mcache_bkttype[] = {
129 { 1, 4096, 32768, NULL },
130 { 3, 2048, 16384, NULL },
131 { 7, 1024, 12288, NULL },
132 { 15, 256, 8192, NULL },
133 { 31, 64, 4096, NULL },
134 { 47, 0, 2048, NULL },
135 { 63, 0, 1024, NULL },
136 { 95, 0, 512, NULL },
137 { 143, 0, 256, NULL },
138 { 165, 0, 0, NULL },
139 };
140
141 static mcache_t *mcache_create_common(const char *, size_t, size_t,
142 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
143 mcache_notifyfn_t, void *, u_int32_t, int, int);
144 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
145 unsigned int, int);
146 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
147 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
148 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
149 static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *);
150 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
151 static void mcache_cache_bkt_enable(mcache_t *);
152 static void mcache_bkt_purge(mcache_t *);
153 static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
154 static void mcache_bkt_ws_update(mcache_t *);
155 static void mcache_bkt_ws_zero(mcache_t *);
156 static void mcache_bkt_ws_reap(mcache_t *);
157 static void mcache_dispatch(void (*)(void *), void *);
158 static void mcache_cache_reap(mcache_t *);
159 static void mcache_cache_update(mcache_t *);
160 static void mcache_cache_bkt_resize(void *);
161 static void mcache_cache_enable(void *);
162 static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
163 static void mcache_update_timeout(void *);
164 static void mcache_applyall(void (*)(mcache_t *));
165 static void mcache_reap_start(void *);
166 static void mcache_reap_done(void *);
167 static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
168 static void mcache_notify(mcache_t *, u_int32_t);
169 static void mcache_purge(void *);
170
171 static LIST_HEAD(, mcache) mcache_head;
172 mcache_t *mcache_audit_cache;
173
174 static thread_call_t mcache_reap_tcall;
175 static thread_call_t mcache_update_tcall;
176
177 /*
178 * Initialize the framework; this is currently called as part of BSD init.
179 */
180 __private_extern__ void
181 mcache_init(void)
182 {
183 mcache_bkttype_t *btp;
184 unsigned int i;
185 char name[32];
186
187 VERIFY(mca_trn_max >= 2);
188
189 ncpu = ml_wait_max_cpus();
190 (void) mcache_cache_line_size(); /* prime it */
191
192 mcache_llock_grp_attr = lck_grp_attr_alloc_init();
193 mcache_llock_grp = lck_grp_alloc_init("mcache.list",
194 mcache_llock_grp_attr);
195 mcache_llock_attr = lck_attr_alloc_init();
196 mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
197
198 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
199 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
200 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
201 panic("mcache_init: thread_call_allocate failed");
202 /* NOTREACHED */
203 __builtin_unreachable();
204 }
205
206 mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE, ZC_DESTRUCTIBLE);
207
208 LIST_INIT(&mcache_head);
209
210 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
211 btp = &mcache_bkttype[i];
212 (void) snprintf(name, sizeof(name), "bkt_%d",
213 btp->bt_bktsize);
214 btp->bt_cache = mcache_create(name,
215 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
216 }
217
218 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
219 mcache_flags &= MCF_FLAGS_MASK;
220
221 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
222 0, 0, MCR_SLEEP);
223
224 mcache_applyall(mcache_cache_bkt_enable);
225 mcache_ready = 1;
226
227 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
228 ncpu, CPU_CACHE_LINE_SIZE);
229 }
230
231 /*
232 * Return the global mcache flags.
233 */
234 __private_extern__ unsigned int
235 mcache_getflags(void)
236 {
237 return mcache_flags;
238 }
239
240 /*
241 * Return the CPU cache line size.
242 */
243 __private_extern__ unsigned int
244 mcache_cache_line_size(void)
245 {
246 if (cache_line_size == 0) {
247 ml_cpu_info_t cpu_info;
248 ml_cpu_get_info(&cpu_info);
249 cache_line_size = (unsigned int)cpu_info.cache_line_size;
250 }
251 return cache_line_size;
252 }
253
254 /*
255 * Create a cache using the zone allocator as the backend slab allocator.
256 * The caller may specify any alignment for the object; if it specifies 0
257 * the default alignment (MCACHE_ALIGN) will be used.
258 */
259 __private_extern__ mcache_t *
260 mcache_create(const char *name, size_t bufsize, size_t align,
261 u_int32_t flags, int wait)
262 {
263 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
264 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
265 wait);
266 }
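
/*
 * Illustrative usage sketch (not part of the build): a typical client
 * creates a zone-backed cache once and then allocates and frees objects
 * from it; "struct foo" and "foo_cache" below are hypothetical names.
 *
 *	static mcache_t *foo_cache;
 *
 *	foo_cache = mcache_create("foo", sizeof (struct foo), 0, 0, MCR_SLEEP);
 *	struct foo *fp = mcache_alloc(foo_cache, MCR_SLEEP);
 *	...
 *	mcache_free(foo_cache, fp);
 */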
267
268 /*
269 * Create a cache using a custom backend slab allocator. Since the caller
270 * is responsible for allocation, no alignment guarantee will be provided
271 * by this framework.
272 */
273 __private_extern__ mcache_t *
274 mcache_create_ext(const char *name, size_t bufsize,
275 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
276 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
277 u_int32_t flags, int wait)
278 {
279 return mcache_create_common(name, bufsize, 0, allocfn,
280 freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait);
281 }
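
/*
 * Illustrative sketch (not part of the build): a client that owns its
 * backing store registers callbacks shaped like the internal slab hooks
 * (see mcache_slab_alloc/mcache_slab_free below); my_alloc, my_free and
 * my_pool are hypothetical.
 *
 *	static unsigned int my_alloc(void *, mcache_obj_t ***, unsigned int, int);
 *	static void my_free(void *, mcache_obj_t *, boolean_t);
 *
 *	cp = mcache_create_ext("mybuf", bufsize, my_alloc, my_free,
 *	    NULL, NULL, NULL, my_pool, 0, MCR_SLEEP);
 */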
282
283 /*
284 * Common cache creation routine.
285 */
286 static mcache_t *
287 mcache_create_common(const char *name, size_t bufsize, size_t align,
288 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
289 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
290 u_int32_t flags, int need_zone, int wait)
291 {
292 mcache_bkttype_t *btp;
293 mcache_t *cp = NULL;
294 size_t chunksize;
295 void *buf, **pbuf;
296 unsigned int c;
297 char lck_name[64];
298
299 /* If auditing is on and the dump buffer is NULL, allocate it now */
300 if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
301 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
302 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
303 malloc_wait | M_ZERO);
304 if (mca_dump_buf == NULL) {
305 return NULL;
306 }
307 }
308
309 buf = zalloc(mcache_zone);
310 if (buf == NULL) {
311 goto fail;
312 }
313
314 bzero(buf, MCACHE_ALLOC_SIZE);
315
316 /*
317 * In case we didn't get cache-aligned memory, round it up
318 * accordingly. This is needed in order to get the rest of the
319 * structure members aligned properly. It also means that the
320 * memory span gets shifted due to the round-up, but that is
321 * okay since we've allocated extra space for this.
322 */
323 cp = (mcache_t *)
324 P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
325 pbuf = (void **)((intptr_t)cp - sizeof(void *));
326 *pbuf = buf;
327
328 /*
329 * Alignment is guaranteed only when we use the internal
330 * slab allocator (currently the zone allocator).
331 */
332 if (!need_zone) {
333 align = 1;
334 } else {
335 /* Enforce 64-bit minimum alignment for zone-based buffers */
336 if (align == 0) {
337 align = MCACHE_ALIGN;
338 }
339 align = P2ROUNDUP(align, MCACHE_ALIGN);
340 }
341
342 if ((align & (align - 1)) != 0) {
343 panic("mcache_create: bad alignment %lu", align);
344 /* NOTREACHED */
345 __builtin_unreachable();
346 }
347
348 cp->mc_align = align;
349 cp->mc_slab_alloc = allocfn;
350 cp->mc_slab_free = freefn;
351 cp->mc_slab_audit = auditfn;
352 cp->mc_slab_log = logfn;
353 cp->mc_slab_notify = notifyfn;
354 cp->mc_private = need_zone ? cp : arg;
355 cp->mc_bufsize = bufsize;
356 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
357
358 (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
359
360 (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
361 cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
362 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
363 cp->mc_cpu_lock_grp_attr);
364 cp->mc_cpu_lock_attr = lck_attr_alloc_init();
365
366 /*
367 * Allocation chunk size is the object's size plus any extra size
368 * needed to satisfy the object's alignment. It is enforced to be
369 * at least the size of an LP64 pointer to simplify auditing and to
370 * handle multiple-element allocation requests, where the elements
371 * returned are linked together in a list.
372 */
373 chunksize = MAX(bufsize, sizeof(u_int64_t));
374 if (need_zone) {
375 VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
376 chunksize += sizeof(uint64_t) + align;
377 chunksize = P2ROUNDUP(chunksize, align);
378 cp->mc_slab_zone = zone_create(cp->mc_name, chunksize, ZC_DESTRUCTIBLE);
379 }
380 cp->mc_chunksize = chunksize;
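	/*
	 * For the zone-backed case above, with hypothetical numbers: a
	 * 40-byte object with a 16-byte alignment requirement gives
	 * MAX(40, 8) + 8 + 16 = 64, already a multiple of 16, so each
	 * zone element is 64 bytes.
	 */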
381
382 /*
383 * Initialize the bucket layer.
384 */
385 (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
386 cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
387 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
388 cp->mc_bkt_lock_grp_attr);
389 cp->mc_bkt_lock_attr = lck_attr_alloc_init();
390 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
391 cp->mc_bkt_lock_attr);
392
393 (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
394 cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
395 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
396 cp->mc_sync_lock_grp_attr);
397 cp->mc_sync_lock_attr = lck_attr_alloc_init();
398 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
399 cp->mc_sync_lock_attr);
400
401 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
402 continue;
403 }
404
405 cp->cache_bkttype = btp;
406
407 /*
408 * Initialize the CPU layer. Each per-CPU structure is aligned
409 * on the CPU cache line boundary to prevent false sharing.
410 */
411 for (c = 0; c < ncpu; c++) {
412 mcache_cpu_t *ccp = &cp->mc_cpu[c];
413
414 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
415 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
416 cp->mc_cpu_lock_attr);
417 ccp->cc_objs = -1;
418 ccp->cc_pobjs = -1;
419 }
420
421 if (mcache_ready) {
422 mcache_cache_bkt_enable(cp);
423 }
424
425 /* TODO: dynamically create sysctl for stats */
426
427 MCACHE_LIST_LOCK();
428 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
429 MCACHE_LIST_UNLOCK();
430
431 /*
432 * If cache buckets are enabled and this is the first cache
433 * created, start the periodic cache update.
434 */
435 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
436 mcache_updating = 1;
437 mcache_update_timeout(NULL);
438 }
439 if (cp->mc_flags & MCF_DEBUG) {
440 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
441 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
442 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
443 }
444 return cp;
445
446 fail:
447 if (buf != NULL) {
448 zfree(mcache_zone, buf);
449 }
450 return NULL;
451 }
452
453 /*
454 * Allocate one or more objects from a cache.
455 */
456 __private_extern__ unsigned int
457 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
458 {
459 mcache_cpu_t *ccp;
460 mcache_obj_t **top = &(*list);
461 mcache_bkt_t *bkt;
462 unsigned int need = num;
463 boolean_t nwretry = FALSE;
464
465 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
466 VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));
467
468 ASSERT(list != NULL);
469 *list = NULL;
470
471 if (num == 0) {
472 return 0;
473 }
474
475 retry_alloc:
476 /* We may not always be running on the same CPU in case of retries */
477 ccp = MCACHE_CPU(cp);
478
479 MCACHE_LOCK(&ccp->cc_lock);
480 for (;;) {
481 /*
482 * If we have an object in the current CPU's filled bucket,
483 * chain the object to any previous objects and return if
484 * we've satisfied the number of requested objects.
485 */
486 if (ccp->cc_objs > 0) {
487 mcache_obj_t *tail;
488 int objs;
489
490 /*
491 * Objects in the bucket are already linked together
492 * with the most recently freed object at the head of
493 * the list; grab as many objects as we can.
494 */
495 objs = MIN((unsigned int)ccp->cc_objs, need);
496 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
497 ccp->cc_objs -= objs;
498 ccp->cc_alloc += objs;
499
500 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
501 list = &tail->obj_next;
502 *list = NULL;
503
504 /* If we got them all, return to caller */
505 if ((need -= objs) == 0) {
506 MCACHE_UNLOCK(&ccp->cc_lock);
507
508 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
509 cp->mc_slab_log != NULL) {
510 (*cp->mc_slab_log)(num, *top, TRUE);
511 }
512
513 if (cp->mc_flags & MCF_DEBUG) {
514 goto debug_alloc;
515 }
516
517 return num;
518 }
519 }
520
521 /*
522 * The CPU's filled bucket is empty. If the previous filled
523 * bucket was full, exchange and try again.
524 */
525 if (ccp->cc_pobjs > 0) {
526 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
527 continue;
528 }
529
530 /*
531 * If the bucket layer is disabled, allocate from slab. This
532 * can happen either because MCF_NOCPUCACHE is set, or because
533 * the bucket layer is currently being resized.
534 */
535 if (ccp->cc_bktsize == 0) {
536 break;
537 }
538
539 /*
540 * Both of the CPU's buckets are empty; try to get a full
541 * bucket from the bucket layer. Upon success, refill this
542 * CPU and place any empty bucket into the empty list.
543 */
544 bkt = mcache_bkt_alloc(cp, &cp->mc_full);
545 if (bkt != NULL) {
546 if (ccp->cc_pfilled != NULL) {
547 mcache_bkt_free(cp, &cp->mc_empty,
548 ccp->cc_pfilled);
549 }
550 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
551 continue;
552 }
553
554 /*
555 * The bucket layer has no full buckets; allocate the
556 * object(s) directly from the slab layer.
557 */
558 break;
559 }
560 MCACHE_UNLOCK(&ccp->cc_lock);
561
562 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
563
564 /*
565 * If this is a blocking allocation, or if it is non-blocking and
566 * the cache's list of full buckets is non-empty, then retry the allocation.
567 */
568 if (need > 0) {
569 if (!(wait & MCR_NONBLOCKING)) {
570 atomic_add_32(&cp->mc_wretry_cnt, 1);
571 goto retry_alloc;
572 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
573 !mcache_bkt_isempty(cp)) {
574 if (!nwretry) {
575 nwretry = TRUE;
576 }
577 atomic_add_32(&cp->mc_nwretry_cnt, 1);
578 goto retry_alloc;
579 } else if (nwretry) {
580 atomic_add_32(&cp->mc_nwfail_cnt, 1);
581 }
582 }
583
584 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
585 (*cp->mc_slab_log)((num - need), *top, TRUE);
586 }
587
588 if (!(cp->mc_flags & MCF_DEBUG)) {
589 return num - need;
590 }
591
592 debug_alloc:
593 if (cp->mc_flags & MCF_DEBUG) {
594 mcache_obj_t **o = top;
595 unsigned int n;
596
597 n = 0;
598 /*
599 * Verify that the chain of objects has the same count as
600 * what we are about to report to the caller. Any mismatch
601 * here means that the object list is insanely broken and
602 * therefore we must panic.
603 */
604 while (*o != NULL) {
605 o = &(*o)->obj_next;
606 ++n;
607 }
608 if (n != (num - need)) {
609 panic("mcache_alloc_ext: %s cp %p corrupted list "
610 "(got %d actual %d)\n", cp->mc_name,
611 (void *)cp, num - need, n);
612 /* NOTREACHED */
613 __builtin_unreachable();
614 }
615 }
616
617 /* Invoke the slab layer audit callback if auditing is enabled */
618 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
619 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
620 }
621
622 return num - need;
623 }
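
/*
 * Illustrative sketch (not part of the build): batch users request several
 * objects in one call; the objects come back chained through obj_next and
 * can be returned the same way.  Under MCR_NOSLEEP fewer than the requested
 * number may be returned.
 *
 *	mcache_obj_t *list, *o;
 *	unsigned int got;
 *
 *	got = mcache_alloc_ext(cp, &list, 16, MCR_NOSLEEP);
 *	for (o = list; o != NULL; o = o->obj_next)
 *		... use each object ...
 *	if (got > 0)
 *		mcache_free_ext(cp, list);
 */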
624
625 /*
626 * Allocate a single object from a cache.
627 */
628 __private_extern__ void *
629 mcache_alloc(mcache_t *cp, int wait)
630 {
631 mcache_obj_t *buf;
632
633 (void) mcache_alloc_ext(cp, &buf, 1, wait);
634 return buf;
635 }
636
637 __private_extern__ void
638 mcache_waiter_inc(mcache_t *cp)
639 {
640 atomic_add_32(&cp->mc_waiter_cnt, 1);
641 }
642
643 __private_extern__ void
644 mcache_waiter_dec(mcache_t *cp)
645 {
646 atomic_add_32(&cp->mc_waiter_cnt, -1);
647 }
648
649 __private_extern__ boolean_t
650 mcache_bkt_isempty(mcache_t *cp)
651 {
652 /*
653 * This isn't meant to accurately tell whether there are
654 * any full buckets in the cache; it is simply a way to
655 * obtain "hints" about the state of the cache.
656 */
657 return cp->mc_full.bl_total == 0;
658 }
659
660 /*
661 * Notify the slab layer about an event.
662 */
663 static void
664 mcache_notify(mcache_t *cp, u_int32_t event)
665 {
666 if (cp->mc_slab_notify != NULL) {
667 (*cp->mc_slab_notify)(cp->mc_private, event);
668 }
669 }
670
671 /*
672 * Purge the cache and disable its buckets.
673 */
674 static void
675 mcache_purge(void *arg)
676 {
677 mcache_t *cp = arg;
678
679 mcache_bkt_purge(cp);
680 /*
681 * We cannot simply call mcache_cache_bkt_enable() from here as
682 * a bucket resize may be in flight and we would cause the CPU
683 * layers of the cache to point to different sizes. Therefore,
684 * we simply increment the enable count so that during the next
685 * periodic cache update the buckets can be reenabled.
686 */
687 lck_mtx_lock_spin(&cp->mc_sync_lock);
688 cp->mc_enable_cnt++;
689 lck_mtx_unlock(&cp->mc_sync_lock);
690 }
691
692 __private_extern__ boolean_t
693 mcache_purge_cache(mcache_t *cp, boolean_t async)
694 {
695 /*
696 * Purging a cache that has no per-CPU caches or is already
697 * in the process of being purged is rather pointless.
698 */
699 if (cp->mc_flags & MCF_NOCPUCACHE) {
700 return FALSE;
701 }
702
703 lck_mtx_lock_spin(&cp->mc_sync_lock);
704 if (cp->mc_purge_cnt > 0) {
705 lck_mtx_unlock(&cp->mc_sync_lock);
706 return FALSE;
707 }
708 cp->mc_purge_cnt++;
709 lck_mtx_unlock(&cp->mc_sync_lock);
710
711 if (async) {
712 mcache_dispatch(mcache_purge, cp);
713 } else {
714 mcache_purge(cp);
715 }
716
717 return TRUE;
718 }
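
/*
 * Illustrative sketch (not part of the build): a memory-pressure handler
 * might request an asynchronous purge, falling back to a plain working-set
 * reap if a purge is already in progress or per-CPU caching is disabled.
 *
 *	if (!mcache_purge_cache(cp, TRUE))
 *		mcache_reap_now(cp, FALSE);
 */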
719
720 /*
721 * Free a single object to a cache.
722 */
723 __private_extern__ void
724 mcache_free(mcache_t *cp, void *buf)
725 {
726 ((mcache_obj_t *)buf)->obj_next = NULL;
727 mcache_free_ext(cp, (mcache_obj_t *)buf);
728 }
729
730 /*
731 * Free one or more objects to a cache.
732 */
733 __private_extern__ void
734 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
735 {
736 mcache_cpu_t *ccp = MCACHE_CPU(cp);
737 mcache_bkttype_t *btp;
738 mcache_obj_t *nlist;
739 mcache_bkt_t *bkt;
740
741 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
742 (*cp->mc_slab_log)(0, list, FALSE);
743 }
744
745 /* Invoke the slab layer audit callback if auditing is enabled */
746 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
747 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
748 }
749
750 MCACHE_LOCK(&ccp->cc_lock);
751 for (;;) {
752 /*
753 * If there is space in the current CPU's filled bucket, put
754 * the object there and return once all objects are freed.
755 * Note the cast to unsigned integer takes care of the case
756 * where the bucket layer is disabled (when cc_objs is -1).
757 */
758 if ((unsigned int)ccp->cc_objs <
759 (unsigned int)ccp->cc_bktsize) {
760 /*
761 * Reverse the list while we place the object into the
762 * bucket; this effectively causes the most recently
763 * freed object(s) to be reused during allocation.
764 */
765 nlist = list->obj_next;
766 list->obj_next = (ccp->cc_objs == 0) ? NULL :
767 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
768 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
769 ccp->cc_free++;
770
771 if ((list = nlist) != NULL) {
772 continue;
773 }
774
775 /* We are done; return to caller */
776 MCACHE_UNLOCK(&ccp->cc_lock);
777
778 /* If there is a waiter below, notify it */
779 if (cp->mc_waiter_cnt > 0) {
780 mcache_notify(cp, MCN_RETRYALLOC);
781 }
782 return;
783 }
784
785 /*
786 * The CPU's filled bucket is full. If the previous filled
787 * bucket was empty, exchange and try again.
788 */
789 if (ccp->cc_pobjs == 0) {
790 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
791 continue;
792 }
793
794 /*
795 * If the bucket layer is disabled, free to slab. This can
796 * happen either because MCF_NOCPUCACHE is set, or because
797 * the bucket layer is currently being resized.
798 */
799 if (ccp->cc_bktsize == 0) {
800 break;
801 }
802
803 /*
804 * Both of the CPU's buckets are full; try to get an empty
805 * bucket from the bucket layer. Upon success, empty this
806 * CPU and place any full bucket into the full list.
807 */
808 bkt = mcache_bkt_alloc(cp, &cp->mc_empty);
809 if (bkt != NULL) {
810 if (ccp->cc_pfilled != NULL) {
811 mcache_bkt_free(cp, &cp->mc_full,
812 ccp->cc_pfilled);
813 }
814 mcache_cpu_refill(ccp, bkt, 0);
815 continue;
816 }
817 btp = cp->cache_bkttype;
818
819 /*
820 * We need an empty bucket to put our freed objects into
821 * but couldn't get an empty bucket from the bucket layer;
822 * attempt to allocate one. We do not want to block for
823 * allocation here, and if the bucket allocation fails
824 * we will simply fall through to the slab layer.
825 */
826 MCACHE_UNLOCK(&ccp->cc_lock);
827 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
828 MCACHE_LOCK(&ccp->cc_lock);
829
830 if (bkt != NULL) {
831 /*
832 * We have an empty bucket, but since we dropped the
833 * CPU lock above, the cache's bucket size may have
834 * changed. If so, free the bucket and try again.
835 */
836 if (ccp->cc_bktsize != btp->bt_bktsize) {
837 MCACHE_UNLOCK(&ccp->cc_lock);
838 mcache_free(btp->bt_cache, bkt);
839 MCACHE_LOCK(&ccp->cc_lock);
840 continue;
841 }
842
843 /*
844 * Store it in the bucket object since we'll
845 * need to refer to it during bucket destroy;
846 * we can't safely refer to cache_bkttype as
847 * the bucket lock may not be acquired then.
848 */
849 bkt->bkt_type = btp;
850
851 /*
852 * We have an empty bucket of the right size;
853 * add it to the bucket layer and try again.
854 */
855 mcache_bkt_free(cp, &cp->mc_empty, bkt);
856 continue;
857 }
858
859 /*
860 * The bucket layer has no empty buckets; free the
861 * object(s) directly to the slab layer.
862 */
863 break;
864 }
865 MCACHE_UNLOCK(&ccp->cc_lock);
866
867 /* If there is a waiter below, notify it */
868 if (cp->mc_waiter_cnt > 0) {
869 mcache_notify(cp, MCN_RETRYALLOC);
870 }
871
872 /* Advise the slab layer to purge the object(s) */
873 (*cp->mc_slab_free)(cp->mc_private, list,
874 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
875 }
876
877 /*
878 * Cache destruction routine.
879 */
880 __private_extern__ void
881 mcache_destroy(mcache_t *cp)
882 {
883 void **pbuf;
884
885 MCACHE_LIST_LOCK();
886 LIST_REMOVE(cp, mc_list);
887 MCACHE_LIST_UNLOCK();
888
889 mcache_bkt_purge(cp);
890
891 /*
892 * This cache is dead; there should be no further transactions.
893 * If it is still invoked, make sure that it induces a fault.
894 */
895 cp->mc_slab_alloc = NULL;
896 cp->mc_slab_free = NULL;
897 cp->mc_slab_audit = NULL;
898
899 lck_attr_free(cp->mc_bkt_lock_attr);
900 lck_grp_free(cp->mc_bkt_lock_grp);
901 lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
902
903 lck_attr_free(cp->mc_cpu_lock_attr);
904 lck_grp_free(cp->mc_cpu_lock_grp);
905 lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
906
907 lck_attr_free(cp->mc_sync_lock_attr);
908 lck_grp_free(cp->mc_sync_lock_grp);
909 lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
910
911 /*
912 * TODO: We need to destroy the zone here, but cannot do it
913 * because there is currently no way to do so. Until then,
914 * the memory allocated for the zone structure is leaked.
915 * Once zone destruction becomes possible, uncomment these lines:
916 *
917 * if (cp->mc_slab_zone != NULL) {
918 * zdestroy(cp->mc_slab_zone);
919 * cp->mc_slab_zone = NULL;
920 * }
921 */
922
923 /* Get the original address since we're about to free it */
924 pbuf = (void **)((intptr_t)cp - sizeof(void *));
925
926 zfree(mcache_zone, *pbuf);
927 }
928
929 /*
930 * Internal slab allocator used as a backend for simple caches. The current
931 * implementation uses the zone allocator for simplicity.
932 */
933 static unsigned int
934 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
935 int wait)
936 {
937 #pragma unused(wait)
938 mcache_t *cp = arg;
939 unsigned int need = num;
940 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
941 u_int32_t flags = cp->mc_flags;
942 void *buf, *base, **pbuf;
943 mcache_obj_t **list = *plist;
944
945 *list = NULL;
946
947 for (;;) {
948 buf = zalloc(cp->mc_slab_zone);
949 if (buf == NULL) {
950 break;
951 }
952
953 /* Get the aligned base address for this object */
954 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
955 cp->mc_align);
956
957 /*
958 * Wind back a pointer size from the aligned base and
959 * save the original address so we can free it later.
960 */
961 pbuf = (void **)((intptr_t)base - sizeof(void *));
962 *pbuf = buf;
963
964 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
965 ((intptr_t)buf + cp->mc_chunksize));
966
967 /*
968 * If auditing is enabled, patternize the contents of
969 * the buffer starting from the 64-bit aligned base to
970 * the end of the buffer; the length is rounded up to
971 * the nearest 64-bit multiple; this is because we use
972 * 64-bit memory accesses to set/check the pattern.
973 */
974 if (flags & MCF_DEBUG) {
975 VERIFY(((intptr_t)base + rsize) <=
976 ((intptr_t)buf + cp->mc_chunksize));
977 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
978 }
979
980 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
981 *list = (mcache_obj_t *)base;
982
983 (*list)->obj_next = NULL;
984 list = *plist = &(*list)->obj_next;
985
986 /* If we got them all, return to mcache */
987 if (--need == 0) {
988 break;
989 }
990 }
991
992 return num - need;
993 }
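
/*
 * Layout of each zone chunk handed out above (addresses grow to the right);
 * the original zone address is stashed in the pointer slot just before the
 * aligned base so that mcache_slab_free() can recover it:
 *
 *	buf ... [pad] [saved buf pointer] base [object of mc_bufsize bytes] ...
 */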
994
995 /*
996 * Internal slab deallocator used as a backend for simple caches.
997 */
998 static void
999 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
1000 {
1001 mcache_t *cp = arg;
1002 mcache_obj_t *nlist;
1003 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1004 u_int32_t flags = cp->mc_flags;
1005 void *base;
1006 void **pbuf;
1007
1008 for (;;) {
1009 nlist = list->obj_next;
1010 list->obj_next = NULL;
1011
1012 base = list;
1013 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1014
1015 /* Get the original address since we're about to free it */
1016 pbuf = (void **)((intptr_t)base - sizeof(void *));
1017
1018 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1019 ((intptr_t)*pbuf + cp->mc_chunksize));
1020
1021 if (flags & MCF_DEBUG) {
1022 VERIFY(((intptr_t)base + rsize) <=
1023 ((intptr_t)*pbuf + cp->mc_chunksize));
1024 mcache_audit_free_verify(NULL, base, 0, rsize);
1025 }
1026
1027 /* Free it to zone */
1028 zfree(cp->mc_slab_zone, *pbuf);
1029
1030 /* No more objects to free; return to mcache */
1031 if ((list = nlist) == NULL) {
1032 break;
1033 }
1034 }
1035 }
1036
1037 /*
1038 * Internal slab auditor for simple caches.
1039 */
1040 static void
1041 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1042 {
1043 mcache_t *cp = arg;
1044 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1045 void *base, **pbuf;
1046
1047 while (list != NULL) {
1048 mcache_obj_t *next = list->obj_next;
1049
1050 base = list;
1051 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1052
1053 /* Get the original address */
1054 pbuf = (void **)((intptr_t)base - sizeof(void *));
1055
1056 VERIFY(((intptr_t)base + rsize) <=
1057 ((intptr_t)*pbuf + cp->mc_chunksize));
1058
1059 if (!alloc) {
1060 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1061 } else {
1062 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1063 }
1064
1065 list = list->obj_next = next;
1066 }
1067 }
1068
1069 /*
1070 * Refill the CPU's filled bucket with bkt and save the previous one.
1071 */
1072 static void
1073 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1074 {
1075 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1076 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1077 ASSERT(ccp->cc_bktsize > 0);
1078
1079 ccp->cc_pfilled = ccp->cc_filled;
1080 ccp->cc_pobjs = ccp->cc_objs;
1081 ccp->cc_filled = bkt;
1082 ccp->cc_objs = objs;
1083 }
1084
1085 /*
1086 * Allocate a bucket from the bucket layer.
1087 */
1088 static mcache_bkt_t *
1089 mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp)
1090 {
1091 mcache_bkt_t *bkt;
1092
1093 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1094 /*
1095 * The bucket layer lock is held by another CPU; increase
1096 * the contention count so that we can later adjust the
1097 * bucket size accordingly.
1098 */
1099 MCACHE_LOCK(&cp->mc_bkt_lock);
1100 cp->mc_bkt_contention++;
1101 }
1102
1103 if ((bkt = blp->bl_list) != NULL) {
1104 blp->bl_list = bkt->bkt_next;
1105 if (--blp->bl_total < blp->bl_min) {
1106 blp->bl_min = blp->bl_total;
1107 }
1108 blp->bl_alloc++;
1109 }
1110
1111 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1112
1113 return bkt;
1114 }
1115
1116 /*
1117 * Free a bucket to the bucket layer.
1118 */
1119 static void
1120 mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1121 {
1122 MCACHE_LOCK(&cp->mc_bkt_lock);
1123
1124 bkt->bkt_next = blp->bl_list;
1125 blp->bl_list = bkt;
1126 blp->bl_total++;
1127
1128 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1129 }
1130
1131 /*
1132 * Enable the bucket layer of a cache.
1133 */
1134 static void
1135 mcache_cache_bkt_enable(mcache_t *cp)
1136 {
1137 mcache_cpu_t *ccp;
1138 unsigned int cpu;
1139
1140 if (cp->mc_flags & MCF_NOCPUCACHE) {
1141 return;
1142 }
1143
1144 for (cpu = 0; cpu < ncpu; cpu++) {
1145 ccp = &cp->mc_cpu[cpu];
1146 MCACHE_LOCK(&ccp->cc_lock);
1147 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1148 MCACHE_UNLOCK(&ccp->cc_lock);
1149 }
1150 }
1151
1152 /*
1153 * Purge all buckets from a cache and disable its bucket layer.
1154 */
1155 static void
1156 mcache_bkt_purge(mcache_t *cp)
1157 {
1158 mcache_cpu_t *ccp;
1159 mcache_bkt_t *bp, *pbp;
1160 int objs, pobjs;
1161 unsigned int cpu;
1162
1163 for (cpu = 0; cpu < ncpu; cpu++) {
1164 ccp = &cp->mc_cpu[cpu];
1165
1166 MCACHE_LOCK(&ccp->cc_lock);
1167
1168 bp = ccp->cc_filled;
1169 pbp = ccp->cc_pfilled;
1170 objs = ccp->cc_objs;
1171 pobjs = ccp->cc_pobjs;
1172 ccp->cc_filled = NULL;
1173 ccp->cc_pfilled = NULL;
1174 ccp->cc_objs = -1;
1175 ccp->cc_pobjs = -1;
1176 ccp->cc_bktsize = 0;
1177
1178 MCACHE_UNLOCK(&ccp->cc_lock);
1179
1180 if (bp != NULL) {
1181 mcache_bkt_destroy(cp, bp, objs);
1182 }
1183 if (pbp != NULL) {
1184 mcache_bkt_destroy(cp, pbp, pobjs);
1185 }
1186 }
1187
1188 mcache_bkt_ws_zero(cp);
1189 mcache_bkt_ws_reap(cp);
1190 }
1191
1192 /*
1193 * Free one or more objects in the bucket to the slab layer,
1194 * and also free the bucket itself.
1195 */
1196 static void
1197 mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
1198 {
1199 if (nobjs > 0) {
1200 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1201
1202 if (cp->mc_flags & MCF_DEBUG) {
1203 mcache_obj_t *o = top;
1204 int cnt = 0;
1205
1206 /*
1207 * Verify that the chain of objects in the bucket is
1208 * valid. Any mismatch here means a mistake when the
1209 * object(s) were freed to the CPU layer, so we panic.
1210 */
1211 while (o != NULL) {
1212 o = o->obj_next;
1213 ++cnt;
1214 }
1215 if (cnt != nobjs) {
1216 panic("mcache_bkt_destroy: %s cp %p corrupted "
1217 "list in bkt %p (nobjs %d actual %d)\n",
1218 cp->mc_name, (void *)cp, (void *)bkt,
1219 nobjs, cnt);
1220 /* NOTREACHED */
1221 __builtin_unreachable();
1222 }
1223 }
1224
1225 /* Advise the slab layer to purge the object(s) */
1226 (*cp->mc_slab_free)(cp->mc_private, top,
1227 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1228 }
1229 mcache_free(bkt->bkt_type->bt_cache, bkt);
1230 }
1231
1232 /*
1233 * Update the bucket layer working set statistics.
1234 */
1235 static void
1236 mcache_bkt_ws_update(mcache_t *cp)
1237 {
1238 MCACHE_LOCK(&cp->mc_bkt_lock);
1239
1240 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1241 cp->mc_full.bl_min = cp->mc_full.bl_total;
1242 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1243 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1244
1245 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1246 }
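
/*
 * Working-set example: if the full-bucket list never dropped below 4
 * entries during the last interval (bl_min == 4), those 4 buckets were
 * never needed to satisfy an allocation, so up to 4 of them become
 * eligible for reaping (bl_reaplimit) on the next reap pass.
 */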
1247
1248 /*
1249 * Mark everything as eligible for reaping (working set is zero).
1250 */
1251 static void
1252 mcache_bkt_ws_zero(mcache_t *cp)
1253 {
1254 MCACHE_LOCK(&cp->mc_bkt_lock);
1255
1256 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1257 cp->mc_full.bl_min = cp->mc_full.bl_total;
1258 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1259 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1260
1261 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1262 }
1263
1264 /*
1265 * Reap all buckets that are beyond the working set.
1266 */
1267 static void
1268 mcache_bkt_ws_reap(mcache_t *cp)
1269 {
1270 long reap;
1271 mcache_bkt_t *bkt;
1272
1273 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1274 while (reap-- &&
1275 (bkt = mcache_bkt_alloc(cp, &cp->mc_full)) != NULL) {
1276 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1277 }
1278
1279 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1280 while (reap-- &&
1281 (bkt = mcache_bkt_alloc(cp, &cp->mc_empty)) != NULL) {
1282 mcache_bkt_destroy(cp, bkt, 0);
1283 }
1284 }
1285
1286 static void
1287 mcache_reap_timeout(thread_call_param_t dummy __unused,
1288 thread_call_param_t arg)
1289 {
1290 volatile UInt32 *flag = arg;
1291
1292 ASSERT(flag == &mcache_reaping);
1293
1294 *flag = 0;
1295 }
1296
1297 static void
1298 mcache_reap_done(void *flag)
1299 {
1300 uint64_t deadline, leeway;
1301
1302 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1303 &deadline);
1304 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1305 NSEC_PER_SEC, &leeway);
1306 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1307 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1308 }
1309
1310 static void
1311 mcache_reap_start(void *arg)
1312 {
1313 UInt32 *flag = arg;
1314
1315 ASSERT(flag == &mcache_reaping);
1316
1317 mcache_applyall(mcache_cache_reap);
1318 mcache_dispatch(mcache_reap_done, flag);
1319 }
1320
1321 __private_extern__ void
1322 mcache_reap(void)
1323 {
1324 UInt32 *flag = &mcache_reaping;
1325
1326 if (mcache_llock_owner == current_thread() ||
1327 !OSCompareAndSwap(0, 1, flag)) {
1328 return;
1329 }
1330
1331 mcache_dispatch(mcache_reap_start, flag);
1332 }
1333
1334 __private_extern__ void
1335 mcache_reap_now(mcache_t *cp, boolean_t purge)
1336 {
1337 if (purge) {
1338 mcache_bkt_purge(cp);
1339 mcache_cache_bkt_enable(cp);
1340 } else {
1341 mcache_bkt_ws_zero(cp);
1342 mcache_bkt_ws_reap(cp);
1343 }
1344 }
1345
1346 static void
1347 mcache_cache_reap(mcache_t *cp)
1348 {
1349 mcache_bkt_ws_reap(cp);
1350 }
1351
1352 /*
1353 * Performs periodic maintenance on a cache.
1354 */
1355 static void
1356 mcache_cache_update(mcache_t *cp)
1357 {
1358 int need_bkt_resize = 0;
1359 int need_bkt_reenable = 0;
1360
1361 lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1362
1363 mcache_bkt_ws_update(cp);
1364
1365 /*
1366 * Cache resize and post-purge reenable are mutually exclusive.
1367 * If the cache was previously purged, there is no point in
1368 * increasing the bucket size, as there was an indication of
1369 * memory pressure on the system.
1370 */
1371 lck_mtx_lock_spin(&cp->mc_sync_lock);
1372 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
1373 need_bkt_reenable = 1;
1374 }
1375 lck_mtx_unlock(&cp->mc_sync_lock);
1376
1377 MCACHE_LOCK(&cp->mc_bkt_lock);
1378 /*
1379 * If the contention count is greater than the threshold, and if
1380 * we are not already at the maximum bucket size, increase it.
1381 * Otherwise, if this cache was previously purged by the user
1382 * then we simply reenable it.
1383 */
1384 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1385 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1386 mcache_bkt_contention && !need_bkt_reenable) {
1387 need_bkt_resize = 1;
1388 }
1389
1390 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1391 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1392
1393 if (need_bkt_resize) {
1394 mcache_dispatch(mcache_cache_bkt_resize, cp);
1395 } else if (need_bkt_reenable) {
1396 mcache_dispatch(mcache_cache_enable, cp);
1397 }
1398 }
1399
1400 /*
1401 * Recompute a cache's bucket size. This is an expensive operation
1402 * and should not be done frequently; larger buckets provide a
1403 * higher transfer rate with the bucket layer, while smaller buckets
1404 * reduce memory consumption.
1405 */
1406 static void
1407 mcache_cache_bkt_resize(void *arg)
1408 {
1409 mcache_t *cp = arg;
1410 mcache_bkttype_t *btp = cp->cache_bkttype;
1411
1412 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1413 mcache_bkt_purge(cp);
1414
1415 /*
1416 * Upgrade to the next bucket type with a larger bucket size;
1417 * temporarily set the previous contention snapshot to a
1418 * negative number to prevent an unnecessary resize request.
1419 */
1420 MCACHE_LOCK(&cp->mc_bkt_lock);
1421 cp->cache_bkttype = ++btp;
1422 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1423 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1424
1425 mcache_cache_enable(cp);
1426 }
1427 }
1428
1429 /*
1430 * Reenable a cache whose buckets were previously disabled by a purge.
1431 */
1432 static void
1433 mcache_cache_enable(void *arg)
1434 {
1435 mcache_t *cp = arg;
1436
1437 lck_mtx_lock_spin(&cp->mc_sync_lock);
1438 cp->mc_purge_cnt = 0;
1439 cp->mc_enable_cnt = 0;
1440 lck_mtx_unlock(&cp->mc_sync_lock);
1441
1442 mcache_cache_bkt_enable(cp);
1443 }
1444
1445 static void
1446 mcache_update_timeout(__unused void *arg)
1447 {
1448 uint64_t deadline, leeway;
1449
1450 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1451 &deadline);
1452 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1453 NSEC_PER_SEC, &leeway);
1454 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1455 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1456 }
1457
1458 static void
1459 mcache_update(thread_call_param_t arg __unused,
1460 thread_call_param_t dummy __unused)
1461 {
1462 mcache_applyall(mcache_cache_update);
1463 mcache_update_timeout(NULL);
1464 }
1465
1466 static void
1467 mcache_applyall(void (*func)(mcache_t *))
1468 {
1469 mcache_t *cp;
1470
1471 MCACHE_LIST_LOCK();
1472 LIST_FOREACH(cp, &mcache_head, mc_list) {
1473 func(cp);
1474 }
1475 MCACHE_LIST_UNLOCK();
1476 }
1477
1478 static void
1479 mcache_dispatch(void (*func)(void *), void *arg)
1480 {
1481 ASSERT(func != NULL);
1482 timeout(func, arg, hz / 1000);
1483 }
1484
1485 __private_extern__ void
1486 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1487 struct timeval *base_ts)
1488 {
1489 struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
1490 void *stack[MCACHE_STACK_DEPTH + 1];
1491 struct mca_trn *transaction;
1492
1493 transaction = &mca->mca_trns[mca->mca_next_trn];
1494
1495 mca->mca_addr = addr;
1496 mca->mca_cache = cp;
1497
1498 transaction->mca_thread = current_thread();
1499
1500 bzero(stack, sizeof(stack));
1501 transaction->mca_depth = (uint16_t)OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1502 bcopy(&stack[1], transaction->mca_stack,
1503 sizeof(transaction->mca_stack));
1504
1505 microuptime(&now);
1506 if (base_ts != NULL) {
1507 base = *base_ts;
1508 }
1509 /* tstamp is in ms relative to base_ts */
1510 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1511 if ((now.tv_sec - base.tv_sec) > 0) {
1512 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1513 }
1514
1515 mca->mca_next_trn =
1516 (mca->mca_next_trn + 1) % mca_trn_max;
1517 }
1518
1519 /*
1520 * N.B.: mcache_set_pattern(), mcache_verify_pattern() and
1521 * mcache_verify_set_pattern() are marked as noinline to prevent the
1522 * compiler from aliasing pointers when they are inlined inside the callers
1523 * (e.g. mcache_audit_free_verify_set()), which would be undefined behavior.
1524 */
1525 __private_extern__ OS_NOINLINE void
1526 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1527 {
1528 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1529 u_int64_t *buf = (u_int64_t *)buf_arg;
1530
1531 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1532 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1533
1534 while (buf < buf_end) {
1535 *buf++ = pattern;
1536 }
1537 }
1538
1539 __private_extern__ OS_NOINLINE void *
1540 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1541 {
1542 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1543 u_int64_t *buf;
1544
1545 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1546 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1547
1548 for (buf = buf_arg; buf < buf_end; buf++) {
1549 if (*buf != pattern) {
1550 return buf;
1551 }
1552 }
1553 return NULL;
1554 }
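
/*
 * Illustrative flow (not part of the build): with MCF_DEBUG set, a freed
 * buffer is filled with MCACHE_FREE_PATTERN and re-verified on the next
 * allocation, so a write after free shows up as a mismatch and panics:
 *
 *	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
 *	...buffer sits free; any store to it corrupts the pattern...
 *	if (mcache_verify_pattern(MCACHE_FREE_PATTERN, base, rsize) != NULL)
 *		-> mcache_audit_panic()
 */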
1555
1556 OS_NOINLINE static void *
1557 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1558 size_t size)
1559 {
1560 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1561 u_int64_t *buf;
1562
1563 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1564 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1565
1566 for (buf = buf_arg; buf < buf_end; buf++) {
1567 if (*buf != old) {
1568 mcache_set_pattern(old, buf_arg,
1569 (uintptr_t)buf - (uintptr_t)buf_arg);
1570 return buf;
1571 }
1572 *buf = new;
1573 }
1574 return NULL;
1575 }
1576
1577 __private_extern__ void
1578 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1579 size_t size)
1580 {
1581 void *addr;
1582 u_int64_t *oaddr64;
1583 mcache_obj_t *next;
1584
1585 addr = (void *)((uintptr_t)base + offset);
1586 next = ((mcache_obj_t *)addr)->obj_next;
1587
1588 /* For the "obj_next" pointer in the buffer */
1589 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1590 *oaddr64 = MCACHE_FREE_PATTERN;
1591
1592 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1593 (caddr_t)base, size)) != NULL) {
1594 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1595 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1596 /* NOTREACHED */
1597 }
1598 ((mcache_obj_t *)addr)->obj_next = next;
1599 }
1600
1601 __private_extern__ void
1602 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1603 size_t size)
1604 {
1605 void *addr;
1606 u_int64_t *oaddr64;
1607 mcache_obj_t *next;
1608
1609 addr = (void *)((uintptr_t)base + offset);
1610 next = ((mcache_obj_t *)addr)->obj_next;
1611
1612 /* For the "obj_next" pointer in the buffer */
1613 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1614 *oaddr64 = MCACHE_FREE_PATTERN;
1615
1616 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1617 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1618 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1619 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1620 /* NOTREACHED */
1621 }
1622 ((mcache_obj_t *)addr)->obj_next = next;
1623 }
1624
1625 #undef panic
1626
1627 #define DUMP_TRN_FMT() \
1628 "%s transaction thread %p saved PC stack (%d deep):\n" \
1629 "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1630 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1631
1632 #define DUMP_TRN_FIELDS(s, x) \
1633 s, \
1634 mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1635 mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1636 mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1637 mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1638 mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1639 mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1640 mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1641 mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1642 mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1643
1644 #define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1645 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1646
1647 __private_extern__ char *
1648 mcache_dump_mca(mcache_audit_t *mca)
1649 {
1650 if (mca_dump_buf == NULL) {
1651 return NULL;
1652 }
1653
1654 snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1655 "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1656 DUMP_TRN_FMT()
1657 DUMP_TRN_FMT(),
1658
1659 mca, mca->mca_addr, mca->mca_cache,
1660 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1661 mca->mca_next_trn,
1662
1663 DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1664 DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1665
1666 return mca_dump_buf;
1667 }
1668
1669 __private_extern__ void
1670 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1671 int64_t expected, int64_t got)
1672 {
1673 if (mca == NULL) {
1674 panic("mcache_audit: buffer %p modified after free at "
1675 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1676 offset, got, expected);
1677 /* NOTREACHED */
1678 __builtin_unreachable();
1679 }
1680
1681 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1682 "(0x%llx instead of 0x%llx)\n%s\n",
1683 addr, offset, got, expected, mcache_dump_mca(mca));
1684 /* NOTREACHED */
1685 __builtin_unreachable();
1686 }
1687
1688 __attribute__((noinline, cold, not_tail_called, noreturn))
1689 __private_extern__ int
1690 assfail(const char *a, const char *f, int l)
1691 {
1692 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1693 /* NOTREACHED */
1694 __builtin_unreachable();
1695 }