apple/xnu: bsd/kern/mcache.c
1 /*
2 * Copyright (c) 2006-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides greater flexibility by allowing the user to define their
40 * own slab allocator (instead of the default zone allocator). Finally,
41 * no object construction/destruction takes place at the moment, although
42 * this could be added in the future to improve efficiency.
43 */
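/*
 * Illustrative usage sketch (not part of the original file): a typical
 * client creates a named cache for fixed-size objects and then allocates
 * and frees through it.  The name "foo" and struct foo are hypothetical;
 * mcache_create(), mcache_alloc() and mcache_free() are defined below.
 *
 *	static mcache_t *foo_cache;
 *
 *	foo_cache = mcache_create("foo", sizeof (struct foo), 0, 0,
 *	    MCR_SLEEP);
 *
 *	struct foo *f = mcache_alloc(foo_cache, MCR_SLEEP);
 *	... use f ...
 *	mcache_free(foo_cache, f);
 */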
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
71 #define MCACHE_SIZE(n) \
72 __builtin_offsetof(mcache_t, mc_cpu[n])
73
74 /* Allocate extra in case we need to manually align the pointer */
75 #define MCACHE_ALLOC_SIZE \
76 (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)
77
78 #define MCACHE_CPU(c) \
79 (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
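/*
 * For example (illustrative): on CPU 2, MCACHE_CPU(cp) evaluates to
 * (char *)cp + offsetof(mcache_t, mc_cpu[2]), i.e. &cp->mc_cpu[2],
 * the per-CPU structure belonging to the calling CPU.
 */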
80
81 /*
82 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
83 * to serialize accesses to the global list of caches in the system.
84 * They also record the thread currently running in the critical
85 * section, so that we can avoid recursive requests to reap the
86 * caches when memory runs low.
87 */
88 #define MCACHE_LIST_LOCK() { \
89 lck_mtx_lock(mcache_llock); \
90 mcache_llock_owner = current_thread(); \
91 }
92
93 #define MCACHE_LIST_UNLOCK() { \
94 mcache_llock_owner = NULL; \
95 lck_mtx_unlock(mcache_llock); \
96 }
97
98 #define MCACHE_LOCK(l) lck_mtx_lock(l)
99 #define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
100 #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
102 static int ncpu;
103 static unsigned int cache_line_size;
104 static lck_mtx_t *mcache_llock;
105 static struct thread *mcache_llock_owner;
106 static lck_attr_t *mcache_llock_attr;
107 static lck_grp_t *mcache_llock_grp;
108 static lck_grp_attr_t *mcache_llock_grp_attr;
109 static struct zone *mcache_zone;
110 static const uint32_t mcache_reap_interval = 15;
111 static const uint32_t mcache_reap_interval_leeway = 2;
112 static UInt32 mcache_reaping;
113 static int mcache_ready;
114 static int mcache_updating;
115
116 static int mcache_bkt_contention = 3;
117 #if DEBUG
118 static unsigned int mcache_flags = MCF_DEBUG;
119 #else
120 static unsigned int mcache_flags = 0;
121 #endif
122
123 int mca_trn_max = MCA_TRN_MAX;
124
125 #define DUMP_MCA_BUF_SIZE 512
126 static char *mca_dump_buf;
127
128 static mcache_bkttype_t mcache_bkttype[] = {
129 { 1, 4096, 32768, NULL },
130 { 3, 2048, 16384, NULL },
131 { 7, 1024, 12288, NULL },
132 { 15, 256, 8192, NULL },
133 { 31, 64, 4096, NULL },
134 { 47, 0, 2048, NULL },
135 { 63, 0, 1024, NULL },
136 { 95, 0, 512, NULL },
137 { 143, 0, 256, NULL },
138 { 165, 0, 0, NULL },
139 };
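/*
 * Illustrative selection example (see the bt_minbuf walk in
 * mcache_create_common() below): a cache with a 256-byte chunk size
 * skips the 1-, 3-, 7- and 15-element rows and ends up with 31-element
 * buckets, while a 2048-byte chunk ends up with 7-element buckets.
 */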
140
141 static mcache_t *mcache_create_common(const char *, size_t, size_t,
142 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
143 mcache_notifyfn_t, void *, u_int32_t, int, int);
144 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
145 unsigned int, int);
146 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
147 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
148 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
149 static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
150 mcache_bkttype_t **);
151 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
152 static void mcache_cache_bkt_enable(mcache_t *);
153 static void mcache_bkt_purge(mcache_t *);
154 static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
155 mcache_bkt_t *, int);
156 static void mcache_bkt_ws_update(mcache_t *);
157 static void mcache_bkt_ws_zero(mcache_t *);
158 static void mcache_bkt_ws_reap(mcache_t *);
159 static void mcache_dispatch(void (*)(void *), void *);
160 static void mcache_cache_reap(mcache_t *);
161 static void mcache_cache_update(mcache_t *);
162 static void mcache_cache_bkt_resize(void *);
163 static void mcache_cache_enable(void *);
164 static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
165 static void mcache_update_timeout(void *);
166 static void mcache_applyall(void (*)(mcache_t *));
167 static void mcache_reap_start(void *);
168 static void mcache_reap_done(void *);
169 static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
170 static void mcache_notify(mcache_t *, u_int32_t);
171 static void mcache_purge(void *);
172
173 static LIST_HEAD(, mcache) mcache_head;
174 mcache_t *mcache_audit_cache;
175
176 static thread_call_t mcache_reap_tcall;
177 static thread_call_t mcache_update_tcall;
178
179 /*
180 * Initialize the framework; this is currently called as part of BSD init.
181 */
182 __private_extern__ void
183 mcache_init(void)
184 {
185 mcache_bkttype_t *btp;
186 unsigned int i;
187 char name[32];
188
189 VERIFY(mca_trn_max >= 2);
190
191 ncpu = ml_get_max_cpus();
192 (void) mcache_cache_line_size(); /* prime it */
193
194 mcache_llock_grp_attr = lck_grp_attr_alloc_init();
195 mcache_llock_grp = lck_grp_alloc_init("mcache.list",
196 mcache_llock_grp_attr);
197 mcache_llock_attr = lck_attr_alloc_init();
198 mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
199
200 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
201 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
202 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
203 panic("mcache_init: thread_call_allocate failed");
204 }
205
206 mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
207 PAGE_SIZE, "mcache");
208 if (mcache_zone == NULL) {
209 panic("mcache_init: failed to allocate mcache zone\n");
210 }
211 zone_change(mcache_zone, Z_CALLERACCT, FALSE);
212
213 LIST_INIT(&mcache_head);
214
215 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
216 btp = &mcache_bkttype[i];
217 (void) snprintf(name, sizeof(name), "bkt_%d",
218 btp->bt_bktsize);
219 btp->bt_cache = mcache_create(name,
220 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
221 }
222
223 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
224 mcache_flags &= MCF_FLAGS_MASK;
225
226 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
227 0, 0, MCR_SLEEP);
228
229 mcache_applyall(mcache_cache_bkt_enable);
230 mcache_ready = 1;
231
232 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
233 ncpu, CPU_CACHE_LINE_SIZE);
234 }
235
236 /*
237 * Return the global mcache flags.
238 */
239 __private_extern__ unsigned int
240 mcache_getflags(void)
241 {
242 return mcache_flags;
243 }
244
245 /*
246 * Return the CPU cache line size.
247 */
248 __private_extern__ unsigned int
249 mcache_cache_line_size(void)
250 {
251 if (cache_line_size == 0) {
252 ml_cpu_info_t cpu_info;
253 ml_cpu_get_info(&cpu_info);
254 cache_line_size = cpu_info.cache_line_size;
255 }
256 return cache_line_size;
257 }
258
259 /*
260 * Create a cache using the zone allocator as the backend slab allocator.
261 * The caller may specify any alignment for the object; if it specifies 0
262 * the default alignment (MCACHE_ALIGN) will be used.
263 */
264 __private_extern__ mcache_t *
265 mcache_create(const char *name, size_t bufsize, size_t align,
266 u_int32_t flags, int wait)
267 {
268 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
269 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
270 wait);
271 }
272
273 /*
274 * Create a cache using a custom backend slab allocator. Since the caller
275 * is responsible for allocation, no alignment guarantee will be provided
276 * by this framework.
277 */
278 __private_extern__ mcache_t *
279 mcache_create_ext(const char *name, size_t bufsize,
280 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
281 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
282 u_int32_t flags, int wait)
283 {
284 return mcache_create_common(name, bufsize, 0, allocfn,
285 freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait);
286 }
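/*
 * Illustrative sketch of a custom backend (the my_* names are
 * hypothetical; the callback shapes mirror the mcache_slab_* prototypes
 * above):
 *
 *	static unsigned int
 *	my_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
 *	    int wait);
 *	static void
 *	my_free(void *arg, mcache_obj_t *list, boolean_t purged);
 *
 *	cp = mcache_create_ext("my_cache", objsize, my_alloc, my_free,
 *	    NULL, NULL, NULL, my_arg, 0, MCR_SLEEP);
 *
 * The audit, log and notify callbacks may be NULL, in which case the
 * corresponding hooks are simply skipped.
 */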
287
288 /*
289 * Common cache creation routine.
290 */
291 static mcache_t *
292 mcache_create_common(const char *name, size_t bufsize, size_t align,
293 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
294 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
295 u_int32_t flags, int need_zone, int wait)
296 {
297 mcache_bkttype_t *btp;
298 mcache_t *cp = NULL;
299 size_t chunksize;
300 void *buf, **pbuf;
301 int c;
302 char lck_name[64];
303
304 	/* If auditing is on and the print buffer is NULL, allocate it now */
305 if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
306 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
307 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
308 malloc_wait | M_ZERO);
309 if (mca_dump_buf == NULL) {
310 return NULL;
311 }
312 }
313
314 buf = zalloc(mcache_zone);
315 if (buf == NULL) {
316 goto fail;
317 }
318
319 bzero(buf, MCACHE_ALLOC_SIZE);
320
321 /*
322 	 * In case we didn't get cache-aligned memory, round it up
323 * accordingly. This is needed in order to get the rest of
324 * structure members aligned properly. It also means that
325 * the memory span gets shifted due to the round up, but it
326 * is okay since we've allocated extra space for this.
327 */
328 cp = (mcache_t *)
329 P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
330 pbuf = (void **)((intptr_t)cp - sizeof(void *));
331 *pbuf = buf;
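	/*
	 * Worked example (illustrative, assuming LP64 and a 64-byte cache
	 * line): if zalloc() returned buf == 0x...1010, then cp becomes
	 * P2ROUNDUP(0x...1018, 64) == 0x...1040 and the original pointer
	 * is stashed at pbuf == 0x...1038, where mcache_destroy() later
	 * finds it for the zfree().
	 */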
332
333 /*
334 * Guaranteed alignment is valid only when we use the internal
335 * slab allocator (currently set to use the zone allocator).
336 */
337 if (!need_zone) {
338 align = 1;
339 } else {
340 /* Enforce 64-bit minimum alignment for zone-based buffers */
341 if (align == 0) {
342 align = MCACHE_ALIGN;
343 }
344 align = P2ROUNDUP(align, MCACHE_ALIGN);
345 }
346
347 if ((align & (align - 1)) != 0) {
348 panic("mcache_create: bad alignment %lu", align);
349 }
350
351 cp->mc_align = align;
352 cp->mc_slab_alloc = allocfn;
353 cp->mc_slab_free = freefn;
354 cp->mc_slab_audit = auditfn;
355 cp->mc_slab_log = logfn;
356 cp->mc_slab_notify = notifyfn;
357 cp->mc_private = need_zone ? cp : arg;
358 cp->mc_bufsize = bufsize;
359 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
360
361 (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
362
363 (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
364 cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
365 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
366 cp->mc_cpu_lock_grp_attr);
367 cp->mc_cpu_lock_attr = lck_attr_alloc_init();
368
369 /*
370 * Allocation chunk size is the object's size plus any extra size
371 * needed to satisfy the object's alignment. It is enforced to be
372 * at least the size of an LP64 pointer to simplify auditing and to
373 * handle multiple-element allocation requests, where the elements
374 * returned are linked together in a list.
375 */
376 chunksize = MAX(bufsize, sizeof(u_int64_t));
377 if (need_zone) {
378 VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
379 chunksize += sizeof(uint64_t) + align;
380 chunksize = P2ROUNDUP(chunksize, align);
381 if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
382 PAGE_SIZE, cp->mc_name)) == NULL) {
383 goto fail;
384 }
385 zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
386 }
387 cp->mc_chunksize = chunksize;
388
389 /*
390 * Initialize the bucket layer.
391 */
392 (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
393 cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
394 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
395 cp->mc_bkt_lock_grp_attr);
396 cp->mc_bkt_lock_attr = lck_attr_alloc_init();
397 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
398 cp->mc_bkt_lock_attr);
399
400 (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
401 cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
402 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
403 cp->mc_sync_lock_grp_attr);
404 cp->mc_sync_lock_attr = lck_attr_alloc_init();
405 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
406 cp->mc_sync_lock_attr);
407
408 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
409 continue;
410 }
411
412 cp->cache_bkttype = btp;
413
414 /*
415 * Initialize the CPU layer. Each per-CPU structure is aligned
416 * on the CPU cache line boundary to prevent false sharing.
417 */
418 for (c = 0; c < ncpu; c++) {
419 mcache_cpu_t *ccp = &cp->mc_cpu[c];
420
421 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
422 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
423 cp->mc_cpu_lock_attr);
424 ccp->cc_objs = -1;
425 ccp->cc_pobjs = -1;
426 }
427
428 if (mcache_ready) {
429 mcache_cache_bkt_enable(cp);
430 }
431
432 /* TODO: dynamically create sysctl for stats */
433
434 MCACHE_LIST_LOCK();
435 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
436 MCACHE_LIST_UNLOCK();
437
438 /*
439 * If cache buckets are enabled and this is the first cache
440 * created, start the periodic cache update.
441 */
442 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
443 mcache_updating = 1;
444 mcache_update_timeout(NULL);
445 }
446 if (cp->mc_flags & MCF_DEBUG) {
447 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
448 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
449 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
450 }
451 return cp;
452
453 fail:
454 if (buf != NULL) {
455 zfree(mcache_zone, buf);
456 }
457 return NULL;
458 }
459
460 /*
461 * Allocate one or more objects from a cache.
462 */
463 __private_extern__ unsigned int
464 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
465 {
466 mcache_cpu_t *ccp;
467 mcache_obj_t **top = &(*list);
468 mcache_bkt_t *bkt;
469 unsigned int need = num;
470 boolean_t nwretry = FALSE;
471
472 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
473 VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));
474
475 ASSERT(list != NULL);
476 *list = NULL;
477
478 if (num == 0) {
479 return 0;
480 }
481
482 retry_alloc:
483 	/* We may not always be running on the same CPU in case of retries */
484 ccp = MCACHE_CPU(cp);
485
486 MCACHE_LOCK(&ccp->cc_lock);
487 for (;;) {
488 /*
489 * If we have an object in the current CPU's filled bucket,
490 * chain the object to any previous objects and return if
491 * we've satisfied the number of requested objects.
492 */
493 if (ccp->cc_objs > 0) {
494 mcache_obj_t *tail;
495 int objs;
496
497 /*
498 * Objects in the bucket are already linked together
499 * with the most recently freed object at the head of
500 * the list; grab as many objects as we can.
501 */
502 objs = MIN((unsigned int)ccp->cc_objs, need);
503 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
504 ccp->cc_objs -= objs;
505 ccp->cc_alloc += objs;
506
507 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
508 list = &tail->obj_next;
509 *list = NULL;
510
511 /* If we got them all, return to caller */
512 if ((need -= objs) == 0) {
513 MCACHE_UNLOCK(&ccp->cc_lock);
514
515 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
516 cp->mc_slab_log != NULL) {
517 (*cp->mc_slab_log)(num, *top, TRUE);
518 }
519
520 if (cp->mc_flags & MCF_DEBUG) {
521 goto debug_alloc;
522 }
523
524 return num;
525 }
526 }
527
528 /*
529 * The CPU's filled bucket is empty. If the previous filled
530 * bucket was full, exchange and try again.
531 */
532 if (ccp->cc_pobjs > 0) {
533 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
534 continue;
535 }
536
537 /*
538 * If the bucket layer is disabled, allocate from slab. This
539 * can happen either because MCF_NOCPUCACHE is set, or because
540 * the bucket layer is currently being resized.
541 */
542 if (ccp->cc_bktsize == 0) {
543 break;
544 }
545
546 /*
547 * Both of the CPU's buckets are empty; try to get a full
548 * bucket from the bucket layer. Upon success, refill this
549 * CPU and place any empty bucket into the empty list.
550 */
551 bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
552 if (bkt != NULL) {
553 if (ccp->cc_pfilled != NULL) {
554 mcache_bkt_free(cp, &cp->mc_empty,
555 ccp->cc_pfilled);
556 }
557 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
558 continue;
559 }
560
561 /*
562 * The bucket layer has no full buckets; allocate the
563 * object(s) directly from the slab layer.
564 */
565 break;
566 }
567 MCACHE_UNLOCK(&ccp->cc_lock);
568
569 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
570
571 /*
572 * If this is a blocking allocation, or if it is non-blocking and
573 	 * the cache's list of full buckets is non-empty, then retry the allocation.
574 */
575 if (need > 0) {
576 if (!(wait & MCR_NONBLOCKING)) {
577 atomic_add_32(&cp->mc_wretry_cnt, 1);
578 goto retry_alloc;
579 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
580 !mcache_bkt_isempty(cp)) {
581 if (!nwretry) {
582 nwretry = TRUE;
583 }
584 atomic_add_32(&cp->mc_nwretry_cnt, 1);
585 goto retry_alloc;
586 } else if (nwretry) {
587 atomic_add_32(&cp->mc_nwfail_cnt, 1);
588 }
589 }
590
591 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
592 (*cp->mc_slab_log)((num - need), *top, TRUE);
593 }
594
595 if (!(cp->mc_flags & MCF_DEBUG)) {
596 return num - need;
597 }
598
599 debug_alloc:
600 if (cp->mc_flags & MCF_DEBUG) {
601 mcache_obj_t **o = top;
602 unsigned int n;
603
604 n = 0;
605 /*
606 		 * Verify that the chain of objects has the same count as
607 * what we are about to report to the caller. Any mismatch
608 * here means that the object list is insanely broken and
609 * therefore we must panic.
610 */
611 while (*o != NULL) {
612 o = &(*o)->obj_next;
613 ++n;
614 }
615 if (n != (num - need)) {
616 panic("mcache_alloc_ext: %s cp %p corrupted list "
617 "(got %d actual %d)\n", cp->mc_name,
618 (void *)cp, num - need, n);
619 }
620 }
621
622 /* Invoke the slab layer audit callback if auditing is enabled */
623 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
624 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
625 }
626
627 return num - need;
628 }
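/*
 * Illustrative sketch (consume_obj() is hypothetical): callers of
 * mcache_alloc_ext() must cope with a partial return, and should detach
 * each object from the obj_next chain before using it, since the chain
 * pointer lives inside the buffer itself.
 *
 *	mcache_obj_t *list, *next;
 *	unsigned int got;
 *
 *	got = mcache_alloc_ext(cp, &list, 32, MCR_NOSLEEP);
 *	while (list != NULL) {
 *		next = list->obj_next;
 *		consume_obj(list);
 *		list = next;
 *	}
 */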
629
630 /*
631 * Allocate a single object from a cache.
632 */
633 __private_extern__ void *
634 mcache_alloc(mcache_t *cp, int wait)
635 {
636 mcache_obj_t *buf;
637
638 (void) mcache_alloc_ext(cp, &buf, 1, wait);
639 return buf;
640 }
641
642 __private_extern__ void
643 mcache_waiter_inc(mcache_t *cp)
644 {
645 atomic_add_32(&cp->mc_waiter_cnt, 1);
646 }
647
648 __private_extern__ void
649 mcache_waiter_dec(mcache_t *cp)
650 {
651 atomic_add_32(&cp->mc_waiter_cnt, -1);
652 }
653
654 __private_extern__ boolean_t
655 mcache_bkt_isempty(mcache_t *cp)
656 {
657 /*
658 * This isn't meant to accurately tell whether there are
659 * any full buckets in the cache; it is simply a way to
660 * obtain "hints" about the state of the cache.
661 */
662 return cp->mc_full.bl_total == 0;
663 }
664
665 /*
666 * Notify the slab layer about an event.
667 */
668 static void
669 mcache_notify(mcache_t *cp, u_int32_t event)
670 {
671 if (cp->mc_slab_notify != NULL) {
672 (*cp->mc_slab_notify)(cp->mc_private, event);
673 }
674 }
675
676 /*
677 * Purge the cache and disable its buckets.
678 */
679 static void
680 mcache_purge(void *arg)
681 {
682 mcache_t *cp = arg;
683
684 mcache_bkt_purge(cp);
685 /*
686 * We cannot simply call mcache_cache_bkt_enable() from here as
687 * a bucket resize may be in flight and we would cause the CPU
688 * layers of the cache to point to different sizes. Therefore,
689 * we simply increment the enable count so that during the next
690 * periodic cache update the buckets can be reenabled.
691 */
692 lck_mtx_lock_spin(&cp->mc_sync_lock);
693 cp->mc_enable_cnt++;
694 lck_mtx_unlock(&cp->mc_sync_lock);
695 }
696
697 __private_extern__ boolean_t
698 mcache_purge_cache(mcache_t *cp, boolean_t async)
699 {
700 /*
701 * Purging a cache that has no per-CPU caches or is already
702 * in the process of being purged is rather pointless.
703 */
704 if (cp->mc_flags & MCF_NOCPUCACHE) {
705 return FALSE;
706 }
707
708 lck_mtx_lock_spin(&cp->mc_sync_lock);
709 if (cp->mc_purge_cnt > 0) {
710 lck_mtx_unlock(&cp->mc_sync_lock);
711 return FALSE;
712 }
713 cp->mc_purge_cnt++;
714 lck_mtx_unlock(&cp->mc_sync_lock);
715
716 if (async) {
717 mcache_dispatch(mcache_purge, cp);
718 } else {
719 mcache_purge(cp);
720 }
721
722 return TRUE;
723 }
724
725 /*
726 * Free a single object to a cache.
727 */
728 __private_extern__ void
729 mcache_free(mcache_t *cp, void *buf)
730 {
731 ((mcache_obj_t *)buf)->obj_next = NULL;
732 mcache_free_ext(cp, (mcache_obj_t *)buf);
733 }
734
735 /*
736 * Free one or more objects to a cache.
737 */
738 __private_extern__ void
739 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
740 {
741 mcache_cpu_t *ccp = MCACHE_CPU(cp);
742 mcache_bkttype_t *btp;
743 mcache_obj_t *nlist;
744 mcache_bkt_t *bkt;
745
746 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
747 (*cp->mc_slab_log)(0, list, FALSE);
748 }
749
750 /* Invoke the slab layer audit callback if auditing is enabled */
751 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
752 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
753 }
754
755 MCACHE_LOCK(&ccp->cc_lock);
756 for (;;) {
757 /*
758 * If there is space in the current CPU's filled bucket, put
759 * the object there and return once all objects are freed.
760 * Note the cast to unsigned integer takes care of the case
761 * where the bucket layer is disabled (when cc_objs is -1).
762 */
763 if ((unsigned int)ccp->cc_objs <
764 (unsigned int)ccp->cc_bktsize) {
765 /*
766 * Reverse the list while we place the object into the
767 * bucket; this effectively causes the most recently
768 * freed object(s) to be reused during allocation.
769 */
770 nlist = list->obj_next;
771 list->obj_next = (ccp->cc_objs == 0) ? NULL :
772 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
773 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
774 ccp->cc_free++;
775
776 if ((list = nlist) != NULL) {
777 continue;
778 }
779
780 /* We are done; return to caller */
781 MCACHE_UNLOCK(&ccp->cc_lock);
782
783 /* If there is a waiter below, notify it */
784 if (cp->mc_waiter_cnt > 0) {
785 mcache_notify(cp, MCN_RETRYALLOC);
786 }
787 return;
788 }
789
790 /*
791 * The CPU's filled bucket is full. If the previous filled
792 * bucket was empty, exchange and try again.
793 */
794 if (ccp->cc_pobjs == 0) {
795 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
796 continue;
797 }
798
799 /*
800 * If the bucket layer is disabled, free to slab. This can
801 * happen either because MCF_NOCPUCACHE is set, or because
802 * the bucket layer is currently being resized.
803 */
804 if (ccp->cc_bktsize == 0) {
805 break;
806 }
807
808 /*
809 * Both of the CPU's buckets are full; try to get an empty
810 * bucket from the bucket layer. Upon success, empty this
811 * CPU and place any full bucket into the full list.
812 */
813 bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
814 if (bkt != NULL) {
815 if (ccp->cc_pfilled != NULL) {
816 mcache_bkt_free(cp, &cp->mc_full,
817 ccp->cc_pfilled);
818 }
819 mcache_cpu_refill(ccp, bkt, 0);
820 continue;
821 }
822
823 /*
824 * We need an empty bucket to put our freed objects into
825 * but couldn't get an empty bucket from the bucket layer;
826 * attempt to allocate one. We do not want to block for
827 * allocation here, and if the bucket allocation fails
828 * we will simply fall through to the slab layer.
829 */
830 MCACHE_UNLOCK(&ccp->cc_lock);
831 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
832 MCACHE_LOCK(&ccp->cc_lock);
833
834 if (bkt != NULL) {
835 /*
836 * We have an empty bucket, but since we drop the
837 * CPU lock above, the cache's bucket size may have
838 * changed. If so, free the bucket and try again.
839 */
840 if (ccp->cc_bktsize != btp->bt_bktsize) {
841 MCACHE_UNLOCK(&ccp->cc_lock);
842 mcache_free(btp->bt_cache, bkt);
843 MCACHE_LOCK(&ccp->cc_lock);
844 continue;
845 }
846
847 /*
848 * We have an empty bucket of the right size;
849 * add it to the bucket layer and try again.
850 */
851 mcache_bkt_free(cp, &cp->mc_empty, bkt);
852 continue;
853 }
854
855 /*
856 * The bucket layer has no empty buckets; free the
857 * object(s) directly to the slab layer.
858 */
859 break;
860 }
861 MCACHE_UNLOCK(&ccp->cc_lock);
862
863 /* If there is a waiter below, notify it */
864 if (cp->mc_waiter_cnt > 0) {
865 mcache_notify(cp, MCN_RETRYALLOC);
866 }
867
868 /* Advise the slab layer to purge the object(s) */
869 (*cp->mc_slab_free)(cp->mc_private, list,
870 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
871 }
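/*
 * Illustrative sketch: to batch-free, a caller chains the objects
 * through obj_next and hands over the head of the list, just as
 * mcache_free() above does for the single-object case.
 *
 *	a->obj_next = b;
 *	b->obj_next = NULL;
 *	mcache_free_ext(cp, a);
 */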
872
873 /*
874 * Cache destruction routine.
875 */
876 __private_extern__ void
877 mcache_destroy(mcache_t *cp)
878 {
879 void **pbuf;
880
881 MCACHE_LIST_LOCK();
882 LIST_REMOVE(cp, mc_list);
883 MCACHE_LIST_UNLOCK();
884
885 mcache_bkt_purge(cp);
886
887 /*
888 	 * This cache is dead; there should be no further transactions.
889 * If it's still invoked, make sure that it induces a fault.
890 */
891 cp->mc_slab_alloc = NULL;
892 cp->mc_slab_free = NULL;
893 cp->mc_slab_audit = NULL;
894
895 lck_attr_free(cp->mc_bkt_lock_attr);
896 lck_grp_free(cp->mc_bkt_lock_grp);
897 lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
898
899 lck_attr_free(cp->mc_cpu_lock_attr);
900 lck_grp_free(cp->mc_cpu_lock_grp);
901 lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
902
903 lck_attr_free(cp->mc_sync_lock_attr);
904 lck_grp_free(cp->mc_sync_lock_grp);
905 lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
906
907 /*
908 	 * TODO: We need to destroy the zone here, but cannot do so
909 	 * because the zone allocator provides no way to do it. Until then,
910 	 * the memory allocated for the zone structure is leaked.
911 	 * Once that becomes possible, uncomment these lines:
912 *
913 * if (cp->mc_slab_zone != NULL) {
914 * zdestroy(cp->mc_slab_zone);
915 * cp->mc_slab_zone = NULL;
916 * }
917 */
918
919 /* Get the original address since we're about to free it */
920 pbuf = (void **)((intptr_t)cp - sizeof(void *));
921
922 zfree(mcache_zone, *pbuf);
923 }
924
925 /*
926 * Internal slab allocator used as a backend for simple caches. The current
927 * implementation uses the zone allocator for simplicity reasons.
928 */
929 static unsigned int
930 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
931 int wait)
932 {
933 #pragma unused(wait)
934 mcache_t *cp = arg;
935 unsigned int need = num;
936 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
937 u_int32_t flags = cp->mc_flags;
938 void *buf, *base, **pbuf;
939 mcache_obj_t **list = *plist;
940
941 *list = NULL;
942
943 for (;;) {
944 buf = zalloc(cp->mc_slab_zone);
945 if (buf == NULL) {
946 break;
947 }
948
949 /* Get the aligned base address for this object */
950 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
951 cp->mc_align);
952
953 /*
954 * Wind back a pointer size from the aligned base and
955 * save the original address so we can free it later.
956 */
957 pbuf = (void **)((intptr_t)base - sizeof(void *));
958 *pbuf = buf;
959
960 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
961 ((intptr_t)buf + cp->mc_chunksize));
962
963 /*
964 * If auditing is enabled, patternize the contents of
965 * the buffer starting from the 64-bit aligned base to
966 * the end of the buffer; the length is rounded up to
967 		 * the nearest 64-bit multiple; this is because we use
968 * 64-bit memory access to set/check the pattern.
969 */
970 if (flags & MCF_DEBUG) {
971 VERIFY(((intptr_t)base + rsize) <=
972 ((intptr_t)buf + cp->mc_chunksize));
973 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
974 }
975
976 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
977 *list = (mcache_obj_t *)base;
978
979 (*list)->obj_next = NULL;
980 list = *plist = &(*list)->obj_next;
981
982 /* If we got them all, return to mcache */
983 if (--need == 0) {
984 break;
985 }
986 }
987
988 return num - need;
989 }
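/*
 * Resulting chunk layout (illustrative, LP64 pointer size assumed):
 *
 *	|<--------------------- mc_chunksize --------------------->|
 *	buf (from zalloc)     pbuf       base (mc_align aligned)
 *	|......padding......| buf ptr | object, mc_bufsize bytes |..|
 *
 * The hidden back-pointer at base - sizeof (void *) is what
 * mcache_slab_free() and mcache_slab_audit() read back to recover the
 * original zone allocation.
 */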
990
991 /*
992 * Internal slab deallocator used as a backend for simple caches.
993 */
994 static void
995 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
996 {
997 mcache_t *cp = arg;
998 mcache_obj_t *nlist;
999 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1000 u_int32_t flags = cp->mc_flags;
1001 void *base;
1002 void **pbuf;
1003
1004 for (;;) {
1005 nlist = list->obj_next;
1006 list->obj_next = NULL;
1007
1008 base = list;
1009 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1010
1011 /* Get the original address since we're about to free it */
1012 pbuf = (void **)((intptr_t)base - sizeof(void *));
1013
1014 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1015 ((intptr_t)*pbuf + cp->mc_chunksize));
1016
1017 if (flags & MCF_DEBUG) {
1018 VERIFY(((intptr_t)base + rsize) <=
1019 ((intptr_t)*pbuf + cp->mc_chunksize));
1020 mcache_audit_free_verify(NULL, base, 0, rsize);
1021 }
1022
1023 /* Free it to zone */
1024 zfree(cp->mc_slab_zone, *pbuf);
1025
1026 /* No more objects to free; return to mcache */
1027 if ((list = nlist) == NULL) {
1028 break;
1029 }
1030 }
1031 }
1032
1033 /*
1034 * Internal slab auditor for simple caches.
1035 */
1036 static void
1037 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1038 {
1039 mcache_t *cp = arg;
1040 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1041 void *base, **pbuf;
1042
1043 while (list != NULL) {
1044 mcache_obj_t *next = list->obj_next;
1045
1046 base = list;
1047 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1048
1049 /* Get the original address */
1050 pbuf = (void **)((intptr_t)base - sizeof(void *));
1051
1052 VERIFY(((intptr_t)base + rsize) <=
1053 ((intptr_t)*pbuf + cp->mc_chunksize));
1054
1055 if (!alloc) {
1056 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1057 } else {
1058 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1059 }
1060
1061 list = list->obj_next = next;
1062 }
1063 }
1064
1065 /*
1066 * Refill the CPU's filled bucket with bkt and save the previous one.
1067 */
1068 static void
1069 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1070 {
1071 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1072 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1073 ASSERT(ccp->cc_bktsize > 0);
1074
1075 ccp->cc_pfilled = ccp->cc_filled;
1076 ccp->cc_pobjs = ccp->cc_objs;
1077 ccp->cc_filled = bkt;
1078 ccp->cc_objs = objs;
1079 }
1080
1081 /*
1082 * Allocate a bucket from the bucket layer.
1083 */
1084 static mcache_bkt_t *
1085 mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
1086 {
1087 mcache_bkt_t *bkt;
1088
1089 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1090 /*
1091 * The bucket layer lock is held by another CPU; increase
1092 		 * the contention count so that we can later adjust the
1093 		 * bucket size accordingly.
1094 */
1095 MCACHE_LOCK(&cp->mc_bkt_lock);
1096 cp->mc_bkt_contention++;
1097 }
1098
1099 if ((bkt = blp->bl_list) != NULL) {
1100 blp->bl_list = bkt->bkt_next;
1101 if (--blp->bl_total < blp->bl_min) {
1102 blp->bl_min = blp->bl_total;
1103 }
1104 blp->bl_alloc++;
1105 }
1106
1107 if (btp != NULL) {
1108 *btp = cp->cache_bkttype;
1109 }
1110
1111 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1112
1113 return bkt;
1114 }
1115
1116 /*
1117 * Free a bucket to the bucket layer.
1118 */
1119 static void
1120 mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1121 {
1122 MCACHE_LOCK(&cp->mc_bkt_lock);
1123
1124 bkt->bkt_next = blp->bl_list;
1125 blp->bl_list = bkt;
1126 blp->bl_total++;
1127
1128 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1129 }
1130
1131 /*
1132 * Enable the bucket layer of a cache.
1133 */
1134 static void
1135 mcache_cache_bkt_enable(mcache_t *cp)
1136 {
1137 mcache_cpu_t *ccp;
1138 int cpu;
1139
1140 if (cp->mc_flags & MCF_NOCPUCACHE) {
1141 return;
1142 }
1143
1144 for (cpu = 0; cpu < ncpu; cpu++) {
1145 ccp = &cp->mc_cpu[cpu];
1146 MCACHE_LOCK(&ccp->cc_lock);
1147 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1148 MCACHE_UNLOCK(&ccp->cc_lock);
1149 }
1150 }
1151
1152 /*
1153 * Purge all buckets from a cache and disable its bucket layer.
1154 */
1155 static void
1156 mcache_bkt_purge(mcache_t *cp)
1157 {
1158 mcache_cpu_t *ccp;
1159 mcache_bkt_t *bp, *pbp;
1160 mcache_bkttype_t *btp;
1161 int cpu, objs, pobjs;
1162
1163 for (cpu = 0; cpu < ncpu; cpu++) {
1164 ccp = &cp->mc_cpu[cpu];
1165
1166 MCACHE_LOCK(&ccp->cc_lock);
1167
1168 btp = cp->cache_bkttype;
1169 bp = ccp->cc_filled;
1170 pbp = ccp->cc_pfilled;
1171 objs = ccp->cc_objs;
1172 pobjs = ccp->cc_pobjs;
1173 ccp->cc_filled = NULL;
1174 ccp->cc_pfilled = NULL;
1175 ccp->cc_objs = -1;
1176 ccp->cc_pobjs = -1;
1177 ccp->cc_bktsize = 0;
1178
1179 MCACHE_UNLOCK(&ccp->cc_lock);
1180
1181 if (bp != NULL) {
1182 mcache_bkt_destroy(cp, btp, bp, objs);
1183 }
1184 if (pbp != NULL) {
1185 mcache_bkt_destroy(cp, btp, pbp, pobjs);
1186 }
1187 }
1188
1189 mcache_bkt_ws_zero(cp);
1190 mcache_bkt_ws_reap(cp);
1191 }
1192
1193 /*
1194 * Free one or more objects in the bucket to the slab layer,
1195 * and also free the bucket itself.
1196 */
1197 static void
1198 mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
1199 int nobjs)
1200 {
1201 if (nobjs > 0) {
1202 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1203
1204 if (cp->mc_flags & MCF_DEBUG) {
1205 mcache_obj_t *o = top;
1206 int cnt = 0;
1207
1208 /*
1209 * Verify that the chain of objects in the bucket is
1210 * valid. Any mismatch here means a mistake when the
1211 * object(s) were freed to the CPU layer, so we panic.
1212 */
1213 while (o != NULL) {
1214 o = o->obj_next;
1215 ++cnt;
1216 }
1217 if (cnt != nobjs) {
1218 panic("mcache_bkt_destroy: %s cp %p corrupted "
1219 "list in bkt %p (nobjs %d actual %d)\n",
1220 cp->mc_name, (void *)cp, (void *)bkt,
1221 nobjs, cnt);
1222 }
1223 }
1224
1225 /* Advise the slab layer to purge the object(s) */
1226 (*cp->mc_slab_free)(cp->mc_private, top,
1227 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1228 }
1229 mcache_free(btp->bt_cache, bkt);
1230 }
1231
1232 /*
1233 * Update the bucket layer working set statistics.
1234 */
1235 static void
1236 mcache_bkt_ws_update(mcache_t *cp)
1237 {
1238 MCACHE_LOCK(&cp->mc_bkt_lock);
1239
1240 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1241 cp->mc_full.bl_min = cp->mc_full.bl_total;
1242 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1243 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1244
1245 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1246 }
1247
1248 /*
1249 * Mark everything as eligible for reaping (working set is zero).
1250 */
1251 static void
1252 mcache_bkt_ws_zero(mcache_t *cp)
1253 {
1254 MCACHE_LOCK(&cp->mc_bkt_lock);
1255
1256 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1257 cp->mc_full.bl_min = cp->mc_full.bl_total;
1258 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1259 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1260
1261 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1262 }
1263
1264 /*
1265 * Reap all buckets that are beyond the working set.
1266 */
1267 static void
1268 mcache_bkt_ws_reap(mcache_t *cp)
1269 {
1270 long reap;
1271 mcache_bkt_t *bkt;
1272 mcache_bkttype_t *btp;
1273
1274 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1275 while (reap-- &&
1276 (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL) {
1277 mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
1278 }
1279
1280 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1281 while (reap-- &&
1282 (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL) {
1283 mcache_bkt_destroy(cp, btp, bkt, 0);
1284 }
1285 }
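/*
 * Worked example (illustrative): if the full-bucket count never dropped
 * below 5 during the previous update interval (bl_reaplimit == 5) and
 * never below 8 during the current one (bl_min == 8), then MIN(5, 8) == 5
 * full buckets sat idle across both intervals and are reaped here; the
 * same computation is applied to the empty-bucket list.
 */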
1286
1287 static void
1288 mcache_reap_timeout(thread_call_param_t dummy __unused,
1289 thread_call_param_t arg)
1290 {
1291 volatile UInt32 *flag = arg;
1292
1293 ASSERT(flag == &mcache_reaping);
1294
1295 *flag = 0;
1296 }
1297
1298 static void
1299 mcache_reap_done(void *flag)
1300 {
1301 uint64_t deadline, leeway;
1302
1303 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1304 &deadline);
1305 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1306 NSEC_PER_SEC, &leeway);
1307 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1308 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1309 }
1310
1311 static void
1312 mcache_reap_start(void *arg)
1313 {
1314 UInt32 *flag = arg;
1315
1316 ASSERT(flag == &mcache_reaping);
1317
1318 mcache_applyall(mcache_cache_reap);
1319 mcache_dispatch(mcache_reap_done, flag);
1320 }
1321
1322 __private_extern__ void
1323 mcache_reap(void)
1324 {
1325 UInt32 *flag = &mcache_reaping;
1326
1327 if (mcache_llock_owner == current_thread() ||
1328 !OSCompareAndSwap(0, 1, flag)) {
1329 return;
1330 }
1331
1332 mcache_dispatch(mcache_reap_start, flag);
1333 }
1334
1335 __private_extern__ void
1336 mcache_reap_now(mcache_t *cp, boolean_t purge)
1337 {
1338 if (purge) {
1339 mcache_bkt_purge(cp);
1340 mcache_cache_bkt_enable(cp);
1341 } else {
1342 mcache_bkt_ws_zero(cp);
1343 mcache_bkt_ws_reap(cp);
1344 }
1345 }
1346
1347 static void
1348 mcache_cache_reap(mcache_t *cp)
1349 {
1350 mcache_bkt_ws_reap(cp);
1351 }
1352
1353 /*
1354 * Performs periodic maintenance on a cache.
1355 */
1356 static void
1357 mcache_cache_update(mcache_t *cp)
1358 {
1359 int need_bkt_resize = 0;
1360 int need_bkt_reenable = 0;
1361
1362 lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1363
1364 mcache_bkt_ws_update(cp);
1365
1366 /*
1367 * Cache resize and post-purge reenable are mutually exclusive.
1368 	 * If the cache was previously purged, there is no point in
1369 * increasing the bucket size as there was an indication of
1370 * memory pressure on the system.
1371 */
1372 lck_mtx_lock_spin(&cp->mc_sync_lock);
1373 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
1374 need_bkt_reenable = 1;
1375 }
1376 lck_mtx_unlock(&cp->mc_sync_lock);
1377
1378 MCACHE_LOCK(&cp->mc_bkt_lock);
1379 /*
1380 * If the contention count is greater than the threshold, and if
1381 * we are not already at the maximum bucket size, increase it.
1382 * Otherwise, if this cache was previously purged by the user
1383 * then we simply reenable it.
1384 */
1385 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1386 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1387 mcache_bkt_contention && !need_bkt_reenable) {
1388 need_bkt_resize = 1;
1389 }
1390
1391 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1392 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1393
1394 if (need_bkt_resize) {
1395 mcache_dispatch(mcache_cache_bkt_resize, cp);
1396 } else if (need_bkt_reenable) {
1397 mcache_dispatch(mcache_cache_enable, cp);
1398 }
1399 }
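/*
 * Worked example (illustrative): with mcache_bkt_contention == 3, a cache
 * whose bucket-layer lock saw more than 3 trylock misses since the last
 * update (mc_bkt_contention grew by more than 3) and whose chunk size is
 * still below bt_maxbuf gets mcache_cache_bkt_resize() dispatched, moving
 * it to the next, larger row of mcache_bkttype.
 */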
1400
1401 /*
1402 * Recompute a cache's bucket size. This is an expensive operation
1403 * and should not be done frequently; larger buckets provide for a
1404 * higher transfer rate with the bucket while smaller buckets reduce
1405 * the memory consumption.
1406 */
1407 static void
1408 mcache_cache_bkt_resize(void *arg)
1409 {
1410 mcache_t *cp = arg;
1411 mcache_bkttype_t *btp = cp->cache_bkttype;
1412
1413 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1414 mcache_bkt_purge(cp);
1415
1416 /*
1417 * Upgrade to the next bucket type with larger bucket size;
1418 		 * temporarily inflate the previous contention snapshot so that
1419 		 * the contention delta goes negative, preventing an unnecessary resize.
1420 */
1421 MCACHE_LOCK(&cp->mc_bkt_lock);
1422 cp->cache_bkttype = ++btp;
1423 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1424 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1425
1426 mcache_cache_enable(cp);
1427 }
1428 }
1429
1430 /*
1431 * Reenable a previously disabled cache due to purge.
1432 */
1433 static void
1434 mcache_cache_enable(void *arg)
1435 {
1436 mcache_t *cp = arg;
1437
1438 lck_mtx_lock_spin(&cp->mc_sync_lock);
1439 cp->mc_purge_cnt = 0;
1440 cp->mc_enable_cnt = 0;
1441 lck_mtx_unlock(&cp->mc_sync_lock);
1442
1443 mcache_cache_bkt_enable(cp);
1444 }
1445
1446 static void
1447 mcache_update_timeout(__unused void *arg)
1448 {
1449 uint64_t deadline, leeway;
1450
1451 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1452 &deadline);
1453 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1454 NSEC_PER_SEC, &leeway);
1455 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1456 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1457 }
1458
1459 static void
1460 mcache_update(thread_call_param_t arg __unused,
1461 thread_call_param_t dummy __unused)
1462 {
1463 mcache_applyall(mcache_cache_update);
1464 mcache_update_timeout(NULL);
1465 }
1466
1467 static void
1468 mcache_applyall(void (*func)(mcache_t *))
1469 {
1470 mcache_t *cp;
1471
1472 MCACHE_LIST_LOCK();
1473 LIST_FOREACH(cp, &mcache_head, mc_list) {
1474 func(cp);
1475 }
1476 MCACHE_LIST_UNLOCK();
1477 }
1478
1479 static void
1480 mcache_dispatch(void (*func)(void *), void *arg)
1481 {
1482 ASSERT(func != NULL);
1483 timeout(func, arg, hz / 1000);
1484 }
1485
1486 __private_extern__ void
1487 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1488 struct timeval *base_ts)
1489 {
1490 struct timeval now, base = { 0, 0 };
1491 void *stack[MCACHE_STACK_DEPTH + 1];
1492 struct mca_trn *transaction;
1493
1494 transaction = &mca->mca_trns[mca->mca_next_trn];
1495
1496 mca->mca_addr = addr;
1497 mca->mca_cache = cp;
1498
1499 transaction->mca_thread = current_thread();
1500
1501 bzero(stack, sizeof(stack));
1502 transaction->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1503 bcopy(&stack[1], transaction->mca_stack,
1504 sizeof(transaction->mca_stack));
1505
1506 microuptime(&now);
1507 if (base_ts != NULL) {
1508 base = *base_ts;
1509 }
1510 /* tstamp is in ms relative to base_ts */
1511 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1512 if ((now.tv_sec - base.tv_sec) > 0) {
1513 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1514 }
1515
1516 mca->mca_next_trn =
1517 (mca->mca_next_trn + 1) % mca_trn_max;
1518 }
1519
1520 __private_extern__ void
1521 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1522 {
1523 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1524 u_int64_t *buf = (u_int64_t *)buf_arg;
1525
1526 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1527 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1528
1529 while (buf < buf_end) {
1530 *buf++ = pattern;
1531 }
1532 }
1533
1534 __private_extern__ void *
1535 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1536 {
1537 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1538 u_int64_t *buf;
1539
1540 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1541 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1542
1543 for (buf = buf_arg; buf < buf_end; buf++) {
1544 if (*buf != pattern) {
1545 return buf;
1546 }
1547 }
1548 return NULL;
1549 }
1550
1551 __private_extern__ void *
1552 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1553 size_t size)
1554 {
1555 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1556 u_int64_t *buf;
1557
1558 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1559 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1560
1561 for (buf = buf_arg; buf < buf_end; buf++) {
1562 if (*buf != old) {
1563 mcache_set_pattern(old, buf_arg,
1564 (uintptr_t)buf - (uintptr_t)buf_arg);
1565 return buf;
1566 }
1567 *buf = new;
1568 }
1569 return NULL;
1570 }
1571
1572 __private_extern__ void
1573 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1574 size_t size)
1575 {
1576 void *addr;
1577 u_int64_t *oaddr64;
1578 mcache_obj_t *next;
1579
1580 addr = (void *)((uintptr_t)base + offset);
1581 next = ((mcache_obj_t *)addr)->obj_next;
1582
1583 /* For the "obj_next" pointer in the buffer */
1584 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1585 *oaddr64 = MCACHE_FREE_PATTERN;
1586
1587 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1588 (caddr_t)base, size)) != NULL) {
1589 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1590 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1591 /* NOTREACHED */
1592 }
1593 ((mcache_obj_t *)addr)->obj_next = next;
1594 }
1595
1596 __private_extern__ void
1597 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1598 size_t size)
1599 {
1600 void *addr;
1601 u_int64_t *oaddr64;
1602 mcache_obj_t *next;
1603
1604 addr = (void *)((uintptr_t)base + offset);
1605 next = ((mcache_obj_t *)addr)->obj_next;
1606
1607 /* For the "obj_next" pointer in the buffer */
1608 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1609 *oaddr64 = MCACHE_FREE_PATTERN;
1610
1611 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1612 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1613 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1614 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1615 /* NOTREACHED */
1616 }
1617 ((mcache_obj_t *)addr)->obj_next = next;
1618 }
1619
1620 #undef panic
1621
1622 #define DUMP_TRN_FMT() \
1623 "%s transaction thread %p saved PC stack (%d deep):\n" \
1624 "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1625 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1626
1627 #define DUMP_TRN_FIELDS(s, x) \
1628 s, \
1629 mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1630 mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1631 mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1632 mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1633 mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1634 mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1635 mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1636 mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1637 mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1638
1639 #define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1640 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1641
1642 __private_extern__ char *
1643 mcache_dump_mca(mcache_audit_t *mca)
1644 {
1645 if (mca_dump_buf == NULL) {
1646 return NULL;
1647 }
1648
1649 snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1650 "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1651 DUMP_TRN_FMT()
1652 DUMP_TRN_FMT(),
1653
1654 mca, mca->mca_addr, mca->mca_cache,
1655 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1656 mca->mca_next_trn,
1657
1658 DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1659 DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1660
1661 return mca_dump_buf;
1662 }
1663
1664 __private_extern__ void
1665 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1666 int64_t expected, int64_t got)
1667 {
1668 if (mca == NULL) {
1669 panic("mcache_audit: buffer %p modified after free at "
1670 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1671 offset, got, expected);
1672 /* NOTREACHED */
1673 }
1674
1675 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1676 "(0x%llx instead of 0x%llx)\n%s\n",
1677 addr, offset, got, expected, mcache_dump_mca(mca));
1678 /* NOTREACHED */
1679 }
1680
1681 __private_extern__ int
1682 assfail(const char *a, const char *f, int l)
1683 {
1684 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1685 return 0;
1686 }