1 /*
2 * Copyright (c) 2006-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39  * it provides greater flexibility by allowing the caller to supply its
40  * own slab allocator (instead of the default zone allocator). Finally,
41  * no object construction/destruction takes place at the moment, although
42  * this could be added in the future to improve efficiency.
43 */
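
/*
 * A minimal usage sketch (illustrative only; "struct foo" and "foo_cache"
 * are hypothetical names, not part of this file).  A client creates a
 * cache for a fixed-size object once and then allocates/frees through it:
 *
 *	static mcache_t *foo_cache;
 *
 *	foo_cache = mcache_create("foo", sizeof (struct foo), 0, 0,
 *	    MCR_SLEEP);			// align 0 selects MCACHE_ALIGN
 *
 *	struct foo *fp = mcache_alloc(foo_cache, MCR_SLEEP);
 *	...
 *	mcache_free(foo_cache, fp);
 *
 * Batched transfers are done with mcache_alloc_ext()/mcache_free_ext();
 * see the sketch following mcache_alloc_ext() below.
 */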
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
71 #define MCACHE_SIZE(n) \
72 __builtin_offsetof(mcache_t, mc_cpu[n])
73
74 /* Allocate extra in case we need to manually align the pointer */
75 #define MCACHE_ALLOC_SIZE \
76 (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)
77
78 #define MCACHE_CPU(c) \
79 (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
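
/*
 * In effect, MCACHE_SIZE(n) is the size of an mcache_t carrying n trailing
 * per-CPU structures (mc_cpu[]), and MCACHE_CPU(c) evaluates to
 * &(c)->mc_cpu[cpu_number()], i.e. the per-CPU structure of the CPU the
 * caller is currently running on.
 */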
80
81 /*
82 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
83 * to serialize accesses to the global list of caches in the system.
84 * They also record the thread currently running in the critical
85 * section, so that we can avoid recursive requests to reap the
86 * caches when memory runs low.
87 */
88 #define MCACHE_LIST_LOCK() { \
89 lck_mtx_lock(mcache_llock); \
90 mcache_llock_owner = current_thread(); \
91 }
92
93 #define MCACHE_LIST_UNLOCK() { \
94 mcache_llock_owner = NULL; \
95 lck_mtx_unlock(mcache_llock); \
96 }
97
98 #define MCACHE_LOCK(l) lck_mtx_lock(l)
99 #define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
100 #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
102 static int ncpu;
103 static unsigned int cache_line_size;
104 static lck_mtx_t *mcache_llock;
105 static struct thread *mcache_llock_owner;
106 static lck_attr_t *mcache_llock_attr;
107 static lck_grp_t *mcache_llock_grp;
108 static lck_grp_attr_t *mcache_llock_grp_attr;
109 static struct zone *mcache_zone;
110 static const uint32_t mcache_reap_interval = 15;
111 static const uint32_t mcache_reap_interval_leeway = 2;
112 static UInt32 mcache_reaping;
113 static int mcache_ready;
114 static int mcache_updating;
115
116 static int mcache_bkt_contention = 3;
117 #if DEBUG
118 static unsigned int mcache_flags = MCF_DEBUG;
119 #else
120 static unsigned int mcache_flags = 0;
121 #endif
122
123 int mca_trn_max = MCA_TRN_MAX;
124
125 #define DUMP_MCA_BUF_SIZE 512
126 static char *mca_dump_buf;
127
128 static mcache_bkttype_t mcache_bkttype[] = {
129 { 1, 4096, 32768, NULL },
130 { 3, 2048, 16384, NULL },
131 { 7, 1024, 12288, NULL },
132 { 15, 256, 8192, NULL },
133 { 31, 64, 4096, NULL },
134 { 47, 0, 2048, NULL },
135 { 63, 0, 1024, NULL },
136 { 95, 0, 512, NULL },
137 { 143, 0, 256, NULL },
138 { 165, 0, 0, NULL },
139 };
140
141 static mcache_t *mcache_create_common(const char *, size_t, size_t,
142 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
143 mcache_notifyfn_t, void *, u_int32_t, int, int);
144 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
145 unsigned int, int);
146 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
147 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
148 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
149 static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *);
150 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
151 static void mcache_cache_bkt_enable(mcache_t *);
152 static void mcache_bkt_purge(mcache_t *);
153 static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
154 static void mcache_bkt_ws_update(mcache_t *);
155 static void mcache_bkt_ws_zero(mcache_t *);
156 static void mcache_bkt_ws_reap(mcache_t *);
157 static void mcache_dispatch(void (*)(void *), void *);
158 static void mcache_cache_reap(mcache_t *);
159 static void mcache_cache_update(mcache_t *);
160 static void mcache_cache_bkt_resize(void *);
161 static void mcache_cache_enable(void *);
162 static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
163 static void mcache_update_timeout(void *);
164 static void mcache_applyall(void (*)(mcache_t *));
165 static void mcache_reap_start(void *);
166 static void mcache_reap_done(void *);
167 static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
168 static void mcache_notify(mcache_t *, u_int32_t);
169 static void mcache_purge(void *);
170
171 static LIST_HEAD(, mcache) mcache_head;
172 mcache_t *mcache_audit_cache;
173
174 static thread_call_t mcache_reap_tcall;
175 static thread_call_t mcache_update_tcall;
176
177 /*
178 * Initialize the framework; this is currently called as part of BSD init.
179 */
180 __private_extern__ void
181 mcache_init(void)
182 {
183 mcache_bkttype_t *btp;
184 unsigned int i;
185 char name[32];
186
187 VERIFY(mca_trn_max >= 2);
188
189 ncpu = ml_get_max_cpus();
190 (void) mcache_cache_line_size(); /* prime it */
191
192 mcache_llock_grp_attr = lck_grp_attr_alloc_init();
193 mcache_llock_grp = lck_grp_alloc_init("mcache.list",
194 mcache_llock_grp_attr);
195 mcache_llock_attr = lck_attr_alloc_init();
196 mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
197
198 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
199 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
200 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
201 panic("mcache_init: thread_call_allocate failed");
202 /* NOTREACHED */
203 __builtin_unreachable();
204 }
205
206 mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
207 PAGE_SIZE, "mcache");
208 if (mcache_zone == NULL) {
209 panic("mcache_init: failed to allocate mcache zone\n");
210 /* NOTREACHED */
211 __builtin_unreachable();
212 }
213 zone_change(mcache_zone, Z_CALLERACCT, FALSE);
214
215 LIST_INIT(&mcache_head);
216
217 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
218 btp = &mcache_bkttype[i];
219 (void) snprintf(name, sizeof(name), "bkt_%d",
220 btp->bt_bktsize);
221 btp->bt_cache = mcache_create(name,
222 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
223 }
224
225 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
226 mcache_flags &= MCF_FLAGS_MASK;
227
228 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
229 0, 0, MCR_SLEEP);
230
231 mcache_applyall(mcache_cache_bkt_enable);
232 mcache_ready = 1;
233
234 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
235 ncpu, CPU_CACHE_LINE_SIZE);
236 }
237
238 /*
239 * Return the global mcache flags.
240 */
241 __private_extern__ unsigned int
242 mcache_getflags(void)
243 {
244 return mcache_flags;
245 }
246
247 /*
248 * Return the CPU cache line size.
249 */
250 __private_extern__ unsigned int
251 mcache_cache_line_size(void)
252 {
253 if (cache_line_size == 0) {
254 ml_cpu_info_t cpu_info;
255 ml_cpu_get_info(&cpu_info);
256 cache_line_size = cpu_info.cache_line_size;
257 }
258 return cache_line_size;
259 }
260
261 /*
262 * Create a cache using the zone allocator as the backend slab allocator.
263 * The caller may specify any alignment for the object; if it specifies 0
264 * the default alignment (MCACHE_ALIGN) will be used.
265 */
266 __private_extern__ mcache_t *
267 mcache_create(const char *name, size_t bufsize, size_t align,
268 u_int32_t flags, int wait)
269 {
270 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
271 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
272 wait);
273 }
274
275 /*
276 * Create a cache using a custom backend slab allocator. Since the caller
277 * is responsible for allocation, no alignment guarantee will be provided
278 * by this framework.
279 */
280 __private_extern__ mcache_t *
281 mcache_create_ext(const char *name, size_t bufsize,
282 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
283 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
284 u_int32_t flags, int wait)
285 {
286 return mcache_create_common(name, bufsize, 0, allocfn,
287 freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait);
288 }
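
/*
 * A minimal sketch of a custom backend (illustrative; my_alloc, my_free,
 * my_pool_get, my_pool_put, my_pool and MY_OBJ_SIZE are hypothetical).
 * The callback shapes are assumed to match those of mcache_slab_alloc()
 * and mcache_slab_free() below, which are registered through the same
 * typedefs:
 *
 *	static unsigned int
 *	my_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
 *	{
 *		mcache_obj_t **list = *plist;
 *		unsigned int allocated = 0;
 *
 *		while (allocated < num) {
 *			void *buf = my_pool_get(arg, wait);
 *			if (buf == NULL)
 *				break;
 *			*list = (mcache_obj_t *)buf;
 *			(*list)->obj_next = NULL;
 *			list = *plist = &(*list)->obj_next;
 *			allocated++;
 *		}
 *		return (allocated);
 *	}
 *
 *	static void
 *	my_free(void *arg, mcache_obj_t *list, boolean_t purged)
 *	{
 *		mcache_obj_t *next;
 *
 *		for (; list != NULL; list = next) {
 *			next = list->obj_next;
 *			my_pool_put(arg, list);
 *		}
 *	}
 *
 *	cp = mcache_create_ext("my_objs", MY_OBJ_SIZE, my_alloc, my_free,
 *	    NULL, NULL, NULL, my_pool, 0, MCR_SLEEP);
 */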
289
290 /*
291 * Common cache creation routine.
292 */
293 static mcache_t *
294 mcache_create_common(const char *name, size_t bufsize, size_t align,
295 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
296 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
297 u_int32_t flags, int need_zone, int wait)
298 {
299 mcache_bkttype_t *btp;
300 mcache_t *cp = NULL;
301 size_t chunksize;
302 void *buf, **pbuf;
303 int c;
304 char lck_name[64];
305
306 /* If auditing is on and print buffer is NULL, allocate it now */
307 if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
308 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
309 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
310 malloc_wait | M_ZERO);
311 if (mca_dump_buf == NULL) {
312 return NULL;
313 }
314 }
315
316 buf = zalloc(mcache_zone);
317 if (buf == NULL) {
318 goto fail;
319 }
320
321 bzero(buf, MCACHE_ALLOC_SIZE);
322
323 /*
324  * In case we didn't get cache-aligned memory, round it up
325 * accordingly. This is needed in order to get the rest of
326 * structure members aligned properly. It also means that
327 * the memory span gets shifted due to the round up, but it
328 * is okay since we've allocated extra space for this.
329 */
330 cp = (mcache_t *)
331 P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
332 pbuf = (void **)((intptr_t)cp - sizeof(void *));
333 *pbuf = buf;
334
335 /*
336 * Guaranteed alignment is valid only when we use the internal
337 * slab allocator (currently set to use the zone allocator).
338 */
339 if (!need_zone) {
340 align = 1;
341 } else {
342 /* Enforce 64-bit minimum alignment for zone-based buffers */
343 if (align == 0) {
344 align = MCACHE_ALIGN;
345 }
346 align = P2ROUNDUP(align, MCACHE_ALIGN);
347 }
348
349 if ((align & (align - 1)) != 0) {
350 panic("mcache_create: bad alignment %lu", align);
351 /* NOTREACHED */
352 __builtin_unreachable();
353 }
354
355 cp->mc_align = align;
356 cp->mc_slab_alloc = allocfn;
357 cp->mc_slab_free = freefn;
358 cp->mc_slab_audit = auditfn;
359 cp->mc_slab_log = logfn;
360 cp->mc_slab_notify = notifyfn;
361 cp->mc_private = need_zone ? cp : arg;
362 cp->mc_bufsize = bufsize;
363 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
364
365 (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
366
367 (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
368 cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
369 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
370 cp->mc_cpu_lock_grp_attr);
371 cp->mc_cpu_lock_attr = lck_attr_alloc_init();
372
373 /*
374 * Allocation chunk size is the object's size plus any extra size
375 * needed to satisfy the object's alignment. It is enforced to be
376 * at least the size of an LP64 pointer to simplify auditing and to
377 * handle multiple-element allocation requests, where the elements
378 * returned are linked together in a list.
379 */
380 chunksize = MAX(bufsize, sizeof(u_int64_t));
381 if (need_zone) {
382 VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
383 chunksize += sizeof(uint64_t) + align;
384 chunksize = P2ROUNDUP(chunksize, align);
385 if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
386 PAGE_SIZE, cp->mc_name)) == NULL) {
387 goto fail;
388 }
389 zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
390 }
391 cp->mc_chunksize = chunksize;
392
393 /*
394 * Initialize the bucket layer.
395 */
396 (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
397 cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
398 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
399 cp->mc_bkt_lock_grp_attr);
400 cp->mc_bkt_lock_attr = lck_attr_alloc_init();
401 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
402 cp->mc_bkt_lock_attr);
403
404 (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
405 cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
406 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
407 cp->mc_sync_lock_grp_attr);
408 cp->mc_sync_lock_attr = lck_attr_alloc_init();
409 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
410 cp->mc_sync_lock_attr);
411
412 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
413 continue;
414 }
415
416 cp->cache_bkttype = btp;
417
418 /*
419 * Initialize the CPU layer. Each per-CPU structure is aligned
420 * on the CPU cache line boundary to prevent false sharing.
421 */
422 for (c = 0; c < ncpu; c++) {
423 mcache_cpu_t *ccp = &cp->mc_cpu[c];
424
425 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
426 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
427 cp->mc_cpu_lock_attr);
428 ccp->cc_objs = -1;
429 ccp->cc_pobjs = -1;
430 }
431
432 if (mcache_ready) {
433 mcache_cache_bkt_enable(cp);
434 }
435
436 /* TODO: dynamically create sysctl for stats */
437
438 MCACHE_LIST_LOCK();
439 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
440 MCACHE_LIST_UNLOCK();
441
442 /*
443 * If cache buckets are enabled and this is the first cache
444 * created, start the periodic cache update.
445 */
446 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
447 mcache_updating = 1;
448 mcache_update_timeout(NULL);
449 }
450 if (cp->mc_flags & MCF_DEBUG) {
451 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
452 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
453 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
454 }
455 return cp;
456
457 fail:
458 if (buf != NULL) {
459 zfree(mcache_zone, buf);
460 }
461 return NULL;
462 }
463
464 /*
465 * Allocate one or more objects from a cache.
466 */
467 __private_extern__ unsigned int
468 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
469 {
470 mcache_cpu_t *ccp;
471 mcache_obj_t **top = &(*list);
472 mcache_bkt_t *bkt;
473 unsigned int need = num;
474 boolean_t nwretry = FALSE;
475
476 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
477 VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));
478
479 ASSERT(list != NULL);
480 *list = NULL;
481
482 if (num == 0) {
483 return 0;
484 }
485
486 retry_alloc:
487 /* We may not always be running on the same CPU in case of retries */
488 ccp = MCACHE_CPU(cp);
489
490 MCACHE_LOCK(&ccp->cc_lock);
491 for (;;) {
492 /*
493 * If we have an object in the current CPU's filled bucket,
494 * chain the object to any previous objects and return if
495 * we've satisfied the number of requested objects.
496 */
497 if (ccp->cc_objs > 0) {
498 mcache_obj_t *tail;
499 int objs;
500
501 /*
502 * Objects in the bucket are already linked together
503 * with the most recently freed object at the head of
504 * the list; grab as many objects as we can.
505 */
506 objs = MIN((unsigned int)ccp->cc_objs, need);
507 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
508 ccp->cc_objs -= objs;
509 ccp->cc_alloc += objs;
510
511 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
512 list = &tail->obj_next;
513 *list = NULL;
514
515 /* If we got them all, return to caller */
516 if ((need -= objs) == 0) {
517 MCACHE_UNLOCK(&ccp->cc_lock);
518
519 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
520 cp->mc_slab_log != NULL) {
521 (*cp->mc_slab_log)(num, *top, TRUE);
522 }
523
524 if (cp->mc_flags & MCF_DEBUG) {
525 goto debug_alloc;
526 }
527
528 return num;
529 }
530 }
531
532 /*
533 * The CPU's filled bucket is empty. If the previous filled
534 * bucket was full, exchange and try again.
535 */
536 if (ccp->cc_pobjs > 0) {
537 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
538 continue;
539 }
540
541 /*
542 * If the bucket layer is disabled, allocate from slab. This
543 * can happen either because MCF_NOCPUCACHE is set, or because
544 * the bucket layer is currently being resized.
545 */
546 if (ccp->cc_bktsize == 0) {
547 break;
548 }
549
550 /*
551 * Both of the CPU's buckets are empty; try to get a full
552 * bucket from the bucket layer. Upon success, refill this
553 * CPU and place any empty bucket into the empty list.
554 */
555 bkt = mcache_bkt_alloc(cp, &cp->mc_full);
556 if (bkt != NULL) {
557 if (ccp->cc_pfilled != NULL) {
558 mcache_bkt_free(cp, &cp->mc_empty,
559 ccp->cc_pfilled);
560 }
561 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
562 continue;
563 }
564
565 /*
566 * The bucket layer has no full buckets; allocate the
567 * object(s) directly from the slab layer.
568 */
569 break;
570 }
571 MCACHE_UNLOCK(&ccp->cc_lock);
572
573 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
574
575 /*
576 * If this is a blocking allocation, or if it is non-blocking and
577 * the cache's full bucket is non-empty, then retry the allocation.
578 */
579 if (need > 0) {
580 if (!(wait & MCR_NONBLOCKING)) {
581 atomic_add_32(&cp->mc_wretry_cnt, 1);
582 goto retry_alloc;
583 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
584 !mcache_bkt_isempty(cp)) {
585 if (!nwretry) {
586 nwretry = TRUE;
587 }
588 atomic_add_32(&cp->mc_nwretry_cnt, 1);
589 goto retry_alloc;
590 } else if (nwretry) {
591 atomic_add_32(&cp->mc_nwfail_cnt, 1);
592 }
593 }
594
595 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
596 (*cp->mc_slab_log)((num - need), *top, TRUE);
597 }
598
599 if (!(cp->mc_flags & MCF_DEBUG)) {
600 return num - need;
601 }
602
603 debug_alloc:
604 if (cp->mc_flags & MCF_DEBUG) {
605 mcache_obj_t **o = top;
606 unsigned int n;
607
608 n = 0;
609 /*
610  * Verify that the chain of objects has the same count as
611 * what we are about to report to the caller. Any mismatch
612 * here means that the object list is insanely broken and
613 * therefore we must panic.
614 */
615 while (*o != NULL) {
616 o = &(*o)->obj_next;
617 ++n;
618 }
619 if (n != (num - need)) {
620 panic("mcache_alloc_ext: %s cp %p corrupted list "
621 "(got %d actual %d)\n", cp->mc_name,
622 (void *)cp, num - need, n);
623 /* NOTREACHED */
624 __builtin_unreachable();
625 }
626 }
627
628 /* Invoke the slab layer audit callback if auditing is enabled */
629 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
630 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
631 }
632
633 return num - need;
634 }
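
/*
 * Illustrative caller-side sketch (not part of this file): a batch of
 * objects is requested in one call, walked via obj_next, and later
 * returned as a chain in one call.  The function may deliver fewer
 * objects than requested, so the return value must be honored:
 *
 *	mcache_obj_t *list, *o;
 *	unsigned int got;
 *
 *	got = mcache_alloc_ext(cp, &list, 32, MCR_NOSLEEP);
 *	for (o = list; o != NULL; o = o->obj_next) {
 *		// exactly "got" objects are chained here
 *	}
 *	if (list != NULL)
 *		mcache_free_ext(cp, list);	// return the whole chain
 */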
635
636 /*
637 * Allocate a single object from a cache.
638 */
639 __private_extern__ void *
640 mcache_alloc(mcache_t *cp, int wait)
641 {
642 mcache_obj_t *buf;
643
644 (void) mcache_alloc_ext(cp, &buf, 1, wait);
645 return buf;
646 }
647
648 __private_extern__ void
649 mcache_waiter_inc(mcache_t *cp)
650 {
651 atomic_add_32(&cp->mc_waiter_cnt, 1);
652 }
653
654 __private_extern__ void
655 mcache_waiter_dec(mcache_t *cp)
656 {
657 atomic_add_32(&cp->mc_waiter_cnt, -1);
658 }
659
660 __private_extern__ boolean_t
661 mcache_bkt_isempty(mcache_t *cp)
662 {
663 /*
664 * This isn't meant to accurately tell whether there are
665 * any full buckets in the cache; it is simply a way to
666 * obtain "hints" about the state of the cache.
667 */
668 return cp->mc_full.bl_total == 0;
669 }
670
671 /*
672 * Notify the slab layer about an event.
673 */
674 static void
675 mcache_notify(mcache_t *cp, u_int32_t event)
676 {
677 if (cp->mc_slab_notify != NULL) {
678 (*cp->mc_slab_notify)(cp->mc_private, event);
679 }
680 }
681
682 /*
683 * Purge the cache and disable its buckets.
684 */
685 static void
686 mcache_purge(void *arg)
687 {
688 mcache_t *cp = arg;
689
690 mcache_bkt_purge(cp);
691 /*
692 * We cannot simply call mcache_cache_bkt_enable() from here as
693 * a bucket resize may be in flight and we would cause the CPU
694 * layers of the cache to point to different sizes. Therefore,
695 * we simply increment the enable count so that during the next
696 * periodic cache update the buckets can be reenabled.
697 */
698 lck_mtx_lock_spin(&cp->mc_sync_lock);
699 cp->mc_enable_cnt++;
700 lck_mtx_unlock(&cp->mc_sync_lock);
701 }
702
703 __private_extern__ boolean_t
704 mcache_purge_cache(mcache_t *cp, boolean_t async)
705 {
706 /*
707 * Purging a cache that has no per-CPU caches or is already
708 * in the process of being purged is rather pointless.
709 */
710 if (cp->mc_flags & MCF_NOCPUCACHE) {
711 return FALSE;
712 }
713
714 lck_mtx_lock_spin(&cp->mc_sync_lock);
715 if (cp->mc_purge_cnt > 0) {
716 lck_mtx_unlock(&cp->mc_sync_lock);
717 return FALSE;
718 }
719 cp->mc_purge_cnt++;
720 lck_mtx_unlock(&cp->mc_sync_lock);
721
722 if (async) {
723 mcache_dispatch(mcache_purge, cp);
724 } else {
725 mcache_purge(cp);
726 }
727
728 return TRUE;
729 }
730
731 /*
732 * Free a single object to a cache.
733 */
734 __private_extern__ void
735 mcache_free(mcache_t *cp, void *buf)
736 {
737 ((mcache_obj_t *)buf)->obj_next = NULL;
738 mcache_free_ext(cp, (mcache_obj_t *)buf);
739 }
740
741 /*
742 * Free one or more objects to a cache.
743 */
744 __private_extern__ void
745 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
746 {
747 mcache_cpu_t *ccp = MCACHE_CPU(cp);
748 mcache_bkttype_t *btp;
749 mcache_obj_t *nlist;
750 mcache_bkt_t *bkt;
751
752 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
753 (*cp->mc_slab_log)(0, list, FALSE);
754 }
755
756 /* Invoke the slab layer audit callback if auditing is enabled */
757 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
758 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
759 }
760
761 MCACHE_LOCK(&ccp->cc_lock);
762 for (;;) {
763 /*
764 * If there is space in the current CPU's filled bucket, put
765 * the object there and return once all objects are freed.
766 * Note the cast to unsigned integer takes care of the case
767 * where the bucket layer is disabled (when cc_objs is -1).
768 */
769 if ((unsigned int)ccp->cc_objs <
770 (unsigned int)ccp->cc_bktsize) {
771 /*
772 * Reverse the list while we place the object into the
773 * bucket; this effectively causes the most recently
774 * freed object(s) to be reused during allocation.
775 */
776 nlist = list->obj_next;
777 list->obj_next = (ccp->cc_objs == 0) ? NULL :
778 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
779 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
780 ccp->cc_free++;
781
782 if ((list = nlist) != NULL) {
783 continue;
784 }
785
786 /* We are done; return to caller */
787 MCACHE_UNLOCK(&ccp->cc_lock);
788
789 /* If there is a waiter below, notify it */
790 if (cp->mc_waiter_cnt > 0) {
791 mcache_notify(cp, MCN_RETRYALLOC);
792 }
793 return;
794 }
795
796 /*
797 * The CPU's filled bucket is full. If the previous filled
798 * bucket was empty, exchange and try again.
799 */
800 if (ccp->cc_pobjs == 0) {
801 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
802 continue;
803 }
804
805 /*
806 * If the bucket layer is disabled, free to slab. This can
807 * happen either because MCF_NOCPUCACHE is set, or because
808 * the bucket layer is currently being resized.
809 */
810 if (ccp->cc_bktsize == 0) {
811 break;
812 }
813
814 /*
815 * Both of the CPU's buckets are full; try to get an empty
816 * bucket from the bucket layer. Upon success, empty this
817 * CPU and place any full bucket into the full list.
818 */
819 bkt = mcache_bkt_alloc(cp, &cp->mc_empty);
820 if (bkt != NULL) {
821 if (ccp->cc_pfilled != NULL) {
822 mcache_bkt_free(cp, &cp->mc_full,
823 ccp->cc_pfilled);
824 }
825 mcache_cpu_refill(ccp, bkt, 0);
826 continue;
827 }
828 btp = cp->cache_bkttype;
829
830 /*
831 * We need an empty bucket to put our freed objects into
832 * but couldn't get an empty bucket from the bucket layer;
833 * attempt to allocate one. We do not want to block for
834 * allocation here, and if the bucket allocation fails
835 * we will simply fall through to the slab layer.
836 */
837 MCACHE_UNLOCK(&ccp->cc_lock);
838 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
839 MCACHE_LOCK(&ccp->cc_lock);
840
841 if (bkt != NULL) {
842 /*
843 * We have an empty bucket, but since we drop the
844 * CPU lock above, the cache's bucket size may have
845 * changed. If so, free the bucket and try again.
846 */
847 if (ccp->cc_bktsize != btp->bt_bktsize) {
848 MCACHE_UNLOCK(&ccp->cc_lock);
849 mcache_free(btp->bt_cache, bkt);
850 MCACHE_LOCK(&ccp->cc_lock);
851 continue;
852 }
853
854 /*
855 * Store it in the bucket object since we'll
856 * need to refer to it during bucket destroy;
857 * we can't safely refer to cache_bkttype as
858 * the bucket lock may not be acquired then.
859 */
860 bkt->bkt_type = btp;
861
862 /*
863 * We have an empty bucket of the right size;
864 * add it to the bucket layer and try again.
865 */
866 mcache_bkt_free(cp, &cp->mc_empty, bkt);
867 continue;
868 }
869
870 /*
871 * The bucket layer has no empty buckets; free the
872 * object(s) directly to the slab layer.
873 */
874 break;
875 }
876 MCACHE_UNLOCK(&ccp->cc_lock);
877
878 /* If there is a waiter below, notify it */
879 if (cp->mc_waiter_cnt > 0) {
880 mcache_notify(cp, MCN_RETRYALLOC);
881 }
882
883 /* Advise the slab layer to purge the object(s) */
884 (*cp->mc_slab_free)(cp->mc_private, list,
885 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
886 }
887
888 /*
889 * Cache destruction routine.
890 */
891 __private_extern__ void
892 mcache_destroy(mcache_t *cp)
893 {
894 void **pbuf;
895
896 MCACHE_LIST_LOCK();
897 LIST_REMOVE(cp, mc_list);
898 MCACHE_LIST_UNLOCK();
899
900 mcache_bkt_purge(cp);
901
902 /*
903  * This cache is dead; there should be no further transactions.
904 * If it's still invoked, make sure that it induces a fault.
905 */
906 cp->mc_slab_alloc = NULL;
907 cp->mc_slab_free = NULL;
908 cp->mc_slab_audit = NULL;
909
910 lck_attr_free(cp->mc_bkt_lock_attr);
911 lck_grp_free(cp->mc_bkt_lock_grp);
912 lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
913
914 lck_attr_free(cp->mc_cpu_lock_attr);
915 lck_grp_free(cp->mc_cpu_lock_grp);
916 lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
917
918 lck_attr_free(cp->mc_sync_lock_attr);
919 lck_grp_free(cp->mc_sync_lock_grp);
920 lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
921
922 /*
923  * TODO: We need to destroy the zone here, but there is currently
924  * no way to do so. Until then, the memory allocated for the
925  * zone structure is leaked. Once it becomes possible, uncomment
926  * these lines:
927 *
928 * if (cp->mc_slab_zone != NULL) {
929 * zdestroy(cp->mc_slab_zone);
930 * cp->mc_slab_zone = NULL;
931 * }
932 */
933
934 /* Get the original address since we're about to free it */
935 pbuf = (void **)((intptr_t)cp - sizeof(void *));
936
937 zfree(mcache_zone, *pbuf);
938 }
939
940 /*
941 * Internal slab allocator used as a backend for simple caches. The current
942 * implementation uses the zone allocator for simplicity reasons.
943 */
944 static unsigned int
945 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
946 int wait)
947 {
948 #pragma unused(wait)
949 mcache_t *cp = arg;
950 unsigned int need = num;
951 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
952 u_int32_t flags = cp->mc_flags;
953 void *buf, *base, **pbuf;
954 mcache_obj_t **list = *plist;
955
956 *list = NULL;
957
958 for (;;) {
959 buf = zalloc(cp->mc_slab_zone);
960 if (buf == NULL) {
961 break;
962 }
963
964 /* Get the aligned base address for this object */
965 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
966 cp->mc_align);
967
968 /*
969 * Wind back a pointer size from the aligned base and
970 * save the original address so we can free it later.
971 */
972 pbuf = (void **)((intptr_t)base - sizeof(void *));
973 *pbuf = buf;
974
975 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
976 ((intptr_t)buf + cp->mc_chunksize));
977
978 /*
979 * If auditing is enabled, patternize the contents of
980 * the buffer starting from the 64-bit aligned base to
981 * the end of the buffer; the length is rounded up to
982  * the nearest 64-bit multiple; this is because we use
983 * 64-bit memory access to set/check the pattern.
984 */
985 if (flags & MCF_DEBUG) {
986 VERIFY(((intptr_t)base + rsize) <=
987 ((intptr_t)buf + cp->mc_chunksize));
988 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
989 }
990
991 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
992 *list = (mcache_obj_t *)base;
993
994 (*list)->obj_next = NULL;
995 list = *plist = &(*list)->obj_next;
996
997 /* If we got them all, return to mcache */
998 if (--need == 0) {
999 break;
1000 }
1001 }
1002
1003 return num - need;
1004 }
1005
1006 /*
1007 * Internal slab deallocator used as a backend for simple caches.
1008 */
1009 static void
1010 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
1011 {
1012 mcache_t *cp = arg;
1013 mcache_obj_t *nlist;
1014 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1015 u_int32_t flags = cp->mc_flags;
1016 void *base;
1017 void **pbuf;
1018
1019 for (;;) {
1020 nlist = list->obj_next;
1021 list->obj_next = NULL;
1022
1023 base = list;
1024 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1025
1026 /* Get the original address since we're about to free it */
1027 pbuf = (void **)((intptr_t)base - sizeof(void *));
1028
1029 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1030 ((intptr_t)*pbuf + cp->mc_chunksize));
1031
1032 if (flags & MCF_DEBUG) {
1033 VERIFY(((intptr_t)base + rsize) <=
1034 ((intptr_t)*pbuf + cp->mc_chunksize));
1035 mcache_audit_free_verify(NULL, base, 0, rsize);
1036 }
1037
1038 /* Free it to zone */
1039 zfree(cp->mc_slab_zone, *pbuf);
1040
1041 /* No more objects to free; return to mcache */
1042 if ((list = nlist) == NULL) {
1043 break;
1044 }
1045 }
1046 }
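
/*
 * The allocation path above relies on a simple "stash the original pointer
 * just below the aligned base" pattern, also used by mcache_create_common()
 * and mcache_destroy().  A standalone sketch of the idea (raw_alloc and
 * raw_free are hypothetical stand-ins for the underlying allocator):
 *
 *	void *
 *	aligned_alloc_sketch(size_t size, size_t align)
 *	{
 *		// reserve room for the saved pointer plus worst-case padding
 *		void *buf = raw_alloc(size + sizeof (void *) + align);
 *		void *base;
 *		void **pbuf;
 *
 *		if (buf == NULL)
 *			return (NULL);
 *		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (void *), align);
 *		pbuf = (void **)((intptr_t)base - sizeof (void *));
 *		*pbuf = buf;		// remember where the chunk really starts
 *		return (base);
 *	}
 *
 *	void
 *	aligned_free_sketch(void *base)
 *	{
 *		void **pbuf = (void **)((intptr_t)base - sizeof (void *));
 *
 *		raw_free(*pbuf);	// free the original, unaligned chunk
 *	}
 */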
1047
1048 /*
1049 * Internal slab auditor for simple caches.
1050 */
1051 static void
1052 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1053 {
1054 mcache_t *cp = arg;
1055 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1056 void *base, **pbuf;
1057
1058 while (list != NULL) {
1059 mcache_obj_t *next = list->obj_next;
1060
1061 base = list;
1062 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1063
1064 /* Get the original address */
1065 pbuf = (void **)((intptr_t)base - sizeof(void *));
1066
1067 VERIFY(((intptr_t)base + rsize) <=
1068 ((intptr_t)*pbuf + cp->mc_chunksize));
1069
1070 if (!alloc) {
1071 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1072 } else {
1073 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1074 }
1075
1076 list = list->obj_next = next;
1077 }
1078 }
1079
1080 /*
1081 * Refill the CPU's filled bucket with bkt and save the previous one.
1082 */
1083 static void
1084 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1085 {
1086 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1087 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1088 ASSERT(ccp->cc_bktsize > 0);
1089
1090 ccp->cc_pfilled = ccp->cc_filled;
1091 ccp->cc_pobjs = ccp->cc_objs;
1092 ccp->cc_filled = bkt;
1093 ccp->cc_objs = objs;
1094 }
1095
1096 /*
1097 * Allocate a bucket from the bucket layer.
1098 */
1099 static mcache_bkt_t *
1100 mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp)
1101 {
1102 mcache_bkt_t *bkt;
1103
1104 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1105 /*
1106 * The bucket layer lock is held by another CPU; increase
1107  * the contention count so that we can later adjust the
1108  * bucket size accordingly.
1109 */
1110 MCACHE_LOCK(&cp->mc_bkt_lock);
1111 cp->mc_bkt_contention++;
1112 }
1113
1114 if ((bkt = blp->bl_list) != NULL) {
1115 blp->bl_list = bkt->bkt_next;
1116 if (--blp->bl_total < blp->bl_min) {
1117 blp->bl_min = blp->bl_total;
1118 }
1119 blp->bl_alloc++;
1120 }
1121
1122 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1123
1124 return bkt;
1125 }
1126
1127 /*
1128 * Free a bucket to the bucket layer.
1129 */
1130 static void
1131 mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1132 {
1133 MCACHE_LOCK(&cp->mc_bkt_lock);
1134
1135 bkt->bkt_next = blp->bl_list;
1136 blp->bl_list = bkt;
1137 blp->bl_total++;
1138
1139 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1140 }
1141
1142 /*
1143 * Enable the bucket layer of a cache.
1144 */
1145 static void
1146 mcache_cache_bkt_enable(mcache_t *cp)
1147 {
1148 mcache_cpu_t *ccp;
1149 int cpu;
1150
1151 if (cp->mc_flags & MCF_NOCPUCACHE) {
1152 return;
1153 }
1154
1155 for (cpu = 0; cpu < ncpu; cpu++) {
1156 ccp = &cp->mc_cpu[cpu];
1157 MCACHE_LOCK(&ccp->cc_lock);
1158 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1159 MCACHE_UNLOCK(&ccp->cc_lock);
1160 }
1161 }
1162
1163 /*
1164 * Purge all buckets from a cache and disable its bucket layer.
1165 */
1166 static void
1167 mcache_bkt_purge(mcache_t *cp)
1168 {
1169 mcache_cpu_t *ccp;
1170 mcache_bkt_t *bp, *pbp;
1171 int cpu, objs, pobjs;
1172
1173 for (cpu = 0; cpu < ncpu; cpu++) {
1174 ccp = &cp->mc_cpu[cpu];
1175
1176 MCACHE_LOCK(&ccp->cc_lock);
1177
1178 bp = ccp->cc_filled;
1179 pbp = ccp->cc_pfilled;
1180 objs = ccp->cc_objs;
1181 pobjs = ccp->cc_pobjs;
1182 ccp->cc_filled = NULL;
1183 ccp->cc_pfilled = NULL;
1184 ccp->cc_objs = -1;
1185 ccp->cc_pobjs = -1;
1186 ccp->cc_bktsize = 0;
1187
1188 MCACHE_UNLOCK(&ccp->cc_lock);
1189
1190 if (bp != NULL) {
1191 mcache_bkt_destroy(cp, bp, objs);
1192 }
1193 if (pbp != NULL) {
1194 mcache_bkt_destroy(cp, pbp, pobjs);
1195 }
1196 }
1197
1198 mcache_bkt_ws_zero(cp);
1199 mcache_bkt_ws_reap(cp);
1200 }
1201
1202 /*
1203 * Free one or more objects in the bucket to the slab layer,
1204 * and also free the bucket itself.
1205 */
1206 static void
1207 mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
1208 {
1209 if (nobjs > 0) {
1210 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1211
1212 if (cp->mc_flags & MCF_DEBUG) {
1213 mcache_obj_t *o = top;
1214 int cnt = 0;
1215
1216 /*
1217 * Verify that the chain of objects in the bucket is
1218 * valid. Any mismatch here means a mistake when the
1219 * object(s) were freed to the CPU layer, so we panic.
1220 */
1221 while (o != NULL) {
1222 o = o->obj_next;
1223 ++cnt;
1224 }
1225 if (cnt != nobjs) {
1226 panic("mcache_bkt_destroy: %s cp %p corrupted "
1227 "list in bkt %p (nobjs %d actual %d)\n",
1228 cp->mc_name, (void *)cp, (void *)bkt,
1229 nobjs, cnt);
1230 /* NOTREACHED */
1231 __builtin_unreachable();
1232 }
1233 }
1234
1235 /* Advise the slab layer to purge the object(s) */
1236 (*cp->mc_slab_free)(cp->mc_private, top,
1237 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1238 }
1239 mcache_free(bkt->bkt_type->bt_cache, bkt);
1240 }
1241
1242 /*
1243 * Update the bucket layer working set statistics.
1244 */
1245 static void
1246 mcache_bkt_ws_update(mcache_t *cp)
1247 {
1248 MCACHE_LOCK(&cp->mc_bkt_lock);
1249
1250 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1251 cp->mc_full.bl_min = cp->mc_full.bl_total;
1252 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1253 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1254
1255 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1256 }
1257
1258 /*
1259 * Mark everything as eligible for reaping (working set is zero).
1260 */
1261 static void
1262 mcache_bkt_ws_zero(mcache_t *cp)
1263 {
1264 MCACHE_LOCK(&cp->mc_bkt_lock);
1265
1266 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1267 cp->mc_full.bl_min = cp->mc_full.bl_total;
1268 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1269 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1270
1271 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1272 }
1273
1274 /*
1275 * Reap all buckets that are beyond the working set.
1276 */
1277 static void
1278 mcache_bkt_ws_reap(mcache_t *cp)
1279 {
1280 long reap;
1281 mcache_bkt_t *bkt;
1282
1283 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1284 while (reap-- &&
1285 (bkt = mcache_bkt_alloc(cp, &cp->mc_full)) != NULL) {
1286 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1287 }
1288
1289 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1290 while (reap-- &&
1291 (bkt = mcache_bkt_alloc(cp, &cp->mc_empty)) != NULL) {
1292 mcache_bkt_destroy(cp, bkt, 0);
1293 }
1294 }
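
/*
 * Worked example for mcache_bkt_ws_update()/mcache_bkt_ws_reap() above,
 * with illustrative numbers: suppose mc_full.bl_total is 10 when
 * mcache_bkt_ws_update() runs and the low-water mark over the preceding
 * interval (bl_min) was 4; bl_reaplimit then becomes 4 and bl_min is
 * reset to 10.  If allocations during the next interval only ever pull
 * bl_total (and hence bl_min) down to 7, a subsequent
 * mcache_bkt_ws_reap() destroys MIN(4, 7) = 4 full buckets, i.e. the
 * buckets that were not needed during either interval, and leaves the
 * recently used ones in place.
 */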
1295
1296 static void
1297 mcache_reap_timeout(thread_call_param_t dummy __unused,
1298 thread_call_param_t arg)
1299 {
1300 volatile UInt32 *flag = arg;
1301
1302 ASSERT(flag == &mcache_reaping);
1303
1304 *flag = 0;
1305 }
1306
1307 static void
1308 mcache_reap_done(void *flag)
1309 {
1310 uint64_t deadline, leeway;
1311
1312 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1313 &deadline);
1314 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1315 NSEC_PER_SEC, &leeway);
1316 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1317 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1318 }
1319
1320 static void
1321 mcache_reap_start(void *arg)
1322 {
1323 UInt32 *flag = arg;
1324
1325 ASSERT(flag == &mcache_reaping);
1326
1327 mcache_applyall(mcache_cache_reap);
1328 mcache_dispatch(mcache_reap_done, flag);
1329 }
1330
1331 __private_extern__ void
1332 mcache_reap(void)
1333 {
1334 UInt32 *flag = &mcache_reaping;
1335
1336 if (mcache_llock_owner == current_thread() ||
1337 !OSCompareAndSwap(0, 1, flag)) {
1338 return;
1339 }
1340
1341 mcache_dispatch(mcache_reap_start, flag);
1342 }
1343
1344 __private_extern__ void
1345 mcache_reap_now(mcache_t *cp, boolean_t purge)
1346 {
1347 if (purge) {
1348 mcache_bkt_purge(cp);
1349 mcache_cache_bkt_enable(cp);
1350 } else {
1351 mcache_bkt_ws_zero(cp);
1352 mcache_bkt_ws_reap(cp);
1353 }
1354 }
1355
1356 static void
1357 mcache_cache_reap(mcache_t *cp)
1358 {
1359 mcache_bkt_ws_reap(cp);
1360 }
1361
1362 /*
1363  * Performs periodic maintenance on a cache.
1364 */
1365 static void
1366 mcache_cache_update(mcache_t *cp)
1367 {
1368 int need_bkt_resize = 0;
1369 int need_bkt_reenable = 0;
1370
1371 lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1372
1373 mcache_bkt_ws_update(cp);
1374
1375 /*
1376 * Cache resize and post-purge reenable are mutually exclusive.
1377  * If the cache was previously purged, there is no point in
1378 * increasing the bucket size as there was an indication of
1379 * memory pressure on the system.
1380 */
1381 lck_mtx_lock_spin(&cp->mc_sync_lock);
1382 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
1383 need_bkt_reenable = 1;
1384 }
1385 lck_mtx_unlock(&cp->mc_sync_lock);
1386
1387 MCACHE_LOCK(&cp->mc_bkt_lock);
1388 /*
1389 * If the contention count is greater than the threshold, and if
1390 * we are not already at the maximum bucket size, increase it.
1391 * Otherwise, if this cache was previously purged by the user
1392 * then we simply reenable it.
1393 */
1394 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1395 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1396 mcache_bkt_contention && !need_bkt_reenable) {
1397 need_bkt_resize = 1;
1398 }
1399
1400 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1401 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1402
1403 if (need_bkt_resize) {
1404 mcache_dispatch(mcache_cache_bkt_resize, cp);
1405 } else if (need_bkt_reenable) {
1406 mcache_dispatch(mcache_cache_enable, cp);
1407 }
1408 }
1409
1410 /*
1411 * Recompute a cache's bucket size. This is an expensive operation
1412  * and should not be done frequently; larger buckets provide a higher
1413  * transfer rate between the CPU and bucket layers, while smaller
1414  * buckets reduce memory consumption.
1415 */
1416 static void
1417 mcache_cache_bkt_resize(void *arg)
1418 {
1419 mcache_t *cp = arg;
1420 mcache_bkttype_t *btp = cp->cache_bkttype;
1421
1422 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1423 mcache_bkt_purge(cp);
1424
1425 /*
1426 * Upgrade to the next bucket type with larger bucket size;
1427 * temporarily set the previous contention snapshot to a
1428  * negative number to prevent unnecessary resize requests.
1429 */
1430 MCACHE_LOCK(&cp->mc_bkt_lock);
1431 cp->cache_bkttype = ++btp;
1432 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1433 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1434
1435 mcache_cache_enable(cp);
1436 }
1437 }
1438
1439 /*
1440 * Reenable a previously disabled cache due to purge.
1441 */
1442 static void
1443 mcache_cache_enable(void *arg)
1444 {
1445 mcache_t *cp = arg;
1446
1447 lck_mtx_lock_spin(&cp->mc_sync_lock);
1448 cp->mc_purge_cnt = 0;
1449 cp->mc_enable_cnt = 0;
1450 lck_mtx_unlock(&cp->mc_sync_lock);
1451
1452 mcache_cache_bkt_enable(cp);
1453 }
1454
1455 static void
1456 mcache_update_timeout(__unused void *arg)
1457 {
1458 uint64_t deadline, leeway;
1459
1460 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1461 &deadline);
1462 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1463 NSEC_PER_SEC, &leeway);
1464 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1465 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1466 }
1467
1468 static void
1469 mcache_update(thread_call_param_t arg __unused,
1470 thread_call_param_t dummy __unused)
1471 {
1472 mcache_applyall(mcache_cache_update);
1473 mcache_update_timeout(NULL);
1474 }
1475
1476 static void
1477 mcache_applyall(void (*func)(mcache_t *))
1478 {
1479 mcache_t *cp;
1480
1481 MCACHE_LIST_LOCK();
1482 LIST_FOREACH(cp, &mcache_head, mc_list) {
1483 func(cp);
1484 }
1485 MCACHE_LIST_UNLOCK();
1486 }
1487
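/*
 * Run a function asynchronously via timeout(), hz / 1000 ticks from now.
 */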
1488 static void
1489 mcache_dispatch(void (*func)(void *), void *arg)
1490 {
1491 ASSERT(func != NULL);
1492 timeout(func, arg, hz / 1000);
1493 }
1494
1495 __private_extern__ void
1496 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1497 struct timeval *base_ts)
1498 {
1499 struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
1500 void *stack[MCACHE_STACK_DEPTH + 1];
1501 struct mca_trn *transaction;
1502
1503 transaction = &mca->mca_trns[mca->mca_next_trn];
1504
1505 mca->mca_addr = addr;
1506 mca->mca_cache = cp;
1507
1508 transaction->mca_thread = current_thread();
1509
1510 bzero(stack, sizeof(stack));
1511 transaction->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1512 bcopy(&stack[1], transaction->mca_stack,
1513 sizeof(transaction->mca_stack));
1514
1515 microuptime(&now);
1516 if (base_ts != NULL) {
1517 base = *base_ts;
1518 }
1519 /* tstamp is in ms relative to base_ts */
1520 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1521 if ((now.tv_sec - base.tv_sec) > 0) {
1522 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1523 }
1524
1525 mca->mca_next_trn =
1526 (mca->mca_next_trn + 1) % mca_trn_max;
1527 }
1528
1529 __private_extern__ void
1530 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1531 {
1532 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1533 u_int64_t *buf = (u_int64_t *)buf_arg;
1534
1535 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1536 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1537
1538 while (buf < buf_end) {
1539 *buf++ = pattern;
1540 }
1541 }
1542
1543 __private_extern__ void *
1544 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1545 {
1546 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1547 u_int64_t *buf;
1548
1549 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1550 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1551
1552 for (buf = buf_arg; buf < buf_end; buf++) {
1553 if (*buf != pattern) {
1554 return buf;
1555 }
1556 }
1557 return NULL;
1558 }
1559
1560 __private_extern__ void *
1561 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1562 size_t size)
1563 {
1564 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1565 u_int64_t *buf;
1566
1567 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1568 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1569
1570 for (buf = buf_arg; buf < buf_end; buf++) {
1571 if (*buf != old) {
1572 mcache_set_pattern(old, buf_arg,
1573 (uintptr_t)buf - (uintptr_t)buf_arg);
1574 return buf;
1575 }
1576 *buf = new;
1577 }
1578 return NULL;
1579 }
1580
1581 __private_extern__ void
1582 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1583 size_t size)
1584 {
1585 void *addr;
1586 u_int64_t *oaddr64;
1587 mcache_obj_t *next;
1588
1589 addr = (void *)((uintptr_t)base + offset);
1590 next = ((mcache_obj_t *)addr)->obj_next;
1591
1592 /* For the "obj_next" pointer in the buffer */
1593 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1594 *oaddr64 = MCACHE_FREE_PATTERN;
1595
1596 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1597 (caddr_t)base, size)) != NULL) {
1598 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1599 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1600 /* NOTREACHED */
1601 }
1602 ((mcache_obj_t *)addr)->obj_next = next;
1603 }
1604
1605 __private_extern__ void
1606 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1607 size_t size)
1608 {
1609 void *addr;
1610 u_int64_t *oaddr64;
1611 mcache_obj_t *next;
1612
1613 addr = (void *)((uintptr_t)base + offset);
1614 next = ((mcache_obj_t *)addr)->obj_next;
1615
1616 /* For the "obj_next" pointer in the buffer */
1617 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1618 *oaddr64 = MCACHE_FREE_PATTERN;
1619
1620 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1621 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1622 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1623 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1624 /* NOTREACHED */
1625 }
1626 ((mcache_obj_t *)addr)->obj_next = next;
1627 }
1628
1629 #undef panic
1630
1631 #define DUMP_TRN_FMT() \
1632 "%s transaction thread %p saved PC stack (%d deep):\n" \
1633 "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1634 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1635
1636 #define DUMP_TRN_FIELDS(s, x) \
1637 s, \
1638 mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1639 mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1640 mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1641 mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1642 mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1643 mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1644 mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1645 mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1646 mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1647
1648 #define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1649 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1650
1651 __private_extern__ char *
1652 mcache_dump_mca(mcache_audit_t *mca)
1653 {
1654 if (mca_dump_buf == NULL) {
1655 return NULL;
1656 }
1657
1658 snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1659 "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1660 DUMP_TRN_FMT()
1661 DUMP_TRN_FMT(),
1662
1663 mca, mca->mca_addr, mca->mca_cache,
1664 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1665 mca->mca_next_trn,
1666
1667 DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1668 DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1669
1670 return mca_dump_buf;
1671 }
1672
1673 __private_extern__ void
1674 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1675 int64_t expected, int64_t got)
1676 {
1677 if (mca == NULL) {
1678 panic("mcache_audit: buffer %p modified after free at "
1679 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1680 offset, got, expected);
1681 /* NOTREACHED */
1682 __builtin_unreachable();
1683 }
1684
1685 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1686 "(0x%llx instead of 0x%llx)\n%s\n",
1687 addr, offset, got, expected, mcache_dump_mca(mca));
1688 /* NOTREACHED */
1689 __builtin_unreachable();
1690 }
1691
1692 __attribute__((noinline, cold, not_tail_called, noreturn))
1693 __private_extern__ int
1694 assfail(const char *a, const char *f, int l)
1695 {
1696 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1697 /* NOTREACHED */
1698 __builtin_unreachable();
1699 }