1 /*
2 * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides greater flexibility by allowing the user to supply a custom
40 * slab allocator (instead of the default zone allocator). Finally, no
41 * object construction/destruction takes place at the moment, although
42 * this could be added in the future to improve efficiency.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
71 #define MCACHE_SIZE(n) \
72 __builtin_offsetof(mcache_t, mc_cpu[n])
73
74 /* Allocate extra in case we need to manually align the pointer */
75 #define MCACHE_ALLOC_SIZE \
76 (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)
77
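/*
 * MCACHE_CPU(c) resolves to the calling CPU's per-CPU cache structure,
 * i.e. &(c)->mc_cpu[cpu_number()], using the same offsetof-based sizing
 * as MCACHE_SIZE() above.
 */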
78 #define MCACHE_CPU(c) \
79 (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
80
81 /*
82 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
83 * to serialize accesses to the global list of caches in the system.
84 * They also record the thread currently running in the critical
85 * section, so that we can avoid recursive requests to reap the
86 * caches when memory runs low.
87 */
88 #define MCACHE_LIST_LOCK() { \
89 lck_mtx_lock(mcache_llock); \
90 mcache_llock_owner = current_thread(); \
91 }
92
93 #define MCACHE_LIST_UNLOCK() { \
94 mcache_llock_owner = NULL; \
95 lck_mtx_unlock(mcache_llock); \
96 }
97
98 #define MCACHE_LOCK(l) lck_mtx_lock(l)
99 #define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
100 #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
102 static unsigned int ncpu;
103 static unsigned int cache_line_size;
104 static lck_mtx_t *mcache_llock;
105 static struct thread *mcache_llock_owner;
106 static lck_attr_t *mcache_llock_attr;
107 static lck_grp_t *mcache_llock_grp;
108 static lck_grp_attr_t *mcache_llock_grp_attr;
109 static struct zone *mcache_zone;
110 static const uint32_t mcache_reap_interval = 15;
111 static const uint32_t mcache_reap_interval_leeway = 2;
112 static UInt32 mcache_reaping;
113 static int mcache_ready;
114 static int mcache_updating;
115
116 static int mcache_bkt_contention = 3;
117 #if DEBUG
118 static unsigned int mcache_flags = MCF_DEBUG;
119 #else
120 static unsigned int mcache_flags = 0;
121 #endif
122
123 int mca_trn_max = MCA_TRN_MAX;
124
125 #define DUMP_MCA_BUF_SIZE 512
126 static char *mca_dump_buf;
127
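/*
 * Bucket size classes.  Each entry gives the number of object slots per
 * bucket (bt_bktsize), the chunk-size thresholds used to pick a class at
 * cache creation time and to bound bucket resizing (bt_minbuf/bt_maxbuf),
 * and the cache from which buckets of that size are allocated (bt_cache,
 * populated in mcache_init()).
 */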
128 static mcache_bkttype_t mcache_bkttype[] = {
129 { 1, 4096, 32768, NULL },
130 { 3, 2048, 16384, NULL },
131 { 7, 1024, 12288, NULL },
132 { 15, 256, 8192, NULL },
133 { 31, 64, 4096, NULL },
134 { 47, 0, 2048, NULL },
135 { 63, 0, 1024, NULL },
136 { 95, 0, 512, NULL },
137 { 143, 0, 256, NULL },
138 { 165, 0, 0, NULL },
139 };
140
141 static mcache_t *mcache_create_common(const char *, size_t, size_t,
142 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
143 mcache_notifyfn_t, void *, u_int32_t, int, int);
144 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
145 unsigned int, int);
146 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
147 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
148 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
149 static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *);
150 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
151 static void mcache_cache_bkt_enable(mcache_t *);
152 static void mcache_bkt_purge(mcache_t *);
153 static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
154 static void mcache_bkt_ws_update(mcache_t *);
155 static void mcache_bkt_ws_zero(mcache_t *);
156 static void mcache_bkt_ws_reap(mcache_t *);
157 static void mcache_dispatch(void (*)(void *), void *);
158 static void mcache_cache_reap(mcache_t *);
159 static void mcache_cache_update(mcache_t *);
160 static void mcache_cache_bkt_resize(void *);
161 static void mcache_cache_enable(void *);
162 static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
163 static void mcache_update_timeout(void *);
164 static void mcache_applyall(void (*)(mcache_t *));
165 static void mcache_reap_start(void *);
166 static void mcache_reap_done(void *);
167 static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
168 static void mcache_notify(mcache_t *, u_int32_t);
169 static void mcache_purge(void *);
170
171 static LIST_HEAD(, mcache) mcache_head;
172 mcache_t *mcache_audit_cache;
173
174 static thread_call_t mcache_reap_tcall;
175 static thread_call_t mcache_update_tcall;
176
177 /*
178 * Initialize the framework; this is currently called as part of BSD init.
179 */
180 __private_extern__ void
181 mcache_init(void)
182 {
183 mcache_bkttype_t *btp;
184 unsigned int i;
185 char name[32];
186
187 VERIFY(mca_trn_max >= 2);
188
189 ncpu = ml_wait_max_cpus();
190 (void) mcache_cache_line_size(); /* prime it */
191
192 mcache_llock_grp_attr = lck_grp_attr_alloc_init();
193 mcache_llock_grp = lck_grp_alloc_init("mcache.list",
194 mcache_llock_grp_attr);
195 mcache_llock_attr = lck_attr_alloc_init();
196 mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
197
198 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
199 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
200 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
201 panic("mcache_init: thread_call_allocate failed");
202 /* NOTREACHED */
203 __builtin_unreachable();
204 }
205
206 mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE, ZC_DESTRUCTIBLE);
207
208 LIST_INIT(&mcache_head);
209
210 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
211 btp = &mcache_bkttype[i];
212 (void) snprintf(name, sizeof(name), "bkt_%d",
213 btp->bt_bktsize);
214 btp->bt_cache = mcache_create(name,
215 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
216 }
217
218 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
219 mcache_flags &= MCF_FLAGS_MASK;
220
221 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
222 0, 0, MCR_SLEEP);
223
224 mcache_applyall(mcache_cache_bkt_enable);
225 mcache_ready = 1;
226
227 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
228 ncpu, CPU_CACHE_LINE_SIZE);
229 }
230
231 /*
232 * Return the global mcache flags.
233 */
234 __private_extern__ unsigned int
235 mcache_getflags(void)
236 {
237 return mcache_flags;
238 }
239
240 /*
241 * Return the CPU cache line size.
242 */
243 __private_extern__ unsigned int
244 mcache_cache_line_size(void)
245 {
246 if (cache_line_size == 0) {
247 ml_cpu_info_t cpu_info;
248 ml_cpu_get_info(&cpu_info);
249 cache_line_size = (unsigned int)cpu_info.cache_line_size;
250 }
251 return cache_line_size;
252 }
253
254 /*
255 * Create a cache using the zone allocator as the backend slab allocator.
256 * The caller may specify any alignment for the object; if it specifies 0
257 * the default alignment (MCACHE_ALIGN) will be used.
258 */
259 __private_extern__ mcache_t *
260 mcache_create(const char *name, size_t bufsize, size_t align,
261 u_int32_t flags, int wait)
262 {
263 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
264 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
265 wait);
266 }
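
/*
 * Illustrative usage sketch (not part of the build): a typical client
 * creates a zone-backed cache once and then allocates and frees objects
 * from it; "struct foo" and "foo_cache" below are hypothetical names.
 *
 *	static mcache_t *foo_cache;
 *
 *	foo_cache = mcache_create("foo", sizeof (struct foo), 0, 0, MCR_SLEEP);
 *	struct foo *fp = mcache_alloc(foo_cache, MCR_SLEEP);
 *	...
 *	mcache_free(foo_cache, fp);
 */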
267
268 /*
269 * Create a cache using a custom backend slab allocator. Since the caller
270 * is responsible for allocation, no alignment guarantee will be provided
271 * by this framework.
272 */
273 __private_extern__ mcache_t *
274 mcache_create_ext(const char *name, size_t bufsize,
275 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
276 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
277 u_int32_t flags, int wait)
278 {
279 return mcache_create_common(name, bufsize, 0, allocfn,
280 freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait);
281 }
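
/*
 * Illustrative sketch (not part of the build): a client that owns its
 * backing store registers callbacks shaped like the internal slab hooks
 * (see mcache_slab_alloc/mcache_slab_free below); my_alloc, my_free and
 * my_pool are hypothetical.
 *
 *	static unsigned int my_alloc(void *, mcache_obj_t ***, unsigned int, int);
 *	static void my_free(void *, mcache_obj_t *, boolean_t);
 *
 *	cp = mcache_create_ext("mybuf", bufsize, my_alloc, my_free,
 *	    NULL, NULL, NULL, my_pool, 0, MCR_SLEEP);
 */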
282
283 /*
284 * Common cache creation routine.
285 */
286 static mcache_t *
287 mcache_create_common(const char *name, size_t bufsize, size_t align,
288 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
289 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
290 u_int32_t flags, int need_zone, int wait)
291 {
292 mcache_bkttype_t *btp;
293 mcache_t *cp = NULL;
294 size_t chunksize;
295 void *buf, **pbuf;
296 unsigned int c;
297 char lck_name[64];
298
299 /* If auditing is on and the dump buffer is NULL, allocate it now */
300 if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
301 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
302 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
303 malloc_wait | M_ZERO);
304 if (mca_dump_buf == NULL) {
305 return NULL;
306 }
307 }
308
309 buf = zalloc(mcache_zone);
310 if (buf == NULL) {
311 goto fail;
312 }
313
314 bzero(buf, MCACHE_ALLOC_SIZE);
315
316 /*
317 * In case we didn't get cache-aligned memory, round it up
318 * accordingly. This is needed in order to get the rest of the
319 * structure members aligned properly. It also means that the
320 * memory span gets shifted due to the round-up, but that is
321 * okay since we've allocated extra space for this.
322 */
323 cp = (mcache_t *)
324 P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
325 pbuf = (void **)((intptr_t)cp - sizeof(void *));
326 *pbuf = buf;
327
328 /*
329 * Alignment is guaranteed only when we use the internal
330 * slab allocator (currently the zone allocator).
331 */
332 if (!need_zone) {
333 align = 1;
334 } else {
335 /* Enforce 64-bit minimum alignment for zone-based buffers */
336 if (align == 0) {
337 align = MCACHE_ALIGN;
338 }
339 align = P2ROUNDUP(align, MCACHE_ALIGN);
340 }
341
342 if ((align & (align - 1)) != 0) {
343 panic("mcache_create: bad alignment %lu", align);
344 /* NOTREACHED */
345 __builtin_unreachable();
346 }
347
348 cp->mc_align = align;
349 cp->mc_slab_alloc = allocfn;
350 cp->mc_slab_free = freefn;
351 cp->mc_slab_audit = auditfn;
352 cp->mc_slab_log = logfn;
353 cp->mc_slab_notify = notifyfn;
354 cp->mc_private = need_zone ? cp : arg;
355 cp->mc_bufsize = bufsize;
356 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
357
358 (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
359
360 (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
361 cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
362 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
363 cp->mc_cpu_lock_grp_attr);
364 cp->mc_cpu_lock_attr = lck_attr_alloc_init();
365
366 /*
367 * Allocation chunk size is the object's size plus any extra size
368 * needed to satisfy the object's alignment. It is enforced to be
369 * at least the size of an LP64 pointer to simplify auditing and to
370 * handle multiple-element allocation requests, where the elements
371 * returned are linked together in a list.
372 */
373 chunksize = MAX(bufsize, sizeof(u_int64_t));
374 if (need_zone) {
375 VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
376 chunksize += sizeof(uint64_t) + align;
377 chunksize = P2ROUNDUP(chunksize, align);
378 cp->mc_slab_zone = zone_create(cp->mc_name, chunksize, ZC_DESTRUCTIBLE);
379 }
380 cp->mc_chunksize = chunksize;
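	/*
	 * For the zone-backed case above, with hypothetical numbers: a
	 * 40-byte object with a 16-byte alignment requirement gives
	 * MAX(40, 8) + 8 + 16 = 64, already a multiple of 16, so each
	 * zone element is 64 bytes.
	 */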
381
382 /*
383 * Initialize the bucket layer.
384 */
385 (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
386 cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
387 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
388 cp->mc_bkt_lock_grp_attr);
389 cp->mc_bkt_lock_attr = lck_attr_alloc_init();
390 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
391 cp->mc_bkt_lock_attr);
392
393 (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
394 cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
395 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
396 cp->mc_sync_lock_grp_attr);
397 cp->mc_sync_lock_attr = lck_attr_alloc_init();
398 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
399 cp->mc_sync_lock_attr);
400
401 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
402 continue;
403 }
404
405 cp->cache_bkttype = btp;
406
407 /*
408 * Initialize the CPU layer. Each per-CPU structure is aligned
409 * on the CPU cache line boundary to prevent false sharing.
410 */
411 for (c = 0; c < ncpu; c++) {
412 mcache_cpu_t *ccp = &cp->mc_cpu[c];
413
414 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
415 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
416 cp->mc_cpu_lock_attr);
417 ccp->cc_objs = -1;
418 ccp->cc_pobjs = -1;
419 }
420
421 if (mcache_ready) {
422 mcache_cache_bkt_enable(cp);
423 }
424
425 /* TODO: dynamically create sysctl for stats */
426
427 MCACHE_LIST_LOCK();
428 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
429 MCACHE_LIST_UNLOCK();
430
431 /*
432 * If cache buckets are enabled and this is the first cache
433 * created, start the periodic cache update.
434 */
435 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
436 mcache_updating = 1;
437 mcache_update_timeout(NULL);
438 }
439 if (cp->mc_flags & MCF_DEBUG) {
440 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
441 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
442 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
443 }
444 return cp;
445
446 fail:
447 if (buf != NULL) {
448 zfree(mcache_zone, buf);
449 }
450 return NULL;
451 }
452
453 /*
454 * Allocate one or more objects from a cache.
455 */
456 __private_extern__ unsigned int
457 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
458 {
459 mcache_cpu_t *ccp;
460 mcache_obj_t **top = &(*list);
461 mcache_bkt_t *bkt;
462 unsigned int need = num;
463 boolean_t nwretry = FALSE;
464
465 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
466 VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));
467
468 ASSERT(list != NULL);
469 *list = NULL;
470
471 if (num == 0) {
472 return 0;
473 }
474
475 retry_alloc:
476 /* We may not always be running on the same CPU in case of retries */
477 ccp = MCACHE_CPU(cp);
478
479 MCACHE_LOCK(&ccp->cc_lock);
480 for (;;) {
481 /*
482 * If we have an object in the current CPU's filled bucket,
483 * chain the object to any previous objects and return if
484 * we've satisfied the number of requested objects.
485 */
486 if (ccp->cc_objs > 0) {
487 mcache_obj_t *tail;
488 int objs;
489
490 /*
491 * Objects in the bucket are already linked together
492 * with the most recently freed object at the head of
493 * the list; grab as many objects as we can.
494 */
495 objs = MIN((unsigned int)ccp->cc_objs, need);
496 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
497 ccp->cc_objs -= objs;
498 ccp->cc_alloc += objs;
499
500 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
501 list = &tail->obj_next;
502 *list = NULL;
503
504 /* If we got them all, return to caller */
505 if ((need -= objs) == 0) {
506 MCACHE_UNLOCK(&ccp->cc_lock);
507
508 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
509 cp->mc_slab_log != NULL) {
510 (*cp->mc_slab_log)(num, *top, TRUE);
511 }
512
513 if (cp->mc_flags & MCF_DEBUG) {
514 goto debug_alloc;
515 }
516
517 return num;
518 }
519 }
520
521 /*
522 * The CPU's filled bucket is empty. If the previous filled
523 * bucket was full, exchange and try again.
524 */
525 if (ccp->cc_pobjs > 0) {
526 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
527 continue;
528 }
529
530 /*
531 * If the bucket layer is disabled, allocate from slab. This
532 * can happen either because MCF_NOCPUCACHE is set, or because
533 * the bucket layer is currently being resized.
534 */
535 if (ccp->cc_bktsize == 0) {
536 break;
537 }
538
539 /*
540 * Both of the CPU's buckets are empty; try to get a full
541 * bucket from the bucket layer. Upon success, refill this
542 * CPU and place any empty bucket into the empty list.
543 */
544 bkt = mcache_bkt_alloc(cp, &cp->mc_full);
545 if (bkt != NULL) {
546 if (ccp->cc_pfilled != NULL) {
547 mcache_bkt_free(cp, &cp->mc_empty,
548 ccp->cc_pfilled);
549 }
550 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
551 continue;
552 }
553
554 /*
555 * The bucket layer has no full buckets; allocate the
556 * object(s) directly from the slab layer.
557 */
558 break;
559 }
560 MCACHE_UNLOCK(&ccp->cc_lock);
561
562 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
563
564 /*
565 * If this is a blocking allocation, or if it is non-blocking and
566 * the cache's list of full buckets is non-empty, then retry the allocation.
567 */
568 if (need > 0) {
569 if (!(wait & MCR_NONBLOCKING)) {
570 atomic_add_32(&cp->mc_wretry_cnt, 1);
571 goto retry_alloc;
572 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
573 !mcache_bkt_isempty(cp)) {
574 if (!nwretry) {
575 nwretry = TRUE;
576 }
577 atomic_add_32(&cp->mc_nwretry_cnt, 1);
578 goto retry_alloc;
579 } else if (nwretry) {
580 atomic_add_32(&cp->mc_nwfail_cnt, 1);
581 }
582 }
583
584 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
585 (*cp->mc_slab_log)((num - need), *top, TRUE);
586 }
587
588 if (!(cp->mc_flags & MCF_DEBUG)) {
589 return num - need;
590 }
591
592 debug_alloc:
593 if (cp->mc_flags & MCF_DEBUG) {
594 mcache_obj_t **o = top;
595 unsigned int n;
596
597 n = 0;
598 /*
599 * Verify that the chain of objects has the same count as
600 * what we are about to report to the caller. Any mismatch
601 * here means that the object list is insanely broken and
602 * therefore we must panic.
603 */
604 while (*o != NULL) {
605 o = &(*o)->obj_next;
606 ++n;
607 }
608 if (n != (num - need)) {
609 panic("mcache_alloc_ext: %s cp %p corrupted list "
610 "(got %d actual %d)\n", cp->mc_name,
611 (void *)cp, num - need, n);
612 /* NOTREACHED */
613 __builtin_unreachable();
614 }
615 }
616
617 /* Invoke the slab layer audit callback if auditing is enabled */
618 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
619 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
620 }
621
622 return num - need;
623 }
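
/*
 * Illustrative sketch (not part of the build): batch users request several
 * objects in one call; the objects come back chained through obj_next and
 * can be returned the same way.  Under MCR_NOSLEEP fewer than the requested
 * number may be returned.
 *
 *	mcache_obj_t *list, *o;
 *	unsigned int got;
 *
 *	got = mcache_alloc_ext(cp, &list, 16, MCR_NOSLEEP);
 *	for (o = list; o != NULL; o = o->obj_next)
 *		... use each object ...
 *	if (got > 0)
 *		mcache_free_ext(cp, list);
 */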
624
625 /*
626 * Allocate a single object from a cache.
627 */
628 __private_extern__ void *
629 mcache_alloc(mcache_t *cp, int wait)
630 {
631 mcache_obj_t *buf;
632
633 (void) mcache_alloc_ext(cp, &buf, 1, wait);
634 return buf;
635 }
636
637 __private_extern__ void
638 mcache_waiter_inc(mcache_t *cp)
639 {
640 atomic_add_32(&cp->mc_waiter_cnt, 1);
641 }
642
643 __private_extern__ void
644 mcache_waiter_dec(mcache_t *cp)
645 {
646 atomic_add_32(&cp->mc_waiter_cnt, -1);
647 }
648
649 __private_extern__ boolean_t
650 mcache_bkt_isempty(mcache_t *cp)
651 {
652 /*
653 * This isn't meant to accurately tell whether there are
654 * any full buckets in the cache; it is simply a way to
655 * obtain "hints" about the state of the cache.
656 */
657 return cp->mc_full.bl_total == 0;
658 }
659
660 /*
661 * Notify the slab layer about an event.
662 */
663 static void
664 mcache_notify(mcache_t *cp, u_int32_t event)
665 {
666 if (cp->mc_slab_notify != NULL) {
667 (*cp->mc_slab_notify)(cp->mc_private, event);
668 }
669 }
670
671 /*
672 * Purge the cache and disable its buckets.
673 */
674 static void
675 mcache_purge(void *arg)
676 {
677 mcache_t *cp = arg;
678
679 mcache_bkt_purge(cp);
680 /*
681 * We cannot simply call mcache_cache_bkt_enable() from here as
682 * a bucket resize may be in flight and we would cause the CPU
683 * layers of the cache to point to different sizes. Therefore,
684 * we simply increment the enable count so that during the next
685 * periodic cache update the buckets can be reenabled.
686 */
687 lck_mtx_lock_spin(&cp->mc_sync_lock);
688 cp->mc_enable_cnt++;
689 lck_mtx_unlock(&cp->mc_sync_lock);
690 }
691
692 __private_extern__ boolean_t
693 mcache_purge_cache(mcache_t *cp, boolean_t async)
694 {
695 /*
696 * Purging a cache that has no per-CPU caches or is already
697 * in the process of being purged is rather pointless.
698 */
699 if (cp->mc_flags & MCF_NOCPUCACHE) {
700 return FALSE;
701 }
702
703 lck_mtx_lock_spin(&cp->mc_sync_lock);
704 if (cp->mc_purge_cnt > 0) {
705 lck_mtx_unlock(&cp->mc_sync_lock);
706 return FALSE;
707 }
708 cp->mc_purge_cnt++;
709 lck_mtx_unlock(&cp->mc_sync_lock);
710
711 if (async) {
712 mcache_dispatch(mcache_purge, cp);
713 } else {
714 mcache_purge(cp);
715 }
716
717 return TRUE;
718 }
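
/*
 * Illustrative sketch (not part of the build): a memory-pressure handler
 * might request an asynchronous purge, falling back to a plain working-set
 * reap if a purge is already in progress or per-CPU caching is disabled.
 *
 *	if (!mcache_purge_cache(cp, TRUE))
 *		mcache_reap_now(cp, FALSE);
 */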
719
720 /*
721 * Free a single object to a cache.
722 */
723 __private_extern__ void
724 mcache_free(mcache_t *cp, void *buf)
725 {
726 ((mcache_obj_t *)buf)->obj_next = NULL;
727 mcache_free_ext(cp, (mcache_obj_t *)buf);
728 }
729
730 /*
731 * Free one or more objects to a cache.
732 */
733 __private_extern__ void
734 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
735 {
736 mcache_cpu_t *ccp = MCACHE_CPU(cp);
737 mcache_bkttype_t *btp;
738 mcache_obj_t *nlist;
739 mcache_bkt_t *bkt;
740
741 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
742 (*cp->mc_slab_log)(0, list, FALSE);
743 }
744
745 /* Invoke the slab layer audit callback if auditing is enabled */
746 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
747 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
748 }
749
750 MCACHE_LOCK(&ccp->cc_lock);
751 for (;;) {
752 /*
753 * If there is space in the current CPU's filled bucket, put
754 * the object there and return once all objects are freed.
755 * Note the cast to unsigned integer takes care of the case
756 * where the bucket layer is disabled (when cc_objs is -1).
757 */
758 if ((unsigned int)ccp->cc_objs <
759 (unsigned int)ccp->cc_bktsize) {
760 /*
761 * Reverse the list while we place the object into the
762 * bucket; this effectively causes the most recently
763 * freed object(s) to be reused during allocation.
764 */
765 nlist = list->obj_next;
766 list->obj_next = (ccp->cc_objs == 0) ? NULL :
767 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
768 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
769 ccp->cc_free++;
770
771 if ((list = nlist) != NULL) {
772 continue;
773 }
774
775 /* We are done; return to caller */
776 MCACHE_UNLOCK(&ccp->cc_lock);
777
778 /* If there is a waiter below, notify it */
779 if (cp->mc_waiter_cnt > 0) {
780 mcache_notify(cp, MCN_RETRYALLOC);
781 }
782 return;
783 }
784
785 /*
786 * The CPU's filled bucket is full. If the previous filled
787 * bucket was empty, exchange and try again.
788 */
789 if (ccp->cc_pobjs == 0) {
790 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
791 continue;
792 }
793
794 /*
795 * If the bucket layer is disabled, free to slab. This can
796 * happen either because MCF_NOCPUCACHE is set, or because
797 * the bucket layer is currently being resized.
798 */
799 if (ccp->cc_bktsize == 0) {
800 break;
801 }
802
803 /*
804 * Both of the CPU's buckets are full; try to get an empty
805 * bucket from the bucket layer. Upon success, empty this
806 * CPU and place any full bucket into the full list.
807 */
808 bkt = mcache_bkt_alloc(cp, &cp->mc_empty);
809 if (bkt != NULL) {
810 if (ccp->cc_pfilled != NULL) {
811 mcache_bkt_free(cp, &cp->mc_full,
812 ccp->cc_pfilled);
813 }
814 mcache_cpu_refill(ccp, bkt, 0);
815 continue;
816 }
817 btp = cp->cache_bkttype;
818
819 /*
820 * We need an empty bucket to put our freed objects into
821 * but couldn't get an empty bucket from the bucket layer;
822 * attempt to allocate one. We do not want to block for
823 * allocation here, and if the bucket allocation fails
824 * we will simply fall through to the slab layer.
825 */
826 MCACHE_UNLOCK(&ccp->cc_lock);
827 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
828 MCACHE_LOCK(&ccp->cc_lock);
829
830 if (bkt != NULL) {
831 /*
832 * We have an empty bucket, but since we dropped the
833 * CPU lock above, the cache's bucket size may have
834 * changed. If so, free the bucket and try again.
835 */
836 if (ccp->cc_bktsize != btp->bt_bktsize) {
837 MCACHE_UNLOCK(&ccp->cc_lock);
838 mcache_free(btp->bt_cache, bkt);
839 MCACHE_LOCK(&ccp->cc_lock);
840 continue;
841 }
842
843 /*
844 * Store it in the bucket object since we'll
845 * need to refer to it during bucket destroy;
846 * we can't safely refer to cache_bkttype as
847 * the bucket lock may not be acquired then.
848 */
849 bkt->bkt_type = btp;
850
851 /*
852 * We have an empty bucket of the right size;
853 * add it to the bucket layer and try again.
854 */
855 mcache_bkt_free(cp, &cp->mc_empty, bkt);
856 continue;
857 }
858
859 /*
860 * The bucket layer has no empty buckets; free the
861 * object(s) directly to the slab layer.
862 */
863 break;
864 }
865 MCACHE_UNLOCK(&ccp->cc_lock);
866
867 /* If there is a waiter below, notify it */
868 if (cp->mc_waiter_cnt > 0) {
869 mcache_notify(cp, MCN_RETRYALLOC);
870 }
871
872 /* Advise the slab layer to purge the object(s) */
873 (*cp->mc_slab_free)(cp->mc_private, list,
874 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
875 }
876
877 /*
878 * Cache destruction routine.
879 */
880 __private_extern__ void
881 mcache_destroy(mcache_t *cp)
882 {
883 void **pbuf;
884
885 MCACHE_LIST_LOCK();
886 LIST_REMOVE(cp, mc_list);
887 MCACHE_LIST_UNLOCK();
888
889 mcache_bkt_purge(cp);
890
891 /*
892 * This cache is dead; there should be no further transactions.
893 * If it is still invoked, make sure that it induces a fault.
894 */
895 cp->mc_slab_alloc = NULL;
896 cp->mc_slab_free = NULL;
897 cp->mc_slab_audit = NULL;
898
899 lck_attr_free(cp->mc_bkt_lock_attr);
900 lck_grp_free(cp->mc_bkt_lock_grp);
901 lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
902
903 lck_attr_free(cp->mc_cpu_lock_attr);
904 lck_grp_free(cp->mc_cpu_lock_grp);
905 lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
906
907 lck_attr_free(cp->mc_sync_lock_attr);
908 lck_grp_free(cp->mc_sync_lock_grp);
909 lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
910
911 /*
912 * TODO: We need to destroy the zone here, but cannot do it
913 * because there is currently no way to do so. Until then,
914 * the memory allocated for the zone structure is leaked.
915 * Once zone destruction becomes possible, uncomment these lines:
916 *
917 * if (cp->mc_slab_zone != NULL) {
918 * zdestroy(cp->mc_slab_zone);
919 * cp->mc_slab_zone = NULL;
920 * }
921 */
922
923 /* Get the original address since we're about to free it */
924 pbuf = (void **)((intptr_t)cp - sizeof(void *));
925
926 zfree(mcache_zone, *pbuf);
927 }
928
929 /*
930 * Internal slab allocator used as a backend for simple caches. The current
931 * implementation uses the zone allocator for simplicity.
932 */
933 static unsigned int
934 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
935 int wait)
936 {
937 #pragma unused(wait)
938 mcache_t *cp = arg;
939 unsigned int need = num;
940 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
941 u_int32_t flags = cp->mc_flags;
942 void *buf, *base, **pbuf;
943 mcache_obj_t **list = *plist;
944
945 *list = NULL;
946
947 for (;;) {
948 buf = zalloc(cp->mc_slab_zone);
949 if (buf == NULL) {
950 break;
951 }
952
953 /* Get the aligned base address for this object */
954 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
955 cp->mc_align);
956
957 /*
958 * Wind back a pointer size from the aligned base and
959 * save the original address so we can free it later.
960 */
961 pbuf = (void **)((intptr_t)base - sizeof(void *));
962 *pbuf = buf;
963
964 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
965 ((intptr_t)buf + cp->mc_chunksize));
966
967 /*
968 * If auditing is enabled, patternize the contents of
969 * the buffer starting from the 64-bit aligned base to
970 * the end of the buffer; the length is rounded up to
971 * the nearest 64-bit multiple; this is because we use
972 * 64-bit memory accesses to set/check the pattern.
973 */
974 if (flags & MCF_DEBUG) {
975 VERIFY(((intptr_t)base + rsize) <=
976 ((intptr_t)buf + cp->mc_chunksize));
977 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
978 }
979
980 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
981 *list = (mcache_obj_t *)base;
982
983 (*list)->obj_next = NULL;
984 list = *plist = &(*list)->obj_next;
985
986 /* If we got them all, return to mcache */
987 if (--need == 0) {
988 break;
989 }
990 }
991
992 return num - need;
993 }
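
/*
 * Layout of each zone chunk handed out above (addresses grow to the right);
 * the original zone address is stashed in the pointer slot just before the
 * aligned base so that mcache_slab_free() can recover it:
 *
 *	buf ... [pad] [saved buf pointer] base [object of mc_bufsize bytes] ...
 */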
994
995 /*
996 * Internal slab deallocator used as a backend for simple caches.
997 */
998 static void
999 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
1000 {
1001 mcache_t *cp = arg;
1002 mcache_obj_t *nlist;
1003 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1004 u_int32_t flags = cp->mc_flags;
1005 void *base;
1006 void **pbuf;
1007
1008 for (;;) {
1009 nlist = list->obj_next;
1010 list->obj_next = NULL;
1011
1012 base = list;
1013 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1014
1015 /* Get the original address since we're about to free it */
1016 pbuf = (void **)((intptr_t)base - sizeof(void *));
1017
1018 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1019 ((intptr_t)*pbuf + cp->mc_chunksize));
1020
1021 if (flags & MCF_DEBUG) {
1022 VERIFY(((intptr_t)base + rsize) <=
1023 ((intptr_t)*pbuf + cp->mc_chunksize));
1024 mcache_audit_free_verify(NULL, base, 0, rsize);
1025 }
1026
1027 /* Free it to zone */
1028 zfree(cp->mc_slab_zone, *pbuf);
1029
1030 /* No more objects to free; return to mcache */
1031 if ((list = nlist) == NULL) {
1032 break;
1033 }
1034 }
1035 }
1036
1037 /*
1038 * Internal slab auditor for simple caches.
1039 */
1040 static void
1041 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1042 {
1043 mcache_t *cp = arg;
1044 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1045 void *base, **pbuf;
1046
1047 while (list != NULL) {
1048 mcache_obj_t *next = list->obj_next;
1049
1050 base = list;
1051 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1052
1053 /* Get the original address */
1054 pbuf = (void **)((intptr_t)base - sizeof(void *));
1055
1056 VERIFY(((intptr_t)base + rsize) <=
1057 ((intptr_t)*pbuf + cp->mc_chunksize));
1058
1059 if (!alloc) {
1060 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1061 } else {
1062 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1063 }
1064
1065 list = list->obj_next = next;
1066 }
1067 }
1068
1069 /*
1070 * Refill the CPU's filled bucket with bkt and save the previous one.
1071 */
1072 static void
1073 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1074 {
1075 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1076 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1077 ASSERT(ccp->cc_bktsize > 0);
1078
1079 ccp->cc_pfilled = ccp->cc_filled;
1080 ccp->cc_pobjs = ccp->cc_objs;
1081 ccp->cc_filled = bkt;
1082 ccp->cc_objs = objs;
1083 }
1084
1085 /*
1086 * Allocate a bucket from the bucket layer.
1087 */
1088 static mcache_bkt_t *
1089 mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp)
1090 {
1091 mcache_bkt_t *bkt;
1092
1093 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1094 /*
1095 * The bucket layer lock is held by another CPU; increase
1096 * the contention count so that we can later adjust the
1097 * bucket size accordingly.
1098 */
1099 MCACHE_LOCK(&cp->mc_bkt_lock);
1100 cp->mc_bkt_contention++;
1101 }
1102
1103 if ((bkt = blp->bl_list) != NULL) {
1104 blp->bl_list = bkt->bkt_next;
1105 if (--blp->bl_total < blp->bl_min) {
1106 blp->bl_min = blp->bl_total;
1107 }
1108 blp->bl_alloc++;
1109 }
1110
1111 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1112
1113 return bkt;
1114 }
1115
1116 /*
1117 * Free a bucket to the bucket layer.
1118 */
1119 static void
1120 mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1121 {
1122 MCACHE_LOCK(&cp->mc_bkt_lock);
1123
1124 bkt->bkt_next = blp->bl_list;
1125 blp->bl_list = bkt;
1126 blp->bl_total++;
1127
1128 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1129 }
1130
1131 /*
1132 * Enable the bucket layer of a cache.
1133 */
1134 static void
1135 mcache_cache_bkt_enable(mcache_t *cp)
1136 {
1137 mcache_cpu_t *ccp;
1138 unsigned int cpu;
1139
1140 if (cp->mc_flags & MCF_NOCPUCACHE) {
1141 return;
1142 }
1143
1144 for (cpu = 0; cpu < ncpu; cpu++) {
1145 ccp = &cp->mc_cpu[cpu];
1146 MCACHE_LOCK(&ccp->cc_lock);
1147 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1148 MCACHE_UNLOCK(&ccp->cc_lock);
1149 }
1150 }
1151
1152 /*
1153 * Purge all buckets from a cache and disable its bucket layer.
1154 */
1155 static void
1156 mcache_bkt_purge(mcache_t *cp)
1157 {
1158 mcache_cpu_t *ccp;
1159 mcache_bkt_t *bp, *pbp;
1160 int objs, pobjs;
1161 unsigned int cpu;
1162
1163 for (cpu = 0; cpu < ncpu; cpu++) {
1164 ccp = &cp->mc_cpu[cpu];
1165
1166 MCACHE_LOCK(&ccp->cc_lock);
1167
1168 bp = ccp->cc_filled;
1169 pbp = ccp->cc_pfilled;
1170 objs = ccp->cc_objs;
1171 pobjs = ccp->cc_pobjs;
1172 ccp->cc_filled = NULL;
1173 ccp->cc_pfilled = NULL;
1174 ccp->cc_objs = -1;
1175 ccp->cc_pobjs = -1;
1176 ccp->cc_bktsize = 0;
1177
1178 MCACHE_UNLOCK(&ccp->cc_lock);
1179
1180 if (bp != NULL) {
1181 mcache_bkt_destroy(cp, bp, objs);
1182 }
1183 if (pbp != NULL) {
1184 mcache_bkt_destroy(cp, pbp, pobjs);
1185 }
1186 }
1187
1188 mcache_bkt_ws_zero(cp);
1189 mcache_bkt_ws_reap(cp);
1190 }
1191
1192 /*
1193 * Free one or more objects in the bucket to the slab layer,
1194 * and also free the bucket itself.
1195 */
1196 static void
1197 mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
1198 {
1199 if (nobjs > 0) {
1200 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1201
1202 if (cp->mc_flags & MCF_DEBUG) {
1203 mcache_obj_t *o = top;
1204 int cnt = 0;
1205
1206 /*
1207 * Verify that the chain of objects in the bucket is
1208 * valid. Any mismatch here means a mistake when the
1209 * object(s) were freed to the CPU layer, so we panic.
1210 */
1211 while (o != NULL) {
1212 o = o->obj_next;
1213 ++cnt;
1214 }
1215 if (cnt != nobjs) {
1216 panic("mcache_bkt_destroy: %s cp %p corrupted "
1217 "list in bkt %p (nobjs %d actual %d)\n",
1218 cp->mc_name, (void *)cp, (void *)bkt,
1219 nobjs, cnt);
1220 /* NOTREACHED */
1221 __builtin_unreachable();
1222 }
1223 }
1224
1225 /* Advise the slab layer to purge the object(s) */
1226 (*cp->mc_slab_free)(cp->mc_private, top,
1227 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1228 }
1229 mcache_free(bkt->bkt_type->bt_cache, bkt);
1230 }
1231
1232 /*
1233 * Update the bucket layer working set statistics.
1234 */
1235 static void
1236 mcache_bkt_ws_update(mcache_t *cp)
1237 {
1238 MCACHE_LOCK(&cp->mc_bkt_lock);
1239
1240 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1241 cp->mc_full.bl_min = cp->mc_full.bl_total;
1242 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1243 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1244
1245 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1246 }
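
/*
 * Working-set example: if the full-bucket list never dropped below 4
 * entries during the last interval (bl_min == 4), those 4 buckets were
 * never needed to satisfy an allocation, so up to 4 of them become
 * eligible for reaping (bl_reaplimit) on the next reap pass.
 */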
1247
1248 /*
1249 * Mark everything as eligible for reaping (working set is zero).
1250 */
1251 static void
1252 mcache_bkt_ws_zero(mcache_t *cp)
1253 {
1254 MCACHE_LOCK(&cp->mc_bkt_lock);
1255
1256 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1257 cp->mc_full.bl_min = cp->mc_full.bl_total;
1258 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1259 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1260
1261 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1262 }
1263
1264 /*
1265 * Reap all buckets that are beyond the working set.
1266 */
1267 static void
1268 mcache_bkt_ws_reap(mcache_t *cp)
1269 {
1270 long reap;
1271 mcache_bkt_t *bkt;
1272
1273 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1274 while (reap-- &&
1275 (bkt = mcache_bkt_alloc(cp, &cp->mc_full)) != NULL) {
1276 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1277 }
1278
1279 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1280 while (reap-- &&
1281 (bkt = mcache_bkt_alloc(cp, &cp->mc_empty)) != NULL) {
1282 mcache_bkt_destroy(cp, bkt, 0);
1283 }
1284 }
1285
1286 static void
1287 mcache_reap_timeout(thread_call_param_t dummy __unused,
1288 thread_call_param_t arg)
1289 {
1290 volatile UInt32 *flag = arg;
1291
1292 ASSERT(flag == &mcache_reaping);
1293
1294 *flag = 0;
1295 }
1296
1297 static void
1298 mcache_reap_done(void *flag)
1299 {
1300 uint64_t deadline, leeway;
1301
1302 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1303 &deadline);
1304 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1305 NSEC_PER_SEC, &leeway);
1306 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1307 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1308 }
1309
1310 static void
1311 mcache_reap_start(void *arg)
1312 {
1313 UInt32 *flag = arg;
1314
1315 ASSERT(flag == &mcache_reaping);
1316
1317 mcache_applyall(mcache_cache_reap);
1318 mcache_dispatch(mcache_reap_done, flag);
1319 }
1320
1321 __private_extern__ void
1322 mcache_reap(void)
1323 {
1324 UInt32 *flag = &mcache_reaping;
1325
1326 if (mcache_llock_owner == current_thread() ||
1327 !OSCompareAndSwap(0, 1, flag)) {
1328 return;
1329 }
1330
1331 mcache_dispatch(mcache_reap_start, flag);
1332 }
1333
1334 __private_extern__ void
1335 mcache_reap_now(mcache_t *cp, boolean_t purge)
1336 {
1337 if (purge) {
1338 mcache_bkt_purge(cp);
1339 mcache_cache_bkt_enable(cp);
1340 } else {
1341 mcache_bkt_ws_zero(cp);
1342 mcache_bkt_ws_reap(cp);
1343 }
1344 }
1345
1346 static void
1347 mcache_cache_reap(mcache_t *cp)
1348 {
1349 mcache_bkt_ws_reap(cp);
1350 }
1351
1352 /*
1353 * Performs periodic maintenance on a cache.
1354 */
1355 static void
1356 mcache_cache_update(mcache_t *cp)
1357 {
1358 int need_bkt_resize = 0;
1359 int need_bkt_reenable = 0;
1360
1361 lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1362
1363 mcache_bkt_ws_update(cp);
1364
1365 /*
1366 * Cache resize and post-purge reenable are mutually exclusive.
1367 * If the cache was previously purged, there is no point in
1368 * increasing the bucket size, as there was an indication of
1369 * memory pressure on the system.
1370 */
1371 lck_mtx_lock_spin(&cp->mc_sync_lock);
1372 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
1373 need_bkt_reenable = 1;
1374 }
1375 lck_mtx_unlock(&cp->mc_sync_lock);
1376
1377 MCACHE_LOCK(&cp->mc_bkt_lock);
1378 /*
1379 * If the contention count is greater than the threshold, and if
1380 * we are not already at the maximum bucket size, increase it.
1381 * Otherwise, if this cache was previously purged by the user
1382 * then we simply reenable it.
1383 */
1384 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1385 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1386 mcache_bkt_contention && !need_bkt_reenable) {
1387 need_bkt_resize = 1;
1388 }
1389
1390 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1391 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1392
1393 if (need_bkt_resize) {
1394 mcache_dispatch(mcache_cache_bkt_resize, cp);
1395 } else if (need_bkt_reenable) {
1396 mcache_dispatch(mcache_cache_enable, cp);
1397 }
1398 }
1399
1400 /*
1401 * Recompute a cache's bucket size. This is an expensive operation
1402 * and should not be done frequently; larger buckets provide a
1403 * higher transfer rate with the bucket layer, while smaller buckets
1404 * reduce memory consumption.
1405 */
1406 static void
1407 mcache_cache_bkt_resize(void *arg)
1408 {
1409 mcache_t *cp = arg;
1410 mcache_bkttype_t *btp = cp->cache_bkttype;
1411
1412 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1413 mcache_bkt_purge(cp);
1414
1415 /*
1416 * Upgrade to the next bucket type with a larger bucket size;
1417 * temporarily set the previous contention snapshot to a
1418 * negative number to prevent an unnecessary resize request.
1419 */
1420 MCACHE_LOCK(&cp->mc_bkt_lock);
1421 cp->cache_bkttype = ++btp;
1422 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1423 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1424
1425 mcache_cache_enable(cp);
1426 }
1427 }
1428
1429 /*
1430 * Reenable a cache whose buckets were previously disabled by a purge.
1431 */
1432 static void
1433 mcache_cache_enable(void *arg)
1434 {
1435 mcache_t *cp = arg;
1436
1437 lck_mtx_lock_spin(&cp->mc_sync_lock);
1438 cp->mc_purge_cnt = 0;
1439 cp->mc_enable_cnt = 0;
1440 lck_mtx_unlock(&cp->mc_sync_lock);
1441
1442 mcache_cache_bkt_enable(cp);
1443 }
1444
1445 static void
1446 mcache_update_timeout(__unused void *arg)
1447 {
1448 uint64_t deadline, leeway;
1449
1450 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1451 &deadline);
1452 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1453 NSEC_PER_SEC, &leeway);
1454 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1455 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1456 }
1457
1458 static void
1459 mcache_update(thread_call_param_t arg __unused,
1460 thread_call_param_t dummy __unused)
1461 {
1462 mcache_applyall(mcache_cache_update);
1463 mcache_update_timeout(NULL);
1464 }
1465
1466 static void
1467 mcache_applyall(void (*func)(mcache_t *))
1468 {
1469 mcache_t *cp;
1470
1471 MCACHE_LIST_LOCK();
1472 LIST_FOREACH(cp, &mcache_head, mc_list) {
1473 func(cp);
1474 }
1475 MCACHE_LIST_UNLOCK();
1476 }
1477
1478 static void
1479 mcache_dispatch(void (*func)(void *), void *arg)
1480 {
1481 ASSERT(func != NULL);
1482 timeout(func, arg, hz / 1000);
1483 }
1484
1485 __private_extern__ void
1486 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1487 struct timeval *base_ts)
1488 {
1489 struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
1490 void *stack[MCACHE_STACK_DEPTH + 1];
1491 struct mca_trn *transaction;
1492
1493 transaction = &mca->mca_trns[mca->mca_next_trn];
1494
1495 mca->mca_addr = addr;
1496 mca->mca_cache = cp;
1497
1498 transaction->mca_thread = current_thread();
1499
1500 bzero(stack, sizeof(stack));
1501 transaction->mca_depth = (uint16_t)OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1502 bcopy(&stack[1], transaction->mca_stack,
1503 sizeof(transaction->mca_stack));
1504
1505 microuptime(&now);
1506 if (base_ts != NULL) {
1507 base = *base_ts;
1508 }
1509 /* tstamp is in ms relative to base_ts */
1510 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1511 if ((now.tv_sec - base.tv_sec) > 0) {
1512 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1513 }
1514
1515 mca->mca_next_trn =
1516 (mca->mca_next_trn + 1) % mca_trn_max;
1517 }
1518
1519 /*
1520 * N.B.: mcache_set_pattern(), mcache_verify_pattern() and
1521 * mcache_verify_set_pattern() are marked as noinline to prevent the
1522 * compiler from aliasing pointers when they are inlined inside the callers
1523 * (e.g. mcache_audit_free_verify_set()), which would be undefined behavior.
1524 */
1525 __private_extern__ OS_NOINLINE void
1526 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1527 {
1528 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1529 u_int64_t *buf = (u_int64_t *)buf_arg;
1530
1531 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1532 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1533
1534 while (buf < buf_end) {
1535 *buf++ = pattern;
1536 }
1537 }
1538
1539 __private_extern__ OS_NOINLINE void *
1540 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1541 {
1542 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1543 u_int64_t *buf;
1544
1545 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1546 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1547
1548 for (buf = buf_arg; buf < buf_end; buf++) {
1549 if (*buf != pattern) {
1550 return buf;
1551 }
1552 }
1553 return NULL;
1554 }
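
/*
 * Illustrative flow (not part of the build): with MCF_DEBUG set, a freed
 * buffer is filled with MCACHE_FREE_PATTERN and re-verified on the next
 * allocation, so a write after free shows up as a mismatch and panics:
 *
 *	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
 *	...buffer sits free; any store to it corrupts the pattern...
 *	if (mcache_verify_pattern(MCACHE_FREE_PATTERN, base, rsize) != NULL)
 *		-> mcache_audit_panic()
 */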
1555
1556 OS_NOINLINE static void *
1557 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1558 size_t size)
1559 {
1560 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1561 u_int64_t *buf;
1562
1563 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1564 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1565
1566 for (buf = buf_arg; buf < buf_end; buf++) {
1567 if (*buf != old) {
1568 mcache_set_pattern(old, buf_arg,
1569 (uintptr_t)buf - (uintptr_t)buf_arg);
1570 return buf;
1571 }
1572 *buf = new;
1573 }
1574 return NULL;
1575 }
1576
1577 __private_extern__ void
1578 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1579 size_t size)
1580 {
1581 void *addr;
1582 u_int64_t *oaddr64;
1583 mcache_obj_t *next;
1584
1585 addr = (void *)((uintptr_t)base + offset);
1586 next = ((mcache_obj_t *)addr)->obj_next;
1587
1588 /* For the "obj_next" pointer in the buffer */
1589 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1590 *oaddr64 = MCACHE_FREE_PATTERN;
1591
1592 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1593 (caddr_t)base, size)) != NULL) {
1594 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1595 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1596 /* NOTREACHED */
1597 }
1598 ((mcache_obj_t *)addr)->obj_next = next;
1599 }
1600
1601 __private_extern__ void
1602 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1603 size_t size)
1604 {
1605 void *addr;
1606 u_int64_t *oaddr64;
1607 mcache_obj_t *next;
1608
1609 addr = (void *)((uintptr_t)base + offset);
1610 next = ((mcache_obj_t *)addr)->obj_next;
1611
1612 /* For the "obj_next" pointer in the buffer */
1613 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1614 *oaddr64 = MCACHE_FREE_PATTERN;
1615
1616 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1617 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1618 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1619 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1620 /* NOTREACHED */
1621 }
1622 ((mcache_obj_t *)addr)->obj_next = next;
1623 }
1624
1625 #undef panic
1626
1627 #define DUMP_TRN_FMT() \
1628 "%s transaction thread %p saved PC stack (%d deep):\n" \
1629 "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1630 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1631
1632 #define DUMP_TRN_FIELDS(s, x) \
1633 s, \
1634 mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1635 mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1636 mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1637 mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1638 mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1639 mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1640 mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1641 mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1642 mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1643
1644 #define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1645 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1646
1647 __private_extern__ char *
1648 mcache_dump_mca(mcache_audit_t *mca)
1649 {
1650 if (mca_dump_buf == NULL) {
1651 return NULL;
1652 }
1653
1654 snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1655 "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1656 DUMP_TRN_FMT()
1657 DUMP_TRN_FMT(),
1658
1659 mca, mca->mca_addr, mca->mca_cache,
1660 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1661 mca->mca_next_trn,
1662
1663 DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1664 DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1665
1666 return mca_dump_buf;
1667 }
1668
1669 __private_extern__ void
1670 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1671 int64_t expected, int64_t got)
1672 {
1673 if (mca == NULL) {
1674 panic("mcache_audit: buffer %p modified after free at "
1675 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1676 offset, got, expected);
1677 /* NOTREACHED */
1678 __builtin_unreachable();
1679 }
1680
1681 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1682 "(0x%llx instead of 0x%llx)\n%s\n",
1683 addr, offset, got, expected, mcache_dump_mca(mca));
1684 /* NOTREACHED */
1685 __builtin_unreachable();
1686 }
1687
1688 __attribute__((noinline, cold, not_tail_called, noreturn))
1689 __private_extern__ int
1690 assfail(const char *a, const char *f, int l)
1691 {
1692 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1693 /* NOTREACHED */
1694 __builtin_unreachable();
1695 }