[apple/xnu.git] / bsd / kern / mcache.c (xnu-1228)
1 /*
2 * Copyright (c) 2006-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides greater flexibility by allowing the user to define a custom
40 * slab allocator (instead of the default zone allocator). Finally, no
41 * object construction/destruction takes place at the moment, although
42 * this could be added in the future to improve efficiency.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57
58 #include <libkern/libkern.h>
59 #include <libkern/OSAtomic.h>
60 #include <libkern/OSDebug.h>
61
62 #include <mach/vm_param.h>
63 #include <machine/limits.h>
64 #include <machine/machine_routines.h>
65
66 #include <string.h>
67
68 #include <sys/mcache.h>
69
70 #define MCACHE_SIZE(n) \
71 ((size_t)(&((mcache_t *)0)->mc_cpu[n]))
72
73 /* Allocate extra in case we need to manually align the pointer */
74 #define MCACHE_ALLOC_SIZE \
75 (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_SIZE)
76
77 #define MCACHE_CPU(c) \
78 (mcache_cpu_t *)((char *)(c) + MCACHE_SIZE(cpu_number()))
79
80 /*
81 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
82 * to serialize accesses to the global list of caches in the system.
83 * They also record the thread currently running in the critical
84 * section, so that we can avoid recursive requests to reap the
85 * caches when memory runs low.
86 */
87 #define MCACHE_LIST_LOCK() { \
88 lck_mtx_lock(mcache_llock); \
89 mcache_llock_owner = current_thread(); \
90 }
91
92 #define MCACHE_LIST_UNLOCK() { \
93 mcache_llock_owner = NULL; \
94 lck_mtx_unlock(mcache_llock); \
95 }
96
97 #define MCACHE_LOCK(l) lck_mtx_lock(l)
98 #define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
99 #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
100
101 /* This should be in a header file */
102 #define atomic_add_32(a, n) ((void) OSAddAtomic(n, (volatile SInt32 *)a))
103
104 static int ncpu;
105 static lck_mtx_t *mcache_llock;
106 static struct thread *mcache_llock_owner;
107 static lck_attr_t *mcache_llock_attr;
108 static lck_grp_t *mcache_llock_grp;
109 static lck_grp_attr_t *mcache_llock_grp_attr;
110 static struct zone *mcache_zone;
111 static unsigned int mcache_reap_interval;
112 static UInt32 mcache_reaping;
113 static int mcache_ready;
114 static int mcache_updating;
115
116 static int mcache_bkt_contention = 3;
117 #if DEBUG
118 static unsigned int mcache_flags = MCF_DEBUG;
119 #else
120 static unsigned int mcache_flags = 0;
121 #endif
122
123 #define DUMP_MCA_BUF_SIZE 512
124 static char *mca_dump_buf;
125
126 static mcache_bkttype_t mcache_bkttype[] = {
127 { 1, 4096, 32768, NULL },
128 { 3, 2048, 16384, NULL },
129 { 7, 1024, 12288, NULL },
130 { 15, 256, 8192, NULL },
131 { 31, 64, 4096, NULL },
132 { 47, 0, 2048, NULL },
133 { 63, 0, 1024, NULL },
134 { 95, 0, 512, NULL },
135 { 143, 0, 256, NULL },
136 { 165, 0, 0, NULL },
137 };
138
139 static mcache_t *mcache_create_common(const char *, size_t, size_t,
140 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_notifyfn_t,
141 void *, u_int32_t, int, int);
142 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
143 unsigned int, int);
144 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
145 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
146 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
147 static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
148 mcache_bkttype_t **);
149 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
150 static void mcache_cache_bkt_enable(mcache_t *);
151 static void mcache_bkt_purge(mcache_t *);
152 static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
153 mcache_bkt_t *, int);
154 static void mcache_bkt_ws_update(mcache_t *);
155 static void mcache_bkt_ws_reap(mcache_t *);
156 static void mcache_dispatch(void (*)(void *), void *);
157 static void mcache_cache_reap(mcache_t *);
158 static void mcache_cache_update(mcache_t *);
159 static void mcache_cache_bkt_resize(void *);
160 static void mcache_cache_enable(void *);
161 static void mcache_update(void *);
162 static void mcache_update_timeout(void *);
163 static void mcache_applyall(void (*)(mcache_t *));
164 static void mcache_reap_start(void *);
165 static void mcache_reap_done(void *);
166 static void mcache_reap_timeout(void *);
167 static void mcache_notify(mcache_t *, u_int32_t);
168 static void mcache_purge(void *);
169
170 static LIST_HEAD(, mcache) mcache_head;
171 mcache_t *mcache_audit_cache;
172
173 /*
174 * Initialize the framework; this is currently called as part of BSD init.
175 */
176 __private_extern__ void
177 mcache_init(void)
178 {
179 mcache_bkttype_t *btp;
180 unsigned int i;
181 char name[32];
182
183 ncpu = ml_get_max_cpus();
184
185 mcache_llock_grp_attr = lck_grp_attr_alloc_init();
186 mcache_llock_grp = lck_grp_alloc_init("mcache.list",
187 mcache_llock_grp_attr);
188 mcache_llock_attr = lck_attr_alloc_init();
189 mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
190
191 mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
192 PAGE_SIZE, "mcache");
193 if (mcache_zone == NULL)
194 panic("mcache_init: failed to allocate mcache zone\n");
195
196 LIST_INIT(&mcache_head);
197
198 for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
199 btp = &mcache_bkttype[i];
200 (void) snprintf(name, sizeof (name), "bkt_%d",
201 btp->bt_bktsize);
202 btp->bt_cache = mcache_create(name,
203 (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
204 }
205
206 PE_parse_boot_arg("mcache_flags", &mcache_flags);
207 mcache_flags &= MCF_FLAGS_MASK;
208
209 mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
210 0, 0, MCR_SLEEP);
211
212 mcache_reap_interval = 15 * hz;
213 mcache_applyall(mcache_cache_bkt_enable);
214 mcache_ready = 1;
215 }
216
217 /*
218 * Return the global mcache flags.
219 */
220 __private_extern__ unsigned int
221 mcache_getflags(void)
222 {
223 return (mcache_flags);
224 }
225
226 /*
227 * Create a cache using the zone allocator as the backend slab allocator.
228 * The caller may specify any alignment for the object; if it specifies 0
229 * the default alignment (MCACHE_ALIGN) will be used.
230 */
231 __private_extern__ mcache_t *
232 mcache_create(const char *name, size_t bufsize, size_t align,
233 u_int32_t flags, int wait)
234 {
235 return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
236 mcache_slab_free, mcache_slab_audit, NULL, NULL, flags, 1, wait));
237 }
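
/*
 * Illustrative sketch of typical client usage of the simple-cache API:
 * create a cache of fixed-size objects with the default alignment,
 * allocate and free a single object, then destroy the cache.  The
 * example_obj_t type and example_cache_demo() name are hypothetical.
 */
typedef struct example_obj {
	u_int32_t	eo_id;
	u_int32_t	eo_flags;
} example_obj_t;

static void
example_cache_demo(void)
{
	mcache_t *cp;
	example_obj_t *eo;

	/* 0 selects the default alignment (MCACHE_ALIGN); MCR_SLEEP may block */
	if ((cp = mcache_create("example", sizeof (example_obj_t),
	    0, 0, MCR_SLEEP)) == NULL)
		return;

	/* Blocking single-object allocation, served from the CPU layer if possible */
	if ((eo = mcache_alloc(cp, MCR_SLEEP)) != NULL) {
		eo->eo_id = 1;
		/* Return the object; it goes to the CPU's filled bucket when enabled */
		mcache_free(cp, eo);
	}

	/* Purge all layers and release the cache */
	mcache_destroy(cp);
}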
238
239 /*
240 * Create a cache using a custom backend slab allocator. Since the caller
241 * is responsible for allocation, no alignment guarantee will be provided
242 * by this framework.
243 */
244 __private_extern__ mcache_t *
245 mcache_create_ext(const char *name, size_t bufsize,
246 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
247 mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int wait)
248 {
249 return (mcache_create_common(name, bufsize, 0, allocfn,
250 freefn, auditfn, notifyfn, arg, flags, 0, wait));
251 }
252
253 /*
254 * Common cache creation routine.
255 */
256 static mcache_t *
257 mcache_create_common(const char *name, size_t bufsize, size_t align,
258 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
259 mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int need_zone,
260 int wait)
261 {
262 mcache_bkttype_t *btp;
263 mcache_t *cp = NULL;
264 size_t chunksize;
265 void *buf, **pbuf;
266 int c;
267 char lck_name[64];
268
269 /* If auditing is on and print buffer is NULL, allocate it now */
270 if ((flags & MCF_AUDIT) && mca_dump_buf == NULL) {
271 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
272 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
273 malloc_wait | M_ZERO);
274 if (mca_dump_buf == NULL)
275 return (NULL);
276 }
277
278 if (!(wait & MCR_NOSLEEP))
279 buf = zalloc(mcache_zone);
280 else
281 buf = zalloc_noblock(mcache_zone);
282
283 if (buf == NULL)
284 goto fail;
285
286 bzero(buf, MCACHE_ALLOC_SIZE);
287
288 /*
289 * If the memory we got is not cache-aligned, round it up
290 * accordingly. This is needed in order to get the rest of the
291 * structure members aligned properly. It also means that the
292 * memory span gets shifted due to the round-up, but that is
293 * fine since we've allocated extra space for this.
294 */
295 cp = (mcache_t *)
296 P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_SIZE);
297 pbuf = (void **)((intptr_t)cp - sizeof (void *));
298 *pbuf = buf;
299
300 /*
301 * Alignment is guaranteed only when we use the internal slab
302 * allocator (currently set to use the zone allocator).
303 */
304 if (!need_zone)
305 align = 1;
306 else if (align == 0)
307 align = MCACHE_ALIGN;
308
309 if ((align & (align - 1)) != 0)
310 panic("mcache_create: bad alignment %lu", align);
311
312 cp->mc_align = align;
313 cp->mc_slab_alloc = allocfn;
314 cp->mc_slab_free = freefn;
315 cp->mc_slab_audit = auditfn;
316 cp->mc_slab_notify = notifyfn;
317 cp->mc_private = need_zone ? cp : arg;
318 cp->mc_bufsize = bufsize;
319 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
320
321 (void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);
322
323 (void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
324 cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
325 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
326 cp->mc_cpu_lock_grp_attr);
327 cp->mc_cpu_lock_attr = lck_attr_alloc_init();
328
329 /*
330 * Allocation chunk size is the object's size plus any extra size
331 * needed to satisfy the object's alignment. It is enforced to be
332 * at least the size of an LP64 pointer to simplify auditing and to
333 * handle multiple-element allocation requests, where the elements
334 * returned are linked together in a list.
335 */
336 chunksize = MAX(bufsize, sizeof (u_int64_t));
337 if (need_zone) {
338 /* Enforce 64-bit minimum alignment for zone-based buffers */
339 align = MAX(align, sizeof (u_int64_t));
340 chunksize += sizeof (void *) + align;
341 chunksize = P2ROUNDUP(chunksize, align);
342 if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
343 PAGE_SIZE, cp->mc_name)) == NULL)
344 goto fail;
345 zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
346 }
347 cp->mc_chunksize = chunksize;
348
349 /*
350 * Initialize the bucket layer.
351 */
352 (void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
353 cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
354 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
355 cp->mc_bkt_lock_grp_attr);
356 cp->mc_bkt_lock_attr = lck_attr_alloc_init();
357 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
358 cp->mc_bkt_lock_attr);
359
360 (void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
361 cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
362 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
363 cp->mc_sync_lock_grp_attr);
364 cp->mc_sync_lock_attr = lck_attr_alloc_init();
365 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
366 cp->mc_sync_lock_attr);
367
368 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
369 continue;
370
371 cp->cache_bkttype = btp;
372
373 /*
374 * Initialize the CPU layer. Each per-CPU structure is aligned
375 * on the CPU cache line boundary to prevent false sharing.
376 */
377 for (c = 0; c < ncpu; c++) {
378 mcache_cpu_t *ccp = &cp->mc_cpu[c];
379
380 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_SIZE));
381 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
382 cp->mc_cpu_lock_attr);
383 ccp->cc_objs = -1;
384 ccp->cc_pobjs = -1;
385 }
386
387 if (mcache_ready)
388 mcache_cache_bkt_enable(cp);
389
390 /* TODO: dynamically create sysctl for stats */
391
392 MCACHE_LIST_LOCK();
393 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
394 MCACHE_LIST_UNLOCK();
395
396 /*
397 * If cache buckets are enabled and this is the first cache
398 * created, start the periodic cache update.
399 */
400 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
401 mcache_updating = 1;
402 mcache_update_timeout(NULL);
403 }
404 if (cp->mc_flags & MCF_DEBUG) {
405 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
406 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
407 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
408 }
409 return (cp);
410
411 fail:
412 if (buf != NULL)
413 zfree(mcache_zone, buf);
414 return (NULL);
415 }
416
417 /*
418 * Allocate one or more objects from a cache.
419 */
420 __private_extern__ unsigned int
421 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
422 {
423 mcache_cpu_t *ccp;
424 mcache_obj_t **top = &(*list);
425 mcache_bkt_t *bkt;
426 unsigned int need = num;
427 boolean_t nwretry = FALSE;
428
429 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
430 VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));
431
432 ASSERT(list != NULL);
433 *list = NULL;
434
435 if (num == 0)
436 return (0);
437
438 retry_alloc:
439 /* We may not always be running on the same CPU in case of retries */
440 ccp = MCACHE_CPU(cp);
441
442 MCACHE_LOCK(&ccp->cc_lock);
443 for (;;) {
444 /*
445 * If we have an object in the current CPU's filled bucket,
446 * chain the object to any previous objects and return if
447 * we've satisfied the number of requested objects.
448 */
449 if (ccp->cc_objs > 0) {
450 mcache_obj_t *tail;
451 int objs;
452
453 /*
454 * Objects in the bucket are already linked together
455 * with the most recently freed object at the head of
456 * the list; grab as many objects as we can.
457 */
458 objs = MIN((unsigned int)ccp->cc_objs, need);
459 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
460 ccp->cc_objs -= objs;
461 ccp->cc_alloc += objs;
462
463 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
464 list = &tail->obj_next;
465 *list = NULL;
466
467 /* If we got them all, return to caller */
468 if ((need -= objs) == 0) {
469 MCACHE_UNLOCK(&ccp->cc_lock);
470 if (cp->mc_flags & MCF_DEBUG)
471 goto debug_alloc;
472
473 return (num);
474 }
475 }
476
477 /*
478 * The CPU's filled bucket is empty. If the previous filled
479 * bucket was full, exchange and try again.
480 */
481 if (ccp->cc_pobjs > 0) {
482 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
483 continue;
484 }
485
486 /*
487 * If the bucket layer is disabled, allocate from slab. This
488 * can happen either because MCF_NOCPUCACHE is set, or because
489 * the bucket layer is currently being resized.
490 */
491 if (ccp->cc_bktsize == 0)
492 break;
493
494 /*
495 * Both of the CPU's buckets are empty; try to get a full
496 * bucket from the bucket layer. Upon success, refill this
497 * CPU and place any empty bucket into the empty list.
498 */
499 bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
500 if (bkt != NULL) {
501 if (ccp->cc_pfilled != NULL)
502 mcache_bkt_free(cp, &cp->mc_empty,
503 ccp->cc_pfilled);
504 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
505 continue;
506 }
507
508 /*
509 * The bucket layer has no full buckets; allocate the
510 * object(s) directly from the slab layer.
511 */
512 break;
513 }
514 MCACHE_UNLOCK(&ccp->cc_lock);
515
516 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
517
518 /*
519 * If this is a blocking allocation, or if it is non-blocking and
520 * the cache's full bucket list is non-empty, then retry the allocation.
521 */
522 if (need > 0) {
523 if (!(wait & MCR_NONBLOCKING)) {
524 atomic_add_32(&cp->mc_wretry_cnt, 1);
525 goto retry_alloc;
526 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
527 !mcache_bkt_isempty(cp)) {
528 if (!nwretry)
529 nwretry = TRUE;
530 atomic_add_32(&cp->mc_nwretry_cnt, 1);
531 goto retry_alloc;
532 } else if (nwretry) {
533 atomic_add_32(&cp->mc_nwfail_cnt, 1);
534 }
535 }
536
537 if (!(cp->mc_flags & MCF_DEBUG))
538 return (num - need);
539
540 debug_alloc:
541 if (cp->mc_flags & MCF_VERIFY) {
542 mcache_obj_t **o = top;
543 unsigned int n;
544
545 n = 0;
546 /*
547 * Verify that the chain of objects has the same count as
548 * what we are about to report to the caller. Any mismatch
549 * here means that the object list is insanely broken and
550 * therefore we must panic.
551 */
552 while (*o != NULL) {
553 o = &(*o)->obj_next;
554 ++n;
555 }
556 if (n != (num - need)) {
557 panic("mcache_alloc_ext: %s cp %p corrupted list "
558 "(got %d actual %d)\n", cp->mc_name,
559 (void *)cp, num - need, n);
560 }
561 }
562
563 /* Invoke the slab layer audit callback if auditing is enabled */
564 if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL)
565 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
566
567 return (num - need);
568 }
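
/*
 * Illustrative sketch of a batched, non-blocking allocation: up to 32
 * objects are requested in one call and whatever was obtained is freed
 * back in one call.  The objects come back chained through obj_next and
 * the chain is NULL-terminated, so it can be handed straight back to
 * mcache_free_ext().  The example_batch_alloc() name is hypothetical.
 */
static unsigned int
example_batch_alloc(mcache_t *cp)
{
	mcache_obj_t *list, *o;
	unsigned int n, got;

	/* May return fewer than requested (even zero) since MCR_NOSLEEP is set */
	got = mcache_alloc_ext(cp, &list, 32, MCR_NOSLEEP);

	/* Walk the NULL-terminated chain and verify the advertised count */
	for (o = list, n = 0; o != NULL; o = o->obj_next)
		++n;
	ASSERT(n == got);

	/* Return the entire chain to the cache in a single call */
	if (list != NULL)
		mcache_free_ext(cp, list);

	return (got);
}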
569
570 /*
571 * Allocate a single object from a cache.
572 */
573 __private_extern__ void *
574 mcache_alloc(mcache_t *cp, int wait)
575 {
576 mcache_obj_t *buf;
577
578 (void) mcache_alloc_ext(cp, &buf, 1, wait);
579 return (buf);
580 }
581
582 __private_extern__ void
583 mcache_waiter_inc(mcache_t *cp)
584 {
585 atomic_add_32(&cp->mc_waiter_cnt, 1);
586 }
587
588 __private_extern__ void
589 mcache_waiter_dec(mcache_t *cp)
590 {
591 atomic_add_32(&cp->mc_waiter_cnt, -1);
592 }
593
594 __private_extern__ boolean_t
595 mcache_bkt_isempty(mcache_t *cp)
596 {
597 /*
598 * This isn't meant to accurately tell whether there are
599 * any full buckets in the cache; it is simply a way to
600 * obtain "hints" about the state of the cache.
601 */
602 return (cp->mc_full.bl_total == 0);
603 }
604
605 /*
606 * Notify the slab layer about an event.
607 */
608 static void
609 mcache_notify(mcache_t *cp, u_int32_t event)
610 {
611 if (cp->mc_slab_notify != NULL)
612 (*cp->mc_slab_notify)(cp->mc_private, event);
613 }
614
615 /*
616 * Purge the cache and disable its buckets.
617 */
618 static void
619 mcache_purge(void *arg)
620 {
621 mcache_t *cp = arg;
622
623 mcache_bkt_purge(cp);
624 /*
625 * We cannot simply call mcache_cache_bkt_enable() from here as
626 * a bucket resize may be in flight and we would cause the CPU
627 * layers of the cache to point to different sizes. Therefore,
628 * we simply increment the enable count so that during the next
629 * periodic cache update the buckets can be reenabled.
630 */
631 lck_mtx_lock_spin(&cp->mc_sync_lock);
632 cp->mc_enable_cnt++;
633 lck_mtx_unlock(&cp->mc_sync_lock);
634
635 }
636
637 __private_extern__ boolean_t
638 mcache_purge_cache(mcache_t *cp)
639 {
640 /*
641 * Purging a cache that has no per-CPU caches or is already
642 * in the process of being purged is rather pointless.
643 */
644 if (cp->mc_flags & MCF_NOCPUCACHE)
645 return (FALSE);
646
647 lck_mtx_lock_spin(&cp->mc_sync_lock);
648 if (cp->mc_purge_cnt > 0) {
649 lck_mtx_unlock(&cp->mc_sync_lock);
650 return (FALSE);
651 }
652 cp->mc_purge_cnt++;
653 lck_mtx_unlock(&cp->mc_sync_lock);
654
655 mcache_dispatch(mcache_purge, cp);
656
657 return (TRUE);
658 }
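
/*
 * Illustrative sketch of one way a client could react to memory pressure:
 * attempt a targeted purge of its own cache first and, if that cannot be
 * scheduled (no CPU layer, or a purge already in flight), fall back to a
 * global reap of all caches.  The example_pressure_relief() name is
 * hypothetical.
 */
static void
example_pressure_relief(mcache_t *cp)
{
	/* Returns FALSE if the cache has no CPU layer or is already purging */
	if (!mcache_purge_cache(cp))
		mcache_reap();
}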
659
660 /*
661 * Free a single object to a cache.
662 */
663 __private_extern__ void
664 mcache_free(mcache_t *cp, void *buf)
665 {
666 ((mcache_obj_t *)buf)->obj_next = NULL;
667 mcache_free_ext(cp, (mcache_obj_t *)buf);
668 }
669
670 /*
671 * Free one or more objects to a cache.
672 */
673 __private_extern__ void
674 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
675 {
676 mcache_cpu_t *ccp = MCACHE_CPU(cp);
677 mcache_bkttype_t *btp;
678 mcache_obj_t *nlist;
679 mcache_bkt_t *bkt;
680
681 /* Invoke the slab layer audit callback if auditing is enabled */
682 if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL)
683 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
684
685 MCACHE_LOCK(&ccp->cc_lock);
686 for (;;) {
687 /*
688 * If there is space in the current CPU's filled bucket, put
689 * the object there and return once all objects are freed.
690 * Note the cast to unsigned integer takes care of the case
691 * where the bucket layer is disabled (when cc_objs is -1).
692 */
693 if ((unsigned int)ccp->cc_objs <
694 (unsigned int)ccp->cc_bktsize) {
695 /*
696 * Reverse the list while we place the object into the
697 * bucket; this effectively causes the most recently
698 * freed object(s) to be reused during allocation.
699 */
700 nlist = list->obj_next;
701 list->obj_next = (ccp->cc_objs == 0) ? NULL :
702 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
703 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
704 ccp->cc_free++;
705
706 if ((list = nlist) != NULL)
707 continue;
708
709 /* We are done; return to caller */
710 MCACHE_UNLOCK(&ccp->cc_lock);
711
712 /* If there is a waiter below, notify it */
713 if (cp->mc_waiter_cnt > 0)
714 mcache_notify(cp, MCN_RETRYALLOC);
715 return;
716 }
717
718 /*
719 * The CPU's filled bucket is full. If the previous filled
720 * bucket was empty, exchange and try again.
721 */
722 if (ccp->cc_pobjs == 0) {
723 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
724 continue;
725 }
726
727 /*
728 * If the bucket layer is disabled, free to slab. This can
729 * happen either because MCF_NOCPUCACHE is set, or because
730 * the bucket layer is currently being resized.
731 */
732 if (ccp->cc_bktsize == 0)
733 break;
734
735 /*
736 * Both of the CPU's buckets are full; try to get an empty
737 * bucket from the bucket layer. Upon success, empty this
738 * CPU and place any full bucket into the full list.
739 */
740 bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
741 if (bkt != NULL) {
742 if (ccp->cc_pfilled != NULL)
743 mcache_bkt_free(cp, &cp->mc_full,
744 ccp->cc_pfilled);
745 mcache_cpu_refill(ccp, bkt, 0);
746 continue;
747 }
748
749 /*
750 * We need an empty bucket to put our freed objects into
751 * but couldn't get an empty bucket from the bucket layer;
752 * attempt to allocate one. We do not want to block for
753 * allocation here, and if the bucket allocation fails
754 * we will simply fall through to the slab layer.
755 */
756 MCACHE_UNLOCK(&ccp->cc_lock);
757 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
758 MCACHE_LOCK(&ccp->cc_lock);
759
760 if (bkt != NULL) {
761 /*
762 * We have an empty bucket, but since we drop the
763 * CPU lock above, the cache's bucket size may have
764 * changed. If so, free the bucket and try again.
765 */
766 if (ccp->cc_bktsize != btp->bt_bktsize) {
767 MCACHE_UNLOCK(&ccp->cc_lock);
768 mcache_free(btp->bt_cache, bkt);
769 MCACHE_LOCK(&ccp->cc_lock);
770 continue;
771 }
772
773 /*
774 * We have an empty bucket of the right size;
775 * add it to the bucket layer and try again.
776 */
777 mcache_bkt_free(cp, &cp->mc_empty, bkt);
778 continue;
779 }
780
781 /*
782 * The bucket layer has no empty buckets; free the
783 * object(s) directly to the slab layer.
784 */
785 break;
786 }
787 MCACHE_UNLOCK(&ccp->cc_lock);
788
789 /* If there is a waiter below, notify it */
790 if (cp->mc_waiter_cnt > 0)
791 mcache_notify(cp, MCN_RETRYALLOC);
792
793 /* Advise the slab layer to purge the object(s) */
794 (*cp->mc_slab_free)(cp->mc_private, list,
795 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
796 }
797
798 /*
799 * Cache destruction routine.
800 */
801 __private_extern__ void
802 mcache_destroy(mcache_t *cp)
803 {
804 void **pbuf;
805
806 MCACHE_LIST_LOCK();
807 LIST_REMOVE(cp, mc_list);
808 MCACHE_LIST_UNLOCK();
809
810 mcache_bkt_purge(cp);
811
812 /*
813 * This cache is dead; there should be no further transactions.
814 * If it is still used, make sure that any access induces a fault.
815 */
816 cp->mc_slab_alloc = NULL;
817 cp->mc_slab_free = NULL;
818 cp->mc_slab_audit = NULL;
819
820 lck_attr_free(cp->mc_bkt_lock_attr);
821 lck_grp_free(cp->mc_bkt_lock_grp);
822 lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
823
824 lck_attr_free(cp->mc_cpu_lock_attr);
825 lck_grp_free(cp->mc_cpu_lock_grp);
826 lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
827
828 lck_attr_free(cp->mc_sync_lock_attr);
829 lck_grp_free(cp->mc_sync_lock_grp);
830 lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
831
832 /*
833 * TODO: We need to destroy the zone here, but there is currently
834 * no way to do so. Until then, the memory allocated for the
835 * zone structure is leaked.
836 * Once zone destruction becomes possible, uncomment these lines:
837 *
838 * if (cp->mc_slab_zone != NULL) {
839 * zdestroy(cp->mc_slab_zone);
840 * cp->mc_slab_zone = NULL;
841 * }
842 */
843
844 /* Get the original address since we're about to free it */
845 pbuf = (void **)((intptr_t)cp - sizeof (void *));
846
847 zfree(mcache_zone, *pbuf);
848 }
849
850 /*
851 * Internal slab allocator used as a backend for simple caches. The current
852 * implementation uses the zone allocator for simplicity.
853 */
854 static unsigned int
855 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
856 {
857 mcache_t *cp = arg;
858 unsigned int need = num;
859 size_t offset = 0;
860 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
861 u_int32_t flags = cp->mc_flags;
862 void *buf, *base, **pbuf;
863 mcache_obj_t **list = *plist;
864
865 *list = NULL;
866
867 /*
868 * The address of the object returned to the caller is an
869 * offset from the 64-bit aligned base address only if the
870 * cache's alignment requirement is neither 1 nor 8 bytes.
871 */
872 if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
873 offset = cp->mc_align;
874
875 for (;;) {
876 if (!(wait & MCR_NOSLEEP))
877 buf = zalloc(cp->mc_slab_zone);
878 else
879 buf = zalloc_noblock(cp->mc_slab_zone);
880
881 if (buf == NULL)
882 break;
883
884 /* Get the 64-bit aligned base address for this object */
885 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
886 sizeof (u_int64_t));
887
888 /*
889 * Wind back a pointer size from the aligned base and
890 * save the original address so we can free it later.
891 */
892 pbuf = (void **)((intptr_t)base - sizeof (void *));
893 *pbuf = buf;
894
895 /*
896 * If auditing is enabled, patternize the contents of
897 * the buffer starting from the 64-bit aligned base to
898 * the end of the buffer; the length is rounded up to
899 * the nearest 64-bit multiple; this is because we use
900 * 64-bit memory accesses to set/check the pattern.
901 */
902 if (flags & MCF_AUDIT) {
903 VERIFY(((intptr_t)base + rsize) <=
904 ((intptr_t)buf + cp->mc_chunksize));
905 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
906 }
907
908 /*
909 * Fix up the object's address to fulfill the cache's
910 * alignment requirement (if needed) and return this
911 * to the caller.
912 */
913 VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
914 ((intptr_t)buf + cp->mc_chunksize));
915 *list = (mcache_obj_t *)((intptr_t)base + offset);
916
917 (*list)->obj_next = NULL;
918 list = *plist = &(*list)->obj_next;
919
920 /* If we got them all, return to mcache */
921 if (--need == 0)
922 break;
923 }
924
925 return (num - need);
926 }
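
/*
 * Illustrative sketch of the align-and-stash idiom used by
 * mcache_slab_alloc() above and undone by mcache_slab_free() below: the
 * raw buffer is rounded up to an aligned base with a pointer-sized slot
 * reserved just below it, and that slot records the original address so
 * the free path can recover it.  The example_* helpers are hypothetical;
 * the round-up arithmetic mirrors P2ROUNDUP() for power-of-2 alignments.
 */
static void *
example_align_obj(void *raw, size_t align)
{
	uintptr_t base;
	void **pbuf;

	/* Skip past the stash slot, then round up to the requested boundary */
	base = ((uintptr_t)raw + sizeof (void *) + (align - 1)) &
	    ~((uintptr_t)align - 1);

	/* Stash the original address one pointer below the aligned base */
	pbuf = (void **)(base - sizeof (void *));
	*pbuf = raw;

	return ((void *)base);
}

static void *
example_orig_addr(void *base)
{
	/* Recover the original (unaligned) address saved above */
	return (*(void **)((uintptr_t)base - sizeof (void *)));
}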
927
928 /*
929 * Internal slab deallocator used as a backend for simple caches.
930 */
931 static void
932 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
933 {
934 mcache_t *cp = arg;
935 mcache_obj_t *nlist;
936 size_t offset = 0;
937 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
938 u_int32_t flags = cp->mc_flags;
939 void *base;
940 void **pbuf;
941
942 /*
943 * The address of the object is an offset from a 64-bit
944 * aligned base address only if the cache's alignment
945 * requirement is neither 1 nor 8 bytes.
946 */
947 if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
948 offset = cp->mc_align;
949
950 for (;;) {
951 nlist = list->obj_next;
952 list->obj_next = NULL;
953
954 /* Get the 64-bit aligned base address of this object */
955 base = (void *)((intptr_t)list - offset);
956 VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
957
958 /* Get the original address since we're about to free it */
959 pbuf = (void **)((intptr_t)base - sizeof (void *));
960
961 if (flags & MCF_AUDIT) {
962 VERIFY(((intptr_t)base + rsize) <=
963 ((intptr_t)*pbuf + cp->mc_chunksize));
964 mcache_audit_free_verify(NULL, base, offset, rsize);
965 }
966
967 /* Free it to zone */
968 VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
969 ((intptr_t)*pbuf + cp->mc_chunksize));
970 zfree(cp->mc_slab_zone, *pbuf);
971
972 /* No more objects to free; return to mcache */
973 if ((list = nlist) == NULL)
974 break;
975 }
976 }
977
978 /*
979 * Internal slab auditor for simple caches.
980 */
981 static void
982 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
983 {
984 mcache_t *cp = arg;
985 size_t offset = 0;
986 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
987 void *base, **pbuf;
988
989 /*
990 * The address of the object returned to the caller is an
991 * offset from the 64-bit aligned base address only if the
992 * cache's alignment requirement is neither 1 nor 8 bytes.
993 */
994 if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
995 offset = cp->mc_align;
996
997 while (list != NULL) {
998 mcache_obj_t *next = list->obj_next;
999
1000 /* Get the 64-bit aligned base address of this object */
1001 base = (void *)((intptr_t)list - offset);
1002 VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1003
1004 /* Get the original address */
1005 pbuf = (void **)((intptr_t)base - sizeof (void *));
1006
1007 VERIFY(((intptr_t)base + rsize) <=
1008 ((intptr_t)*pbuf + cp->mc_chunksize));
1009
1010 if (!alloc)
1011 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1012 else
1013 mcache_audit_free_verify_set(NULL, base, offset, rsize);
1014
1015 list = list->obj_next = next;
1016 }
1017 }
1018
1019 /*
1020 * Refill the CPU's filled bucket with bkt and save the previous one.
1021 */
1022 static void
1023 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1024 {
1025 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1026 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1027 ASSERT(ccp->cc_bktsize > 0);
1028
1029 ccp->cc_pfilled = ccp->cc_filled;
1030 ccp->cc_pobjs = ccp->cc_objs;
1031 ccp->cc_filled = bkt;
1032 ccp->cc_objs = objs;
1033 }
1034
1035 /*
1036 * Allocate a bucket from the bucket layer.
1037 */
1038 static mcache_bkt_t *
1039 mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
1040 {
1041 mcache_bkt_t *bkt;
1042
1043 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1044 /*
1045 * The bucket layer lock is held by another CPU; increase
1046 * the contention count so that we can later adjust the
1047 * bucket size accordingly.
1048 */
1049 MCACHE_LOCK(&cp->mc_bkt_lock);
1050 cp->mc_bkt_contention++;
1051 }
1052
1053 if ((bkt = blp->bl_list) != NULL) {
1054 blp->bl_list = bkt->bkt_next;
1055 if (--blp->bl_total < blp->bl_min)
1056 blp->bl_min = blp->bl_total;
1057 blp->bl_alloc++;
1058 }
1059
1060 if (btp != NULL)
1061 *btp = cp->cache_bkttype;
1062
1063 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1064
1065 return (bkt);
1066 }
1067
1068 /*
1069 * Free a bucket to the bucket layer.
1070 */
1071 static void
1072 mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1073 {
1074 MCACHE_LOCK(&cp->mc_bkt_lock);
1075
1076 bkt->bkt_next = blp->bl_list;
1077 blp->bl_list = bkt;
1078 blp->bl_total++;
1079
1080 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1081 }
1082
1083 /*
1084 * Enable the bucket layer of a cache.
1085 */
1086 static void
1087 mcache_cache_bkt_enable(mcache_t *cp)
1088 {
1089 mcache_cpu_t *ccp;
1090 int cpu;
1091
1092 if (cp->mc_flags & MCF_NOCPUCACHE)
1093 return;
1094
1095 for (cpu = 0; cpu < ncpu; cpu++) {
1096 ccp = &cp->mc_cpu[cpu];
1097 MCACHE_LOCK(&ccp->cc_lock);
1098 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1099 MCACHE_UNLOCK(&ccp->cc_lock);
1100 }
1101 }
1102
1103 /*
1104 * Purge all buckets from a cache and disable its bucket layer.
1105 */
1106 static void
1107 mcache_bkt_purge(mcache_t *cp)
1108 {
1109 mcache_cpu_t *ccp;
1110 mcache_bkt_t *bp, *pbp;
1111 mcache_bkttype_t *btp;
1112 int cpu, objs, pobjs;
1113
1114 for (cpu = 0; cpu < ncpu; cpu++) {
1115 ccp = &cp->mc_cpu[cpu];
1116
1117 MCACHE_LOCK(&ccp->cc_lock);
1118
1119 btp = cp->cache_bkttype;
1120 bp = ccp->cc_filled;
1121 pbp = ccp->cc_pfilled;
1122 objs = ccp->cc_objs;
1123 pobjs = ccp->cc_pobjs;
1124 ccp->cc_filled = NULL;
1125 ccp->cc_pfilled = NULL;
1126 ccp->cc_objs = -1;
1127 ccp->cc_pobjs = -1;
1128 ccp->cc_bktsize = 0;
1129
1130 MCACHE_UNLOCK(&ccp->cc_lock);
1131
1132 if (bp != NULL)
1133 mcache_bkt_destroy(cp, btp, bp, objs);
1134 if (pbp != NULL)
1135 mcache_bkt_destroy(cp, btp, pbp, pobjs);
1136 }
1137
1138 /*
1139 * Updating the working set back to back essentially sets
1140 * the working set size to zero, so everything is reapable.
1141 */
1142 mcache_bkt_ws_update(cp);
1143 mcache_bkt_ws_update(cp);
1144
1145 mcache_bkt_ws_reap(cp);
1146 }
1147
1148 /*
1149 * Free one or more objects in the bucket to the slab layer,
1150 * and also free the bucket itself.
1151 */
1152 static void
1153 mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
1154 int nobjs)
1155 {
1156 if (nobjs > 0) {
1157 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1158
1159 if (cp->mc_flags & MCF_VERIFY) {
1160 mcache_obj_t *o = top;
1161 int cnt = 0;
1162
1163 /*
1164 * Verify that the chain of objects in the bucket is
1165 * valid. Any mismatch here means a mistake when the
1166 * object(s) were freed to the CPU layer, so we panic.
1167 */
1168 while (o != NULL) {
1169 o = o->obj_next;
1170 ++cnt;
1171 }
1172 if (cnt != nobjs) {
1173 panic("mcache_bkt_destroy: %s cp %p corrupted "
1174 "list in bkt %p (nobjs %d actual %d)\n",
1175 cp->mc_name, (void *)cp, (void *)bkt,
1176 nobjs, cnt);
1177 }
1178 }
1179
1180 /* Advise the slab layer to purge the object(s) */
1181 (*cp->mc_slab_free)(cp->mc_private, top,
1182 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1183 }
1184 mcache_free(btp->bt_cache, bkt);
1185 }
1186
1187 /*
1188 * Update the bucket layer working set statistics.
1189 */
1190 static void
1191 mcache_bkt_ws_update(mcache_t *cp)
1192 {
1193 MCACHE_LOCK(&cp->mc_bkt_lock);
1194
1195 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1196 cp->mc_full.bl_min = cp->mc_full.bl_total;
1197 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1198 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1199
1200 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1201 }
1202
1203 /*
1204 * Reap all buckets that are beyond the working set.
1205 */
1206 static void
1207 mcache_bkt_ws_reap(mcache_t *cp)
1208 {
1209 long reap;
1210 mcache_bkt_t *bkt;
1211 mcache_bkttype_t *btp;
1212
1213 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1214 while (reap-- &&
1215 (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
1216 mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
1217
1218 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1219 while (reap-- &&
1220 (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
1221 mcache_bkt_destroy(cp, btp, bkt, 0);
1222 }
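
/*
 * Illustrative sketch of how the working-set fields used above evolve:
 * bl_min tracks the lowest bucket count observed since the last update;
 * mcache_bkt_ws_update() rolls that floor into bl_reaplimit and restarts
 * the observation at bl_total, and the reap pass frees at most
 * MIN(bl_reaplimit, bl_min) buckets, i.e. only buckets left unused across
 * two consecutive intervals.  Two back-to-back updates thus make every
 * bucket reapable, which is what mcache_bkt_purge() relies on.  The
 * example_ws_t type is a hypothetical stand-in for the relevant fields
 * of mcache_bktlist_t.
 */
typedef struct example_ws {
	int	ws_total;	/* buckets currently on the list */
	int	ws_min;		/* low-water mark since the last update */
	int	ws_reaplimit;	/* low-water mark of the previous interval */
} example_ws_t;

static void
example_ws_update(example_ws_t *ws)
{
	/* The same roll-over mcache_bkt_ws_update() performs per bucket list */
	ws->ws_reaplimit = ws->ws_min;
	ws->ws_min = ws->ws_total;
}

static int
example_ws_reapable(example_ws_t *ws)
{
	/* Only buckets idle across both intervals may be reaped */
	return (MIN(ws->ws_reaplimit, ws->ws_min));
}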
1223
1224 static void
1225 mcache_reap_timeout(void *arg)
1226 {
1227 volatile UInt32 *flag = arg;
1228
1229 ASSERT(flag == &mcache_reaping);
1230
1231 *flag = 0;
1232 }
1233
1234 static void
1235 mcache_reap_done(void *flag)
1236 {
1237 timeout(mcache_reap_timeout, flag, mcache_reap_interval);
1238 }
1239
1240 static void
1241 mcache_reap_start(void *arg)
1242 {
1243 UInt32 *flag = arg;
1244
1245 ASSERT(flag == &mcache_reaping);
1246
1247 mcache_applyall(mcache_cache_reap);
1248 mcache_dispatch(mcache_reap_done, flag);
1249 }
1250
1251 __private_extern__ void
1252 mcache_reap(void)
1253 {
1254 UInt32 *flag = &mcache_reaping;
1255
1256 if (mcache_llock_owner == current_thread() ||
1257 !OSCompareAndSwap(0, 1, flag))
1258 return;
1259
1260 mcache_dispatch(mcache_reap_start, flag);
1261 }
1262
1263 static void
1264 mcache_cache_reap(mcache_t *cp)
1265 {
1266 mcache_bkt_ws_reap(cp);
1267 }
1268
1269 /*
1270 * Performs periodic maintenance on a cache.
1271 */
1272 static void
1273 mcache_cache_update(mcache_t *cp)
1274 {
1275 int need_bkt_resize = 0;
1276 int need_bkt_reenable = 0;
1277
1278 lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1279
1280 mcache_bkt_ws_update(cp);
1281
1282 /*
1283 * Cache resize and post-purge reenable are mutually exclusive.
1284 * If the cache was previously purged, there is no point in
1285 * increasing the bucket size as there was an indication of
1286 * memory pressure on the system.
1287 */
1288 lck_mtx_lock_spin(&cp->mc_sync_lock);
1289 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
1290 need_bkt_reenable = 1;
1291 lck_mtx_unlock(&cp->mc_sync_lock);
1292
1293 MCACHE_LOCK(&cp->mc_bkt_lock);
1294 /*
1295 * If the contention count is greater than the threshold, and if
1296 * we are not already at the maximum bucket size, increase it.
1297 * Otherwise, if this cache was previously purged by the user
1298 * then we simply reenable it.
1299 */
1300 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1301 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1302 mcache_bkt_contention && !need_bkt_reenable)
1303 need_bkt_resize = 1;
1304
1305 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1306 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1307
1308 if (need_bkt_resize)
1309 mcache_dispatch(mcache_cache_bkt_resize, cp);
1310 else if (need_bkt_reenable)
1311 mcache_dispatch(mcache_cache_enable, cp);
1312 }
1313
1314 /*
1315 * Recompute a cache's bucket size. This is an expensive operation
1316 * and should not be done frequently; larger buckets provide a higher
1317 * transfer rate with the bucket layer, while smaller buckets reduce
1318 * memory consumption.
1319 */
1320 static void
1321 mcache_cache_bkt_resize(void *arg)
1322 {
1323 mcache_t *cp = arg;
1324 mcache_bkttype_t *btp = cp->cache_bkttype;
1325
1326 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1327 mcache_bkt_purge(cp);
1328
1329 /*
1330 * Upgrade to the next bucket type with larger bucket size;
1331 * temporarily set the previous contention snapshot to a
1332 * negative number to prevent an unnecessary resize request.
1333 */
1334 MCACHE_LOCK(&cp->mc_bkt_lock);
1335 cp->cache_bkttype = ++btp;
1336 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1337 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1338
1339 mcache_cache_enable(cp);
1340 }
1341 }
1342
1343 /*
1344 * Reenable a cache that was previously disabled due to a purge.
1345 */
1346 static void
1347 mcache_cache_enable(void *arg)
1348 {
1349 mcache_t *cp = arg;
1350
1351 lck_mtx_lock_spin(&cp->mc_sync_lock);
1352 cp->mc_purge_cnt = 0;
1353 cp->mc_enable_cnt = 0;
1354 lck_mtx_unlock(&cp->mc_sync_lock);
1355
1356 mcache_cache_bkt_enable(cp);
1357 }
1358
1359 static void
1360 mcache_update_timeout(__unused void *arg)
1361 {
1362 timeout(mcache_update, NULL, mcache_reap_interval);
1363 }
1364
1365 static void
1366 mcache_update(__unused void *arg)
1367 {
1368 mcache_applyall(mcache_cache_update);
1369 mcache_dispatch(mcache_update_timeout, NULL);
1370 }
1371
1372 static void
1373 mcache_applyall(void (*func)(mcache_t *))
1374 {
1375 mcache_t *cp;
1376
1377 MCACHE_LIST_LOCK();
1378 LIST_FOREACH(cp, &mcache_head, mc_list) {
1379 func(cp);
1380 }
1381 MCACHE_LIST_UNLOCK();
1382 }
1383
1384 static void
1385 mcache_dispatch(void (*func)(void *), void *arg)
1386 {
1387 ASSERT(func != NULL);
1388 timeout(func, arg, hz/1000);
1389 }
1390
1391 __private_extern__ void
1392 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp)
1393 {
1394 mca->mca_addr = addr;
1395 mca->mca_cache = cp;
1396 mca->mca_pthread = mca->mca_thread;
1397 mca->mca_thread = current_thread();
1398 bcopy(mca->mca_stack, mca->mca_pstack, sizeof (mca->mca_pstack));
1399 mca->mca_pdepth = mca->mca_depth;
1400 bzero(mca->mca_stack, sizeof (mca->mca_stack));
1401 mca->mca_depth = OSBacktrace(mca->mca_stack, MCACHE_STACK_DEPTH);
1402 }
1403
1404 __private_extern__ void
1405 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1406 {
1407 u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size);
1408 u_int64_t *buf = (u_int64_t *)buf_arg;
1409
1410 VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1411 VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1412
1413 while (buf < buf_end)
1414 *buf++ = pattern;
1415 }
1416
1417 __private_extern__ void *
1418 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1419 {
1420 u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size);
1421 u_int64_t *buf;
1422
1423 VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1424 VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1425
1426 for (buf = buf_arg; buf < buf_end; buf++) {
1427 if (*buf != pattern)
1428 return (buf);
1429 }
1430 return (NULL);
1431 }
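
/*
 * Illustrative sketch of how the two pattern helpers above cooperate:
 * a buffer is filled with MCACHE_FREE_PATTERN when it goes idle, and
 * before reuse the fill is verified, reporting the first modified 64-bit
 * word via mcache_audit_panic() on damage.  Both the base address and
 * the length must be 64-bit aligned.  The example_* names are
 * hypothetical.
 */
static void
example_audit_idle(void *base, size_t rsize)
{
	/* Mark the idle buffer so that any use-after-free shows up as damage */
	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
}

static void
example_audit_reuse(void *base, size_t rsize)
{
	u_int64_t *oaddr64;

	/* Returns NULL if untouched, else the first modified 64-bit word */
	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    base, rsize)) != NULL) {
		mcache_audit_panic(NULL, base,
		    (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
}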
1432
1433 __private_extern__ void *
1434 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1435 size_t size)
1436 {
1437 u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size);
1438 u_int64_t *buf;
1439
1440 VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1441 VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1442
1443 for (buf = buf_arg; buf < buf_end; buf++) {
1444 if (*buf != old) {
1445 mcache_set_pattern(old, buf_arg,
1446 (uintptr_t)buf - (uintptr_t)buf_arg);
1447 return (buf);
1448 }
1449 *buf = new;
1450 }
1451 return (NULL);
1452 }
1453
1454 __private_extern__ void
1455 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1456 size_t size)
1457 {
1458 void *addr;
1459 u_int64_t *oaddr64;
1460 mcache_obj_t *next;
1461
1462 addr = (void *)((uintptr_t)base + offset);
1463 next = ((mcache_obj_t *)addr)->obj_next;
1464
1465 /* For the "obj_next" pointer in the buffer */
1466 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1467 *oaddr64 = MCACHE_FREE_PATTERN;
1468
1469 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1470 (caddr_t)base, size)) != NULL) {
1471 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1472 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1473 /* NOTREACHED */
1474 }
1475 ((mcache_obj_t *)addr)->obj_next = next;
1476 }
1477
1478 __private_extern__ void
1479 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1480 size_t size)
1481 {
1482 void *addr;
1483 u_int64_t *oaddr64;
1484 mcache_obj_t *next;
1485
1486 addr = (void *)((uintptr_t)base + offset);
1487 next = ((mcache_obj_t *)addr)->obj_next;
1488
1489 /* For the "obj_next" pointer in the buffer */
1490 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1491 *oaddr64 = MCACHE_FREE_PATTERN;
1492
1493 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1494 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1495 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1496 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1497 /* NOTREACHED */
1498 }
1499 ((mcache_obj_t *)addr)->obj_next = next;
1500 }
1501
1502 #undef panic
1503
1504 __private_extern__ char *
1505 mcache_dump_mca(mcache_audit_t *mca)
1506 {
1507 if (mca_dump_buf == NULL)
1508 return (NULL);
1509
1510 snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1511 "mca %p: addr %p, cache %p (%s)\n"
1512 "last transaction; thread %p, saved PC stack (%d deep):\n"
1513 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1514 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1515 "previous transaction; thread %p, saved PC stack (%d deep):\n"
1516 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1517 "\t%p, %p, %p, %p, %p, %p, %p, %p\n",
1518 mca, mca->mca_addr, mca->mca_cache,
1519 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1520 mca->mca_thread, mca->mca_depth,
1521 mca->mca_stack[0], mca->mca_stack[1], mca->mca_stack[2],
1522 mca->mca_stack[3], mca->mca_stack[4], mca->mca_stack[5],
1523 mca->mca_stack[6], mca->mca_stack[7], mca->mca_stack[8],
1524 mca->mca_stack[9], mca->mca_stack[10], mca->mca_stack[11],
1525 mca->mca_stack[12], mca->mca_stack[13], mca->mca_stack[14],
1526 mca->mca_stack[15],
1527 mca->mca_pthread, mca->mca_pdepth,
1528 mca->mca_pstack[0], mca->mca_pstack[1], mca->mca_pstack[2],
1529 mca->mca_pstack[3], mca->mca_pstack[4], mca->mca_pstack[5],
1530 mca->mca_pstack[6], mca->mca_pstack[7], mca->mca_pstack[8],
1531 mca->mca_pstack[9], mca->mca_pstack[10], mca->mca_pstack[11],
1532 mca->mca_pstack[12], mca->mca_pstack[13], mca->mca_pstack[14],
1533 mca->mca_pstack[15]);
1534
1535 return (mca_dump_buf);
1536 }
1537
1538 __private_extern__ void
1539 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1540 int64_t expected, int64_t got)
1541 {
1542 if (mca == NULL) {
1543 panic("mcache_audit: buffer %p modified after free at "
1544 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1545 offset, got, expected);
1546 /* NOTREACHED */
1547 }
1548
1549 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1550 "(0x%llx instead of 0x%llx)\n%s\n",
1551 addr, offset, got, expected, mcache_dump_mca(mca));
1552 /* NOTREACHED */
1553 }
1554
1555 __private_extern__ int
1556 assfail(const char *a, const char *f, int l)
1557 {
1558 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1559 return (0);
1560 }