/*
 * Copyright (c) 2006-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Memory allocator with per-CPU caching, derived from the kmem magazine
 * concept and implementation as described in the following paper:
 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
 * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
 * reserved.  Use is subject to license terms.
 *
 * There are several major differences between this and the original kmem
 * magazine: this derivative implementation allows for multiple objects to
 * be allocated and freed from/to the object cache in one call; in addition,
 * it is more flexible in that the user may define a custom slab allocator
 * (instead of the default zone allocator).  Finally, no object
 * construction/destruction takes place at the moment, although this could
 * be added in the future to improve efficiency.
 */
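/*
 * Illustrative usage sketch (not part of this file; the client cache
 * and object type below are hypothetical):
 *
 *	static mcache_t *foo_cache;
 *
 *	foo_cache = mcache_create("foo", sizeof (struct foo),
 *	    0, 0, MCR_SLEEP);		// zone-backed, default alignment
 *	struct foo *fp = mcache_alloc(foo_cache, MCR_SLEEP);
 *	...
 *	mcache_free(foo_cache, fp);
 *	mcache_destroy(foo_cache);
 */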
#include <sys/param.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/systm.h>

#include <kern/debug.h>
#include <kern/zalloc.h>
#include <kern/cpu_number.h>
#include <kern/locks.h>

#include <libkern/libkern.h>
#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>

#include <mach/vm_param.h>
#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <sys/mcache.h>
#define	MCACHE_SIZE(n) \
	((size_t)(&((mcache_t *)0)->mc_cpu[n]))

/* Allocate extra in case we need to manually align the pointer */
#define	MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

#define	MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
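/*
 * Worked example (illustrative): since mc_cpu[] is the last member of
 * mcache_t, MCACHE_SIZE(n) is effectively offsetof(mcache_t, mc_cpu[n]),
 * so MCACHE_CPU(c) yields the per-CPU structure of the CPU the caller is
 * currently running on:
 *
 *	mcache_cpu_t *ccp = MCACHE_CPU(cp);	// &cp->mc_cpu[cpu_number()]
 */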
/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define	MCACHE_LIST_LOCK() {				\
	lck_mtx_lock(mcache_llock);			\
	mcache_llock_owner = current_thread();		\
}

#define	MCACHE_LIST_UNLOCK() {				\
	mcache_llock_owner = NULL;			\
	lck_mtx_unlock(mcache_llock);			\
}

#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)
static int ncpu;
static unsigned int cache_line_size;
static lck_mtx_t *mcache_llock;
static struct thread *mcache_llock_owner;
static lck_attr_t *mcache_llock_attr;
static lck_grp_t *mcache_llock_grp;
static lck_grp_attr_t *mcache_llock_grp_attr;
static struct zone *mcache_zone;
static unsigned int mcache_reap_interval;
static UInt32 mcache_reaping;
static int mcache_ready;
static int mcache_updating;

static int mcache_bkt_contention = 3;

#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif
#define	DUMP_MCA_BUF_SIZE	512
static char *mca_dump_buf;
/* columns: bt_bktsize, bt_minbuf, bt_maxbuf, bt_cache */
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1,	4096,	32768,	NULL },
	{ 3,	2048,	16384,	NULL },
	{ 7,	1024,	12288,	NULL },
	{ 15,	256,	8192,	NULL },
	{ 31,	64,	4096,	NULL },
	{ 47,	0,	2048,	NULL },
	{ 63,	0,	1024,	NULL },
	{ 95,	0,	512,	NULL },
	{ 143,	0,	256,	NULL },
	{ 165,	0,	0,	NULL },	/* assumed terminator row */
};
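/*
 * Illustrative example of how the table is consulted (see the lookup
 * loop in mcache_create_common below): the loop skips every row whose
 * bt_minbuf is still >= the cache's chunk size.  For a hypothetical
 * 256-byte chunk it passes the 4096/2048/1024/256 rows and stops at
 * { 31, 64, 4096 }, i.e. each bucket caches up to 31 such objects.
 */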
static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
    mcache_notifyfn_t, void *, u_int32_t, int, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkttype_t **);
static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
    mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(void *);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(void *);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);

static LIST_HEAD(, mcache) mcache_head;
mcache_t *mcache_audit_cache;
/*
 * Initialize the framework; this is currently called as part of BSD init.
 */
__private_extern__ void
mcache_init(void)
{
	mcache_bkttype_t *btp;
	unsigned int i;
	char name[32];

	ncpu = ml_get_max_cpus();
	(void) mcache_cache_line_size();	/* prime it */

	mcache_llock_grp_attr = lck_grp_attr_alloc_init();
	mcache_llock_grp = lck_grp_alloc_init("mcache.list",
	    mcache_llock_grp_attr);
	mcache_llock_attr = lck_attr_alloc_init();
	mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);

	mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
	    PAGE_SIZE, "mcache");
	if (mcache_zone == NULL)
		panic("mcache_init: failed to allocate mcache zone\n");
	zone_change(mcache_zone, Z_CALLERACCT, FALSE);

	LIST_INIT(&mcache_head);

	for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
		btp = &mcache_bkttype[i];
		(void) snprintf(name, sizeof (name), "bkt_%d",
		    btp->bt_bktsize);
		btp->bt_cache = mcache_create(name,
		    (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
	}

	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof (mcache_flags));
	mcache_flags &= MCF_FLAGS_MASK;

	mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
	    0, 0, MCR_SLEEP);

	mcache_reap_interval = 15 * hz;
	mcache_applyall(mcache_cache_bkt_enable);
	mcache_ready = 1;

	printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
	    ncpu, CPU_CACHE_LINE_SIZE);
}
/*
 * Return the global mcache flags.
 */
__private_extern__ unsigned int
mcache_getflags(void)
{
	return (mcache_flags);
}
/*
 * Return the CPU cache line size.
 */
__private_extern__ unsigned int
mcache_cache_line_size(void)
{
	if (cache_line_size == 0) {
		ml_cpu_info_t cpu_info;
		ml_cpu_get_info(&cpu_info);
		cache_line_size = cpu_info.cache_line_size;
	}
	return (cache_line_size);
}
/*
 * Create a cache using the zone allocator as the backend slab allocator.
 * The caller may specify any alignment for the object; if it specifies 0
 * the default alignment (MCACHE_ALIGN) will be used.
 */
__private_extern__ mcache_t *
mcache_create(const char *name, size_t bufsize, size_t align,
    u_int32_t flags, int wait)
{
	return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
	    mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
	    wait));
}
/*
 * Create a cache using a custom backend slab allocator.  Since the caller
 * is responsible for allocation, no alignment guarantee will be provided
 * by this framework.
 */
__private_extern__ mcache_t *
mcache_create_ext(const char *name, size_t bufsize,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int wait)
{
	return (mcache_create_common(name, bufsize, 0, allocfn,
	    freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
}
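/*
 * Minimal sketch of a custom backend (hypothetical callbacks, not part
 * of this file).  The allocator must return the number of objects it
 * actually produced, chaining them through obj_next:
 *
 *	static unsigned int
 *	foo_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
 *	    int wait)
 *	{
 *		// allocate up to num objects, appending each via *plist,
 *		// and return the count actually allocated
 *	}
 *
 *	cache = mcache_create_ext("foo", sizeof (struct foo),
 *	    foo_slab_alloc, foo_slab_free, NULL, NULL, NULL, arg,
 *	    0, MCR_SLEEP);
 */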
/*
 * Common cache creation routine.
 */
static mcache_t *
mcache_create_common(const char *name, size_t bufsize, size_t align,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int need_zone, int wait)
{
	mcache_bkttype_t *btp;
	mcache_t *cp = NULL;
	size_t chunksize;
	void *buf, **pbuf;
	int c;
	char lck_name[64];

	/* If auditing is on and print buffer is NULL, allocate it now */
	if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
		int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
		MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
		    malloc_wait | M_ZERO);
		if (mca_dump_buf == NULL)
			return (NULL);
	}

	if (!(wait & MCR_NOSLEEP))
		buf = zalloc(mcache_zone);
	else
		buf = zalloc_noblock(mcache_zone);

	if (buf == NULL)
		goto fail;

	bzero(buf, MCACHE_ALLOC_SIZE);

	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly.  This is needed in order to get the rest of
	 * structure members aligned properly.  It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	cp = (mcache_t *)
	    P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_LINE_SIZE);
	pbuf = (void **)((intptr_t)cp - sizeof (void *));
	*pbuf = buf;

	/*
	 * Guaranteed alignment is valid only when we use the internal
	 * slab allocator (currently set to use the zone allocator).
	 */
	if (!need_zone)
		align = 1;
	else if (align == 0)
		align = MCACHE_ALIGN;

	if ((align & (align - 1)) != 0)
		panic("mcache_create: bad alignment %lu", align);

	cp->mc_align = align;
	cp->mc_slab_alloc = allocfn;
	cp->mc_slab_free = freefn;
	cp->mc_slab_audit = auditfn;
	cp->mc_slab_log = logfn;
	cp->mc_slab_notify = notifyfn;
	cp->mc_private = need_zone ? cp : arg;
	cp->mc_bufsize = bufsize;
	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;

	(void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);

	(void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
	cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_cpu_lock_grp_attr);
	cp->mc_cpu_lock_attr = lck_attr_alloc_init();

	/*
	 * Allocation chunk size is the object's size plus any extra size
	 * needed to satisfy the object's alignment.  It is enforced to be
	 * at least the size of an LP64 pointer to simplify auditing and to
	 * handle multiple-element allocation requests, where the elements
	 * returned are linked together in a list.
	 */
	chunksize = MAX(bufsize, sizeof (u_int64_t));
	if (need_zone) {
		/* Enforce 64-bit minimum alignment for zone-based buffers */
		align = MAX(align, sizeof (u_int64_t));
		chunksize += sizeof (void *) + align;
		chunksize = P2ROUNDUP(chunksize, align);
		if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
		    PAGE_SIZE, cp->mc_name)) == NULL)
			goto fail;
		zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
	}
	cp->mc_chunksize = chunksize;
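	/*
	 * Worked example (illustrative): for a hypothetical zone-based
	 * cache with bufsize 40 and align 16, chunksize starts at 40,
	 * grows by sizeof (void *) + 16 to leave room for the saved base
	 * pointer plus alignment slack, and is then rounded up to the
	 * next multiple of 16; i.e. 40 + 8 + 16 = 64 on LP64, which is
	 * already 16-aligned.
	 */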
	/*
	 * Initialize the bucket layer.
	 */
	(void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
	cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_bkt_lock_grp_attr);
	cp->mc_bkt_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
	    cp->mc_bkt_lock_attr);

	(void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
	cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_sync_lock_grp_attr);
	cp->mc_sync_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
	    cp->mc_sync_lock_attr);

	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
		continue;

	cp->cache_bkttype = btp;

	/*
	 * Initialize the CPU layer.  Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	for (c = 0; c < ncpu; c++) {
		mcache_cpu_t *ccp = &cp->mc_cpu[c];

		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
		    cp->mc_cpu_lock_attr);
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
	}

	if (mcache_ready)
		mcache_cache_bkt_enable(cp);

	/* TODO: dynamically create sysctl for stats */

	MCACHE_LIST_LOCK();
	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	MCACHE_LIST_UNLOCK();

	/*
	 * If cache buckets are enabled and this is the first cache
	 * created, start the periodic cache update.
	 */
	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
		mcache_updating = 1;
		mcache_update_timeout(NULL);
	}
	if (cp->mc_flags & MCF_DEBUG) {
		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	}
	return (cp);

fail:
	if (buf != NULL)
		zfree(mcache_zone, buf);
	return (NULL);
}
/*
 * Allocate one or more objects from a cache.
 */
__private_extern__ unsigned int
mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
{
	mcache_cpu_t *ccp;
	mcache_obj_t **top = &(*list);
	mcache_bkt_t *bkt;
	unsigned int need = num;
	boolean_t nwretry = FALSE;

	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));

	ASSERT(list != NULL);
	*list = NULL;

	if (num == 0)
		return (0);

retry_alloc:
	/* We may not always be running in the same CPU in case of retries */
	ccp = MCACHE_CPU(cp);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If we have an object in the current CPU's filled bucket,
		 * chain the object to any previous objects and return if
		 * we've satisfied the number of requested objects.
		 */
		if (ccp->cc_objs > 0) {
			mcache_obj_t *tail;
			int objs;

			/*
			 * Objects in the bucket are already linked together
			 * with the most recently freed object at the head of
			 * the list; grab as many objects as we can.
			 */
			objs = MIN((unsigned int)ccp->cc_objs, need);
			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_objs -= objs;
			ccp->cc_alloc += objs;

			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
			list = &tail->obj_next;
			*list = NULL;

			/* If we got them all, return to caller */
			if ((need -= objs) == 0) {
				MCACHE_UNLOCK(&ccp->cc_lock);

				if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
				    cp->mc_slab_log != NULL)
					(*cp->mc_slab_log)(num, *top, TRUE);

				if (cp->mc_flags & MCF_DEBUG)
					goto debug_alloc;

				return (num);
			}
		}

		/*
		 * The CPU's filled bucket is empty.  If the previous filled
		 * bucket was full, exchange and try again.
		 */
		if (ccp->cc_pobjs > 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, allocate from slab.  This
		 * can happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0)
			break;

		/*
		 * Both of the CPU's buckets are empty; try to get a full
		 * bucket from the bucket layer.  Upon success, refill this
		 * CPU and place any empty bucket into the empty list.
		 */
		bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
		if (bkt != NULL) {
			if (ccp->cc_pfilled != NULL)
				mcache_bkt_free(cp, &cp->mc_empty,
				    ccp->cc_pfilled);
			mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
			continue;
		}

		/*
		 * The bucket layer has no full buckets; allocate the
		 * object(s) directly from the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);

	/*
	 * If this is a blocking allocation, or if it is non-blocking and
	 * the cache's full bucket is non-empty, then retry the allocation.
	 */
	if (need > 0) {
		if (!(wait & MCR_NONBLOCKING)) {
			atomic_add_32(&cp->mc_wretry_cnt, 1);
			goto retry_alloc;
		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
		    !mcache_bkt_isempty(cp)) {
			if (!nwretry)
				nwretry = TRUE;
			atomic_add_32(&cp->mc_nwretry_cnt, 1);
			goto retry_alloc;
		} else if (nwretry) {
			atomic_add_32(&cp->mc_nwfail_cnt, 1);
		}
	}

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
		(*cp->mc_slab_log)((num - need), *top, TRUE);

	if (!(cp->mc_flags & MCF_DEBUG))
		return (num - need);

debug_alloc:
	if (cp->mc_flags & MCF_DEBUG) {
		mcache_obj_t **o = top;
		unsigned int n = 0;

		/*
		 * Verify that the chain of objects have the same count as
		 * what we are about to report to the caller.  Any mismatch
		 * here means that the object list is insanely broken and
		 * therefore we must panic.
		 */
		while (*o != NULL) {
			o = &(*o)->obj_next;
			++n;
		}
		if (n != (num - need)) {
			panic("mcache_alloc_ext: %s cp %p corrupted list "
			    "(got %d actual %d)\n", cp->mc_name,
			    (void *)cp, num - need, n);
			/* NOTREACHED */
		}
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);

	return (num - need);
}
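/*
 * Illustrative sketch of a batched transaction (hypothetical cache):
 * the returned objects come back chained through obj_next, and a
 * non-blocking request may be satisfied only partially:
 *
 *	mcache_obj_t *list;
 *	unsigned int n = mcache_alloc_ext(cp, &list, 32, MCR_NOSLEEP);
 *	// n objects (possibly < 32) are now linked from list
 *	...
 *	mcache_free_ext(cp, list);
 */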
/*
 * Allocate a single object from a cache.
 */
__private_extern__ void *
mcache_alloc(mcache_t *cp, int wait)
{
	mcache_obj_t *buf;

	(void) mcache_alloc_ext(cp, &buf, 1, wait);
	return (buf);
}
__private_extern__ void
mcache_waiter_inc(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, 1);
}

__private_extern__ void
mcache_waiter_dec(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, -1);
}

__private_extern__ boolean_t
mcache_bkt_isempty(mcache_t *cp)
{
	/*
	 * This isn't meant to accurately tell whether there are
	 * any full buckets in the cache; it is simply a way to
	 * obtain "hints" about the state of the cache.
	 */
	return (cp->mc_full.bl_total == 0);
}
/*
 * Notify the slab layer about an event.
 */
static void
mcache_notify(mcache_t *cp, u_int32_t event)
{
	if (cp->mc_slab_notify != NULL)
		(*cp->mc_slab_notify)(cp->mc_private, event);
}
/*
 * Purge the cache and disable its buckets.
 */
static void
mcache_purge(void *arg)
{
	mcache_t *cp = arg;

	mcache_bkt_purge(cp);
	/*
	 * We cannot simply call mcache_cache_bkt_enable() from here as
	 * a bucket resize may be in flight and we would cause the CPU
	 * layers of the cache to point to different sizes.  Therefore,
	 * we simply increment the enable count so that during the next
	 * periodic cache update the buckets can be reenabled.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	cp->mc_enable_cnt++;
	lck_mtx_unlock(&cp->mc_sync_lock);
}
__private_extern__ boolean_t
mcache_purge_cache(mcache_t *cp)
{
	/*
	 * Purging a cache that has no per-CPU caches or is already
	 * in the process of being purged is rather pointless.
	 */
	if (cp->mc_flags & MCF_NOCPUCACHE)
		return (FALSE);

	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (cp->mc_purge_cnt > 0) {
		lck_mtx_unlock(&cp->mc_sync_lock);
		return (FALSE);
	}
	cp->mc_purge_cnt++;
	lck_mtx_unlock(&cp->mc_sync_lock);

	mcache_dispatch(mcache_purge, cp);

	return (TRUE);
}
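/*
 * Illustrative sketch: a client that detects memory pressure can ask
 * for an asynchronous purge; the call returns FALSE if the cache has
 * no CPU layer or if a purge is already in flight:
 *
 *	if (mcache_purge_cache(cp)) {
 *		// buckets will be drained now and reenabled during a
 *		// later periodic cache update
 *	}
 */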
/*
 * Free a single object to a cache.
 */
__private_extern__ void
mcache_free(mcache_t *cp, void *buf)
{
	((mcache_obj_t *)buf)->obj_next = NULL;
	mcache_free_ext(cp, (mcache_obj_t *)buf);
}
/*
 * Free one or more objects to a cache.
 */
__private_extern__ void
mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
{
	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	mcache_bkttype_t *btp;
	mcache_obj_t *nlist;
	mcache_bkt_t *bkt;

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
		(*cp->mc_slab_log)(0, list, FALSE);

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If there is space in the current CPU's filled bucket, put
		 * the object there and return once all objects are freed.
		 * Note the cast to unsigned integer takes care of the case
		 * where the bucket layer is disabled (when cc_objs is -1).
		 */
		if ((unsigned int)ccp->cc_objs <
		    (unsigned int)ccp->cc_bktsize) {
			/*
			 * Reverse the list while we place the object into the
			 * bucket; this effectively causes the most recently
			 * freed object(s) to be reused during allocation.
			 */
			nlist = list->obj_next;
			list->obj_next = (ccp->cc_objs == 0) ? NULL :
			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
			ccp->cc_free++;

			if ((list = nlist) != NULL)
				continue;

			/* We are done; return to caller */
			MCACHE_UNLOCK(&ccp->cc_lock);

			/* If there is a waiter below, notify it */
			if (cp->mc_waiter_cnt > 0)
				mcache_notify(cp, MCN_RETRYALLOC);
			return;
		}

		/*
		 * The CPU's filled bucket is full.  If the previous filled
		 * bucket was empty, exchange and try again.
		 */
		if (ccp->cc_pobjs == 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, free to slab.  This can
		 * happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0)
			break;

		/*
		 * Both of the CPU's buckets are full; try to get an empty
		 * bucket from the bucket layer.  Upon success, empty this
		 * CPU and place any full bucket into the full list.
		 */
		bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
		if (bkt != NULL) {
			if (ccp->cc_pfilled != NULL)
				mcache_bkt_free(cp, &cp->mc_full,
				    ccp->cc_pfilled);
			mcache_cpu_refill(ccp, bkt, 0);
			continue;
		}

		/*
		 * We need an empty bucket to put our freed objects into
		 * but couldn't get an empty bucket from the bucket layer;
		 * attempt to allocate one.  We do not want to block for
		 * allocation here, and if the bucket allocation fails
		 * we will simply fall through to the slab layer.
		 */
		MCACHE_UNLOCK(&ccp->cc_lock);
		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
		MCACHE_LOCK(&ccp->cc_lock);

		if (bkt != NULL) {
			/*
			 * We have an empty bucket, but since we drop the
			 * CPU lock above, the cache's bucket size may have
			 * changed.  If so, free the bucket and try again.
			 */
			if (ccp->cc_bktsize != btp->bt_bktsize) {
				MCACHE_UNLOCK(&ccp->cc_lock);
				mcache_free(btp->bt_cache, bkt);
				MCACHE_LOCK(&ccp->cc_lock);
				continue;
			}

			/*
			 * We have an empty bucket of the right size;
			 * add it to the bucket layer and try again.
			 */
			mcache_bkt_free(cp, &cp->mc_empty, bkt);
			continue;
		}

		/*
		 * The bucket layer has no empty buckets; free the
		 * object(s) directly to the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	/* If there is a waiter below, notify it */
	if (cp->mc_waiter_cnt > 0)
		mcache_notify(cp, MCN_RETRYALLOC);

	/* Advise the slab layer to purge the object(s) */
	(*cp->mc_slab_free)(cp->mc_private, list,
	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
}
/*
 * Cache destruction routine.
 */
__private_extern__ void
mcache_destroy(mcache_t *cp)
{
	void **pbuf;

	MCACHE_LIST_LOCK();
	LIST_REMOVE(cp, mc_list);
	MCACHE_LIST_UNLOCK();

	mcache_bkt_purge(cp);

	/*
	 * This cache is dead; there should be no further transaction.
	 * If it's still invoked, make sure that it induces a fault.
	 */
	cp->mc_slab_alloc = NULL;
	cp->mc_slab_free = NULL;
	cp->mc_slab_audit = NULL;

	lck_attr_free(cp->mc_bkt_lock_attr);
	lck_grp_free(cp->mc_bkt_lock_grp);
	lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);

	lck_attr_free(cp->mc_cpu_lock_attr);
	lck_grp_free(cp->mc_cpu_lock_grp);
	lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);

	lck_attr_free(cp->mc_sync_lock_attr);
	lck_grp_free(cp->mc_sync_lock_grp);
	lck_grp_attr_free(cp->mc_sync_lock_grp_attr);

	/*
	 * TODO: We need to destroy the zone here, but cannot do it
	 * because there is no such way to achieve that.  Until then
	 * the memory allocated for the zone structure is leaked.
	 * Once it is achievable, uncomment these lines:
	 *
	 *	if (cp->mc_slab_zone != NULL) {
	 *		zdestroy(cp->mc_slab_zone);
	 *		cp->mc_slab_zone = NULL;
	 *	}
	 */

	/* Get the original address since we're about to free it */
	pbuf = (void **)((intptr_t)cp - sizeof (void *));

	zfree(mcache_zone, *pbuf);
}
/*
 * Internal slab allocator used as a backend for simple caches.  The current
 * implementation uses the zone allocator for simplicity reasons.
 */
static unsigned int
mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mcache_t *cp = arg;
	unsigned int need = num;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *buf, *base, **pbuf;
	mcache_obj_t **list = *plist;

	/*
	 * The address of the object returned to the caller is an
	 * offset from the 64-bit aligned base address only if the
	 * cache's alignment requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	for (;;) {
		if (!(wait & MCR_NOSLEEP))
			buf = zalloc(cp->mc_slab_zone);
		else
			buf = zalloc_noblock(cp->mc_slab_zone);

		if (buf == NULL)
			break;

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
		    sizeof (u_int64_t));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof (void *));
		*pbuf = buf;

		/*
		 * If auditing is enabled, patternize the contents of
		 * the buffer starting from the 64-bit aligned base to
		 * the end of the buffer; the length is rounded up to
		 * the nearest 64-bit multiple; this is because we use
		 * 64-bit memory access to set/check the pattern.
		 */
		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)buf + cp->mc_chunksize));
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		}

		/*
		 * Fix up the object's address to fulfill the cache's
		 * alignment requirement (if needed) and return this
		 * to the caller.
		 */
		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
		    ((intptr_t)buf + cp->mc_chunksize));
		*list = (mcache_obj_t *)((intptr_t)base + offset);

		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		/* If we got them all, return to mcache */
		if (--need == 0)
			break;
	}

	return (num - need);
}
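/*
 * Illustrative layout of a zone-backed chunk as constructed above
 * (widths not to scale):
 *
 *	buf          pbuf        base                base + offset
 *	|            |           |                   |
 *	v            v           v                   v
 *	+------------+-----------+-------------------+------------------+
 *	| align pad  | saved buf | align pad (if any)| object (bufsize) |
 *	+------------+-----------+-------------------+------------------+
 *
 * The saved pointer at pbuf = base - sizeof (void *) lets
 * mcache_slab_free() recover the original zone address regardless of
 * the alignment fix-ups applied here.
 */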
/*
 * Internal slab deallocator used as a backend for simple caches.
 */
static void
mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
{
	mcache_t *cp = arg;
	mcache_obj_t *nlist;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *base;
	void **pbuf;

	/*
	 * The address of the object is an offset from a 64-bit
	 * aligned base address only if the cache's alignment
	 * requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;

		/* Get the 64-bit aligned base address of this object */
		base = (void *)((intptr_t)list - offset);
		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));

		/* Get the original address since we're about to free it */
		pbuf = (void **)((intptr_t)base - sizeof (void *));

		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)*pbuf + cp->mc_chunksize));
			mcache_audit_free_verify(NULL, base, offset, rsize);
		}

		/* Free it to zone */
		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));
		zfree(cp->mc_slab_zone, *pbuf);

		/* No more objects to free; return to mcache */
		if ((list = nlist) == NULL)
			break;
	}
}
/*
 * Internal slab auditor for simple caches.
 */
static void
mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mcache_t *cp = arg;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	void *base, **pbuf;

	/*
	 * The address of the object returned to the caller is an
	 * offset from the 64-bit aligned base address only if the
	 * cache's alignment requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	while (list != NULL) {
		mcache_obj_t *next = list->obj_next;

		/* Get the 64-bit aligned base address of this object */
		base = (void *)((intptr_t)list - offset);
		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));

		/* Get the original address */
		pbuf = (void **)((intptr_t)base - sizeof (void *));

		VERIFY(((intptr_t)base + rsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));

		if (!alloc)
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		else
			mcache_audit_free_verify_set(NULL, base, offset, rsize);

		list = list->obj_next = next;
	}
}
/*
 * Refill the CPU's filled bucket with bkt and save the previous one.
 */
static void
mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
{
	ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
	    (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
	ASSERT(ccp->cc_bktsize > 0);

	ccp->cc_pfilled = ccp->cc_filled;
	ccp->cc_pobjs = ccp->cc_objs;
	ccp->cc_filled = bkt;
	ccp->cc_objs = objs;
}
/*
 * Allocate a bucket from the bucket layer.
 */
static mcache_bkt_t *
mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
{
	mcache_bkt_t *bkt;

	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
		/*
		 * The bucket layer lock is held by another CPU; increase
		 * the contention count so that we can later resize the
		 * bucket size accordingly.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->mc_bkt_contention++;
	}

	if ((bkt = blp->bl_list) != NULL) {
		blp->bl_list = bkt->bkt_next;
		if (--blp->bl_total < blp->bl_min)
			blp->bl_min = blp->bl_total;
		blp->bl_alloc++;
	}

	if (btp != NULL)
		*btp = cp->cache_bkttype;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	return (bkt);
}
/*
 * Free a bucket to the bucket layer.
 */
static void
mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
{
	MCACHE_LOCK(&cp->mc_bkt_lock);

	bkt->bkt_next = blp->bl_list;
	blp->bl_list = bkt;
	blp->bl_total++;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);
}
/*
 * Enable the bucket layer of a cache.
 */
static void
mcache_cache_bkt_enable(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	int cpu;

	if (cp->mc_flags & MCF_NOCPUCACHE)
		return;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];
		MCACHE_LOCK(&ccp->cc_lock);
		ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
		MCACHE_UNLOCK(&ccp->cc_lock);
	}
}
/*
 * Purge all buckets from a cache and disable its bucket layer.
 */
static void
mcache_bkt_purge(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	mcache_bkt_t *bp, *pbp;
	mcache_bkttype_t *btp;
	int cpu, objs, pobjs;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];

		MCACHE_LOCK(&ccp->cc_lock);

		btp = cp->cache_bkttype;
		bp = ccp->cc_filled;
		pbp = ccp->cc_pfilled;
		objs = ccp->cc_objs;
		pobjs = ccp->cc_pobjs;
		ccp->cc_filled = NULL;
		ccp->cc_pfilled = NULL;
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
		ccp->cc_bktsize = 0;

		MCACHE_UNLOCK(&ccp->cc_lock);

		if (bp != NULL)
			mcache_bkt_destroy(cp, btp, bp, objs);
		if (pbp != NULL)
			mcache_bkt_destroy(cp, btp, pbp, pobjs);
	}

	/*
	 * Updating the working set back to back essentially sets
	 * the working set size to zero, so everything is reapable.
	 */
	mcache_bkt_ws_update(cp);
	mcache_bkt_ws_update(cp);

	mcache_bkt_ws_reap(cp);
}
/*
 * Free one or more objects in the bucket to the slab layer,
 * and also free the bucket itself.
 */
static void
mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
    int nobjs)
{
	if (nobjs > 0) {
		mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];

		if (cp->mc_flags & MCF_DEBUG) {
			mcache_obj_t *o = top;
			int cnt = 0;

			/*
			 * Verify that the chain of objects in the bucket is
			 * valid.  Any mismatch here means a mistake when the
			 * object(s) were freed to the CPU layer, so we panic.
			 */
			while (o != NULL) {
				o = o->obj_next;
				++cnt;
			}
			if (cnt != nobjs) {
				panic("mcache_bkt_destroy: %s cp %p corrupted "
				    "list in bkt %p (nobjs %d actual %d)\n",
				    cp->mc_name, (void *)cp, (void *)bkt,
				    nobjs, cnt);
				/* NOTREACHED */
			}
		}

		/* Advise the slab layer to purge the object(s) */
		(*cp->mc_slab_free)(cp->mc_private, top,
		    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
	}
	mcache_free(btp->bt_cache, bkt);
}
/*
 * Update the bucket layer working set statistics.
 */
static void
mcache_bkt_ws_update(mcache_t *cp)
{
	MCACHE_LOCK(&cp->mc_bkt_lock);

	cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
	cp->mc_full.bl_min = cp->mc_full.bl_total;
	cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
	cp->mc_empty.bl_min = cp->mc_empty.bl_total;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);
}
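/*
 * Worked example (illustrative): suppose the full list holds 10
 * buckets and never dips below 4 between two update intervals; bl_min
 * then reads 4 when the next update runs, which records
 * bl_reaplimit = 4.  The reaper below may destroy up to
 * min(bl_reaplimit, bl_min) buckets, i.e. only those that went unused
 * during the interval.  Calling the update twice back to back (as
 * mcache_bkt_purge() does) collapses the working set to zero, making
 * every bucket reapable.
 */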
/*
 * Reap all buckets that are beyond the working set.
 */
static void
mcache_bkt_ws_reap(mcache_t *cp)
{
	long reap;
	mcache_bkt_t *bkt;
	mcache_bkttype_t *btp;

	reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
	while (reap-- &&
	    (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
		mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);

	reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
	while (reap-- &&
	    (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
		mcache_bkt_destroy(cp, btp, bkt, 0);
}
static void
mcache_reap_timeout(void *arg)
{
	volatile UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	*flag = 0;
}

static void
mcache_reap_done(void *flag)
{
	timeout(mcache_reap_timeout, flag, mcache_reap_interval);
}

static void
mcache_reap_start(void *arg)
{
	UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	mcache_applyall(mcache_cache_reap);
	mcache_dispatch(mcache_reap_done, flag);
}
__private_extern__ void
mcache_reap(void)
{
	UInt32 *flag = &mcache_reaping;

	if (mcache_llock_owner == current_thread() ||
	    !OSCompareAndSwap(0, 1, flag))
		return;

	mcache_dispatch(mcache_reap_start, flag);
}
static void
mcache_cache_reap(mcache_t *cp)
{
	mcache_bkt_ws_reap(cp);
}
/*
 * Performs periodic maintenance on a cache.
 */
static void
mcache_cache_update(mcache_t *cp)
{
	int need_bkt_resize = 0;
	int need_bkt_reenable = 0;

	lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);

	mcache_bkt_ws_update(cp);

	/*
	 * Cache resize and post-purge reenable are mutually exclusive.
	 * If the cache was previously purged, there is no point of
	 * increasing the bucket size as there was an indication of
	 * memory pressure on the system.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
		need_bkt_reenable = 1;
	lck_mtx_unlock(&cp->mc_sync_lock);

	MCACHE_LOCK(&cp->mc_bkt_lock);
	/*
	 * If the contention count is greater than the threshold, and if
	 * we are not already at the maximum bucket size, increase it.
	 * Otherwise, if this cache was previously purged by the user
	 * then we simply reenable it.
	 */
	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	    mcache_bkt_contention && !need_bkt_reenable)
		need_bkt_resize = 1;

	cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	if (need_bkt_resize)
		mcache_dispatch(mcache_cache_bkt_resize, cp);
	else if (need_bkt_reenable)
		mcache_dispatch(mcache_cache_enable, cp);
}
/*
 * Recompute a cache's bucket size.  This is an expensive operation
 * and should not be done frequently; larger buckets provide for a
 * higher transfer rate with the bucket layer, while smaller buckets
 * reduce the memory consumption.
 */
static void
mcache_cache_bkt_resize(void *arg)
{
	mcache_t *cp = arg;
	mcache_bkttype_t *btp = cp->cache_bkttype;

	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
		mcache_bkt_purge(cp);

		/*
		 * Upgrade to the next bucket type with larger bucket size;
		 * temporarily set the previous contention snapshot to a
		 * negative number to prevent unnecessary resize request.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->cache_bkttype = ++btp;
		cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
		MCACHE_UNLOCK(&cp->mc_bkt_lock);

		mcache_cache_enable(cp);
	}
}
/*
 * Reenable a cache that was previously disabled by a purge.
 */
static void
mcache_cache_enable(void *arg)
{
	mcache_t *cp = arg;

	lck_mtx_lock_spin(&cp->mc_sync_lock);
	cp->mc_purge_cnt = 0;
	cp->mc_enable_cnt = 0;
	lck_mtx_unlock(&cp->mc_sync_lock);

	mcache_cache_bkt_enable(cp);
}
static void
mcache_update_timeout(__unused void *arg)
{
	timeout(mcache_update, NULL, mcache_reap_interval);
}

static void
mcache_update(__unused void *arg)
{
	mcache_applyall(mcache_cache_update);
	mcache_dispatch(mcache_update_timeout, NULL);
}
static void
mcache_applyall(void (*func)(mcache_t *))
{
	mcache_t *cp;

	MCACHE_LIST_LOCK();
	LIST_FOREACH(cp, &mcache_head, mc_list) {
		func(cp);
	}
	MCACHE_LIST_UNLOCK();
}
static void
mcache_dispatch(void (*func)(void *), void *arg)
{
	ASSERT(func != NULL);
	timeout(func, arg, hz/1000);
}
__private_extern__ void
mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
    struct timeval *base_ts)
{
	struct timeval now, base = { 0, 0 };
	void *stack[MCACHE_STACK_DEPTH + 1];

	mca->mca_addr = addr;
	mca->mca_cache = cp;
	mca->mca_pthread = mca->mca_thread;
	mca->mca_thread = current_thread();
	bcopy(mca->mca_stack, mca->mca_pstack, sizeof (mca->mca_pstack));
	mca->mca_pdepth = mca->mca_depth;
	bzero(stack, sizeof (stack));
	mca->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], mca->mca_stack, sizeof (mca->mca_pstack));

	mca->mca_ptstamp = mca->mca_tstamp;
	microuptime(&now);
	if (base_ts != NULL)
		base = *base_ts;
	/* tstamp is in ms relative to base_ts */
	mca->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
	if ((now.tv_sec - base.tv_sec) > 0)
		mca->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
}
__private_extern__ void
mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf = (u_int64_t *)buf_arg;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	while (buf < buf_end)
		*buf++ = pattern;
}
__private_extern__ void *
mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != pattern)
			return (buf);
	}
	return (NULL);
}
__private_extern__ void *
mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
    size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != old) {
			mcache_set_pattern(old, buf_arg,
			    (uintptr_t)buf - (uintptr_t)buf_arg);
			return (buf);
		}
		*buf = new;
	}
	return (NULL);
}
__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}
__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}
__private_extern__ char *
mcache_dump_mca(mcache_audit_t *mca)
{
	if (mca_dump_buf == NULL)
		return (NULL);

	snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
	    "mca %p: addr %p, cache %p (%s)\n"
	    "last transaction; thread %p, saved PC stack (%d deep):\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "previous transaction; thread %p, saved PC stack (%d deep):\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n",
	    mca, mca->mca_addr, mca->mca_cache,
	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
	    mca->mca_thread, mca->mca_depth,
	    mca->mca_stack[0], mca->mca_stack[1], mca->mca_stack[2],
	    mca->mca_stack[3], mca->mca_stack[4], mca->mca_stack[5],
	    mca->mca_stack[6], mca->mca_stack[7], mca->mca_stack[8],
	    mca->mca_stack[9], mca->mca_stack[10], mca->mca_stack[11],
	    mca->mca_stack[12], mca->mca_stack[13], mca->mca_stack[14],
	    mca->mca_stack[15],
	    mca->mca_pthread, mca->mca_pdepth,
	    mca->mca_pstack[0], mca->mca_pstack[1], mca->mca_pstack[2],
	    mca->mca_pstack[3], mca->mca_pstack[4], mca->mca_pstack[5],
	    mca->mca_pstack[6], mca->mca_pstack[7], mca->mca_pstack[8],
	    mca->mca_pstack[9], mca->mca_pstack[10], mca->mca_pstack[11],
	    mca->mca_pstack[12], mca->mca_pstack[13], mca->mca_pstack[14],
	    mca->mca_pstack[15]);

	return (mca_dump_buf);
}
__private_extern__ void
mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got)
{
	if (mca == NULL) {
		panic("mcache_audit: buffer %p modified after free at "
		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
		    offset, got, expected);
		/* NOTREACHED */
	}

	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	    "(0x%llx instead of 0x%llx)\n%s\n",
	    addr, offset, got, expected, mcache_dump_mca(mca));
	/* NOTREACHED */
}
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	return (0);
}