bsd/kern/mcache.c

   1 /*
   2  * Copyright (c) 2006-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  * Memory allocator with per-CPU caching, derived from the kmem magazine
  31  * concept and implementation as described in the following paper:
  32  * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
  33  * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
  34  * reserved.  Use is subject to license terms.
  35  *
  36  * There are several major differences between this and the original kmem
  37  * magazine: this derivative implementation allows for multiple objects to
  38  * be allocated and freed from/to the object cache in one call; in addition,
  39  * it provides for better flexibility where the user is allowed to define
  40  * its own slab allocator (instead of the default zone allocator).  Finally,
  41  * no object construction/destruction takes place at the moment, although
  42  * this could be added in future to improve efficiency.
  43  */
  44
  45 #include <sys/param.h>
  46 #include <sys/types.h>
  47 #include <sys/malloc.h>
  48 #include <sys/mbuf.h>
  49 #include <sys/queue.h>
  50 #include <sys/kernel.h>
  51 #include <sys/systm.h>
  52
  53 #include <kern/debug.h>
  54 #include <kern/zalloc.h>
  55 #include <kern/cpu_number.h>
  56 #include <kern/locks.h>
  57 #include <kern/thread_call.h>
  58
  59 #include <libkern/libkern.h>
  60 #include <libkern/OSAtomic.h>
  61 #include <libkern/OSDebug.h>
  62
  63 #include <mach/vm_param.h>
  64 #include <machine/limits.h>
  65 #include <machine/machine_routines.h>
  66
  67 #include <string.h>
  68
  69 #include <sys/mcache.h>
  70
  71 #define MCACHE_SIZE(n) \
  72         ((size_t)(&((mcache_t *)0)->mc_cpu[n]))
  73
  74 /* Allocate extra in case we need to manually align the pointer */
  75 #define MCACHE_ALLOC_SIZE \
  76         (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)
  77
  78 #define MCACHE_CPU(c) \
  79         (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
  80
  81 /*
  82  * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
  83  * to serialize accesses to the global list of caches in the system.
  84  * They also record the thread currently running in the critical
  85  * section, so that we can avoid recursive requests to reap the
  86  * caches when memory runs low.
  87  */
  88 #define MCACHE_LIST_LOCK() {                            \
  89         lck_mtx_lock(mcache_llock);                     \
  90         mcache_llock_owner = current_thread();          \
  91 }
  92
  93 #define MCACHE_LIST_UNLOCK() {                          \
  94         mcache_llock_owner = NULL;                      \
  95         lck_mtx_unlock(mcache_llock);                   \
  96 }
  97
  98 #define MCACHE_LOCK(l)          lck_mtx_lock(l)
  99 #define MCACHE_UNLOCK(l)        lck_mtx_unlock(l)
 100 #define MCACHE_LOCK_TRY(l)      lck_mtx_try_lock(l)
 101
 102 static int ncpu;
 103 static unsigned int cache_line_size;
 104 static lck_mtx_t *mcache_llock;
 105 static struct thread *mcache_llock_owner;
 106 static lck_attr_t *mcache_llock_attr;
 107 static lck_grp_t *mcache_llock_grp;
 108 static lck_grp_attr_t *mcache_llock_grp_attr;
 109 static struct zone *mcache_zone;
 110 static const uint32_t mcache_reap_interval = 15;
 111 static const uint32_t mcache_reap_interval_leeway = 2;
 112 static UInt32 mcache_reaping;
 113 static int mcache_ready;
 114 static int mcache_updating;
 115
 116 static int mcache_bkt_contention = 3;
 117 #if DEBUG
 118 static unsigned int mcache_flags = MCF_DEBUG;
 119 #else
 120 static unsigned int mcache_flags = 0;
 121 #endif
 122
 123 int mca_trn_max = MCA_TRN_MAX;
 124
 125 #define DUMP_MCA_BUF_SIZE       512
 126 static char *mca_dump_buf;
 127
 128 static mcache_bkttype_t mcache_bkttype[] = {
 129         { 1,    4096,   32768,  NULL },
 130         { 3,    2048,   16384,  NULL },
 131         { 7,    1024,   12288,  NULL },
 132         { 15,   256,    8192,   NULL },
 133         { 31,   64,     4096,   NULL },
 134         { 47,   0,      2048,   NULL },
 135         { 63,   0,      1024,   NULL },
 136         { 95,   0,      512,    NULL },
 137         { 143,  0,      256,    NULL },
 138         { 165,  0,      0,      NULL },
 139 };
 140
 141 static mcache_t *mcache_create_common(const char *, size_t, size_t,
 142     mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
 143     mcache_notifyfn_t, void *, u_int32_t, int, int);
 144 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
 145     unsigned int, int);
 146 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
 147 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
 148 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
 149 static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
 150     mcache_bkttype_t **);
 151 static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
 152 static void mcache_cache_bkt_enable(mcache_t *);
 153 static void mcache_bkt_purge(mcache_t *);
 154 static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
 155     mcache_bkt_t *, int);
 156 static void mcache_bkt_ws_update(mcache_t *);
 157 static void mcache_bkt_ws_reap(mcache_t *);
 158 static void mcache_dispatch(void (*)(void *), void *);
 159 static void mcache_cache_reap(mcache_t *);
 160 static void mcache_cache_update(mcache_t *);
 161 static void mcache_cache_bkt_resize(void *);
 162 static void mcache_cache_enable(void *);
 163 static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
 164 static void mcache_update_timeout(void *);
 165 static void mcache_applyall(void (*)(mcache_t *));
 166 static void mcache_reap_start(void *);
 167 static void mcache_reap_done(void *);
 168 static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
 169 static void mcache_notify(mcache_t *, u_int32_t);
 170 static void mcache_purge(void *);
 171
 172 static LIST_HEAD(, mcache) mcache_head;
 173 mcache_t *mcache_audit_cache;
 174
 175 static thread_call_t mcache_reap_tcall;
 176 static thread_call_t mcache_update_tcall;
 177
 178 /*
 179  * Initialize the framework; this is currently called as part of BSD init.
 180  */
 181 __private_extern__ void
 182 mcache_init(void)
 183 {
 184         mcache_bkttype_t *btp;
 185         unsigned int i;
 186         char name[32];
 187
 188         VERIFY(mca_trn_max >= 2);
 189
 190         ncpu = ml_get_max_cpus();
 191         (void) mcache_cache_line_size();        /* prime it */
 192
 193         mcache_llock_grp_attr = lck_grp_attr_alloc_init();
 194         mcache_llock_grp = lck_grp_alloc_init("mcache.list",
 195             mcache_llock_grp_attr);
 196         mcache_llock_attr = lck_attr_alloc_init();
 197         mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
 198
 199         mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
 200         mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
 201         if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL)
 202                 panic("mcache_init: thread_call_allocate failed");
 203
 204         mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
 205             PAGE_SIZE, "mcache");
 206         if (mcache_zone == NULL)
 207                 panic("mcache_init: failed to allocate mcache zone\n");
 208         zone_change(mcache_zone, Z_CALLERACCT, FALSE);
 209
 210         LIST_INIT(&mcache_head);
 211
 212         for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
 213                 btp = &mcache_bkttype[i];
 214                 (void) snprintf(name, sizeof (name), "bkt_%d",
 215                     btp->bt_bktsize);
 216                 btp->bt_cache = mcache_create(name,
 217                     (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
 218         }
 219
 220         PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
 221         mcache_flags &= MCF_FLAGS_MASK;
 222
 223         mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
 224             0, 0, MCR_SLEEP);
 225
 226         mcache_applyall(mcache_cache_bkt_enable);
 227         mcache_ready = 1;
 228
 229         printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
 230             ncpu, CPU_CACHE_LINE_SIZE);
 231 }
 232
 233 /*
 234  * Return the global mcache flags.
 235  */
 236 __private_extern__ unsigned int
 237 mcache_getflags(void)
 238 {
 239         return (mcache_flags);
 240 }
 241
 242 /*
 243  * Return the CPU cache line size.
 244  */
 245 __private_extern__ unsigned int
 246 mcache_cache_line_size(void)
 247 {
 248         if (cache_line_size == 0) {
 249                 ml_cpu_info_t cpu_info;
 250                 ml_cpu_get_info(&cpu_info);
 251                 cache_line_size = cpu_info.cache_line_size;
 252         }
 253         return (cache_line_size);
 254 }
 255
 256 /*
 257  * Create a cache using the zone allocator as the backend slab allocator.
 258  * The caller may specify any alignment for the object; if it specifies 0
 259  * the default alignment (MCACHE_ALIGN) will be used.
 260  */
 261 __private_extern__ mcache_t *
 262 mcache_create(const char *name, size_t bufsize, size_t align,
 263     u_int32_t flags, int wait)
 264 {
 265         return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
 266             mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
 267             wait));
 268 }
 269
 270 /*
 271  * Create a cache using a custom backend slab allocator.  Since the caller
 272  * is responsible for allocation, no alignment guarantee will be provided
 273  * by this framework.
 274  */
 275 __private_extern__ mcache_t *
 276 mcache_create_ext(const char *name, size_t bufsize,
 277     mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
 278     mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
 279     u_int32_t flags, int wait)
 280 {
 281         return (mcache_create_common(name, bufsize, 0, allocfn,
 282             freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
 283 }
 284
 285 /*
 286  * Common cache creation routine.
 287  */
 288 static mcache_t *
 289 mcache_create_common(const char *name, size_t bufsize, size_t align,
 290     mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
 291     mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
 292     u_int32_t flags, int need_zone, int wait)
 293 {
 294         mcache_bkttype_t *btp;
 295         mcache_t *cp = NULL;
 296         size_t chunksize;
 297         void *buf, **pbuf;
 298         int c;
 299         char lck_name[64];
 300
 301         /* If auditing is on and print buffer is NULL, allocate it now */
 302         if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
 303                 int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
 304                 MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
 305                     malloc_wait | M_ZERO);
 306                 if (mca_dump_buf == NULL)
 307                         return (NULL);
 308         }
 309
 310         if (!(wait & MCR_NOSLEEP))
 311                 buf = zalloc(mcache_zone);
 312         else
 313                 buf = zalloc_noblock(mcache_zone);
 314
 315         if (buf == NULL)
 316                 goto fail;
 317
 318         bzero(buf, MCACHE_ALLOC_SIZE);
 319
 320         /*
 321          * In case we didn't get a cache-aligned memory, round it up
 322          * accordingly.  This is needed in order to get the rest of
 323          * structure members aligned properly.  It also means that
 324          * the memory span gets shifted due to the round up, but it
 325          * is okay since we've allocated extra space for this.
 326          */
 327         cp = (mcache_t *)
 328             P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_LINE_SIZE);
 329         pbuf = (void **)((intptr_t)cp - sizeof (void *));
 330         *pbuf = buf;
 331
 332         /*
 333          * Guaranteed alignment is valid only when we use the internal
 334          * slab allocator (currently set to use the zone allocator).
 335          */
 336         if (!need_zone)
 337                 align = 1;
 338         else if (align == 0)
 339                 align = MCACHE_ALIGN;
 340
 341         if ((align & (align - 1)) != 0)
 342                 panic("mcache_create: bad alignment %lu", align);
 343
 344         cp->mc_align = align;
 345         cp->mc_slab_alloc = allocfn;
 346         cp->mc_slab_free = freefn;
 347         cp->mc_slab_audit = auditfn;
 348         cp->mc_slab_log = logfn;
 349         cp->mc_slab_notify = notifyfn;
 350         cp->mc_private = need_zone ? cp : arg;
 351         cp->mc_bufsize = bufsize;
 352         cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
 353
 354         (void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);
 355
 356         (void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
 357         cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
 358         cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
 359             cp->mc_cpu_lock_grp_attr);
 360         cp->mc_cpu_lock_attr = lck_attr_alloc_init();
 361
 362         /*
 363          * Allocation chunk size is the object's size plus any extra size
 364          * needed to satisfy the object's alignment.  It is enforced to be
 365          * at least the size of an LP64 pointer to simplify auditing and to
 366          * handle multiple-element allocation requests, where the elements
 367          * returned are linked together in a list.
 368          */
 369         chunksize = MAX(bufsize, sizeof (u_int64_t));
 370         if (need_zone) {
 371                 /* Enforce 64-bit minimum alignment for zone-based buffers */
 372                 align = MAX(align, sizeof (u_int64_t));
 373                 chunksize += sizeof (void *) + align;
 374                 chunksize = P2ROUNDUP(chunksize, align);
 375                 if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
 376                     PAGE_SIZE, cp->mc_name)) == NULL)
 377                         goto fail;
 378                 zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
 379         }
 380         cp->mc_chunksize = chunksize;
 381
 382         /*
 383          * Initialize the bucket layer.
 384          */
 385         (void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
 386         cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
 387         cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
 388             cp->mc_bkt_lock_grp_attr);
 389         cp->mc_bkt_lock_attr = lck_attr_alloc_init();
 390         lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
 391             cp->mc_bkt_lock_attr);
 392
 393         (void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
 394         cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
 395         cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
 396             cp->mc_sync_lock_grp_attr);
 397         cp->mc_sync_lock_attr = lck_attr_alloc_init();
 398         lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
 399             cp->mc_sync_lock_attr);
 400
 401         for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
 402                 continue;
 403
 404         cp->cache_bkttype = btp;
 405
 406         /*
 407          * Initialize the CPU layer.  Each per-CPU structure is aligned
 408          * on the CPU cache line boundary to prevent false sharing.
 409          */
 410         for (c = 0; c < ncpu; c++) {
 411                 mcache_cpu_t *ccp = &cp->mc_cpu[c];
 412
 413                 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
 414                 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
 415                     cp->mc_cpu_lock_attr);
 416                 ccp->cc_objs = -1;
 417                 ccp->cc_pobjs = -1;
 418         }
 419
 420         if (mcache_ready)
 421                 mcache_cache_bkt_enable(cp);
 422
 423         /* TODO: dynamically create sysctl for stats */
 424
 425         MCACHE_LIST_LOCK();
 426         LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
 427         MCACHE_LIST_UNLOCK();
 428
 429         /*
 430          * If cache buckets are enabled and this is the first cache
 431          * created, start the periodic cache update.
 432          */
 433         if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
 434                 mcache_updating = 1;
 435                 mcache_update_timeout(NULL);
 436         }
 437         if (cp->mc_flags & MCF_DEBUG) {
 438                 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
 439                     "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
 440                     arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
 441         }
 442         return (cp);
 443
 444 fail:
 445         if (buf != NULL)
 446                 zfree(mcache_zone, buf);
 447         return (NULL);
 448 }
 449
 450 /*
 451  * Allocate one or more objects from a cache.
 452  */
 453 __private_extern__ unsigned int
 454 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
 455 {
 456         mcache_cpu_t *ccp;
 457         mcache_obj_t **top = &(*list);
 458         mcache_bkt_t *bkt;
 459         unsigned int need = num;
 460         boolean_t nwretry = FALSE;
 461
 462         /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
 463         VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));
 464
 465         ASSERT(list != NULL);
 466         *list = NULL;
 467
 468         if (num == 0)
 469                 return (0);
 470
 471 retry_alloc:
 472         /* We may not always be running in the same CPU in case of retries */
 473         ccp = MCACHE_CPU(cp);
 474
 475         MCACHE_LOCK(&ccp->cc_lock);
 476         for (;;) {
 477                 /*
 478                  * If we have an object in the current CPU's filled bucket,
 479                  * chain the object to any previous objects and return if
 480                  * we've satisfied the number of requested objects.
 481                  */
 482                 if (ccp->cc_objs > 0) {
 483                         mcache_obj_t *tail;
 484                         int objs;
 485
 486                         /*
 487                          * Objects in the bucket are already linked together
 488                          * with the most recently freed object at the head of
 489                          * the list; grab as many objects as we can.
 490                          */
 491                         objs = MIN((unsigned int)ccp->cc_objs, need);
 492                         *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
 493                         ccp->cc_objs -= objs;
 494                         ccp->cc_alloc += objs;
 495
 496                         tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
 497                         list = &tail->obj_next;
 498                         *list = NULL;
 499
 500                         /* If we got them all, return to caller */
 501                         if ((need -= objs) == 0) {
 502                                 MCACHE_UNLOCK(&ccp->cc_lock);
 503
 504                                 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
 505                                     cp->mc_slab_log != NULL)
 506                                         (*cp->mc_slab_log)(num, *top, TRUE);
 507
 508                                 if (cp->mc_flags & MCF_DEBUG)
 509                                         goto debug_alloc;
 510
 511                                 return (num);
 512                         }
 513                 }
 514
 515                 /*
 516                  * The CPU's filled bucket is empty.  If the previous filled
 517                  * bucket was full, exchange and try again.
 518                  */
 519                 if (ccp->cc_pobjs > 0) {
 520                         mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
 521                         continue;
 522                 }
 523
 524                 /*
 525                  * If the bucket layer is disabled, allocate from slab.  This
 526                  * can happen either because MCF_NOCPUCACHE is set, or because
 527                  * the bucket layer is currently being resized.
 528                  */
 529                 if (ccp->cc_bktsize == 0)
 530                         break;
 531
 532                 /*
 533                  * Both of the CPU's buckets are empty; try to get a full
 534                  * bucket from the bucket layer.  Upon success, refill this
 535                  * CPU and place any empty bucket into the empty list.
 536                  */
 537                 bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
 538                 if (bkt != NULL) {
 539                         if (ccp->cc_pfilled != NULL)
 540                                 mcache_bkt_free(cp, &cp->mc_empty,
 541                                     ccp->cc_pfilled);
 542                         mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
 543                         continue;
 544                 }
 545
 546                 /*
 547                  * The bucket layer has no full buckets; allocate the
 548                  * object(s) directly from the slab layer.
 549                  */
 550                 break;
 551         }
 552         MCACHE_UNLOCK(&ccp->cc_lock);
 553
 554         need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
 555
 556         /*
 557          * If this is a blocking allocation, or if it is non-blocking and
 558          * the cache's full bucket is non-empty, then retry the allocation.
 559          */
 560         if (need > 0) {
 561                 if (!(wait & MCR_NONBLOCKING)) {
 562                         atomic_add_32(&cp->mc_wretry_cnt, 1);
 563                         goto retry_alloc;
 564                 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
 565                     !mcache_bkt_isempty(cp)) {
 566                         if (!nwretry)
 567                                 nwretry = TRUE;
 568                         atomic_add_32(&cp->mc_nwretry_cnt, 1);
 569                         goto retry_alloc;
 570                 } else if (nwretry) {
 571                         atomic_add_32(&cp->mc_nwfail_cnt, 1);
 572                 }
 573         }
 574
 575         if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
 576                 (*cp->mc_slab_log)((num - need), *top, TRUE);
 577
 578         if (!(cp->mc_flags & MCF_DEBUG))
 579                 return (num - need);
 580
 581 debug_alloc:
 582         if (cp->mc_flags & MCF_DEBUG) {
 583                 mcache_obj_t **o = top;
 584                 unsigned int n;
 585
 586                 n = 0;
 587                 /*
 588                  * Verify that the chain of objects have the same count as
 589                  * what we are about to report to the caller.  Any mismatch
 590                  * here means that the object list is insanely broken and
 591                  * therefore we must panic.
 592                  */
 593                 while (*o != NULL) {
 594                         o = &(*o)->obj_next;
 595                         ++n;
 596                 }
 597                 if (n != (num - need)) {
 598                         panic("mcache_alloc_ext: %s cp %p corrupted list "
 599                             "(got %d actual %d)\n", cp->mc_name,
 600                             (void *)cp, num - need, n);
 601                 }
 602         }
 603
 604         /* Invoke the slab layer audit callback if auditing is enabled */
 605         if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
 606                 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
 607
 608         return (num - need);
 609 }
 610
 611 /*
 612  * Allocate a single object from a cache.
 613  */
 614 __private_extern__ void *
 615 mcache_alloc(mcache_t *cp, int wait)
 616 {
 617         mcache_obj_t *buf;
 618
 619         (void) mcache_alloc_ext(cp, &buf, 1, wait);
 620         return (buf);
 621 }
 622
 623 __private_extern__ void
 624 mcache_waiter_inc(mcache_t *cp)
 625 {
 626         atomic_add_32(&cp->mc_waiter_cnt, 1);
 627 }
 628
 629 __private_extern__ void
 630 mcache_waiter_dec(mcache_t *cp)
 631 {
 632         atomic_add_32(&cp->mc_waiter_cnt, -1);
 633 }
 634
 635 __private_extern__ boolean_t
 636 mcache_bkt_isempty(mcache_t *cp)
 637 {
 638         /*
 639          * This isn't meant to accurately tell whether there are
 640          * any full buckets in the cache; it is simply a way to
 641          * obtain "hints" about the state of the cache.
 642          */
 643         return (cp->mc_full.bl_total == 0);
 644 }
 645
 646 /*
 647  * Notify the slab layer about an event.
 648  */
 649 static void
 650 mcache_notify(mcache_t *cp, u_int32_t event)
 651 {
 652         if (cp->mc_slab_notify != NULL)
 653                 (*cp->mc_slab_notify)(cp->mc_private, event);
 654 }
 655
 656 /*
 657  * Purge the cache and disable its buckets.
 658  */
 659 static void
 660 mcache_purge(void *arg)
 661 {
 662         mcache_t *cp = arg;
 663
 664         mcache_bkt_purge(cp);
 665         /*
 666          * We cannot simply call mcache_cache_bkt_enable() from here as
 667          * a bucket resize may be in flight and we would cause the CPU
 668          * layers of the cache to point to different sizes.  Therefore,
 669          * we simply increment the enable count so that during the next
 670          * periodic cache update the buckets can be reenabled.
 671          */
 672         lck_mtx_lock_spin(&cp->mc_sync_lock);
 673         cp->mc_enable_cnt++;
 674         lck_mtx_unlock(&cp->mc_sync_lock);
 675 }
 676
 677 __private_extern__ boolean_t
 678 mcache_purge_cache(mcache_t *cp, boolean_t async)
 679 {
 680         /*
 681          * Purging a cache that has no per-CPU caches or is already
 682          * in the process of being purged is rather pointless.
 683          */
 684         if (cp->mc_flags & MCF_NOCPUCACHE)
 685                 return (FALSE);
 686
 687         lck_mtx_lock_spin(&cp->mc_sync_lock);
 688         if (cp->mc_purge_cnt > 0) {
 689                 lck_mtx_unlock(&cp->mc_sync_lock);
 690                 return (FALSE);
 691         }
 692         cp->mc_purge_cnt++;
 693         lck_mtx_unlock(&cp->mc_sync_lock);
 694
 695         if (async)
 696                 mcache_dispatch(mcache_purge, cp);
 697         else
 698                 mcache_purge(cp);
 699
 700         return (TRUE);
 701 }
 702
 703 /*
 704  * Free a single object to a cache.
 705  */
 706 __private_extern__ void
 707 mcache_free(mcache_t *cp, void *buf)
 708 {
 709         ((mcache_obj_t *)buf)->obj_next = NULL;
 710         mcache_free_ext(cp, (mcache_obj_t *)buf);
 711 }
 712
 713 /*
 714  * Free one or more objects to a cache.
 715  */
 716 __private_extern__ void
 717 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
 718 {
 719         mcache_cpu_t *ccp = MCACHE_CPU(cp);
 720         mcache_bkttype_t *btp;
 721         mcache_obj_t *nlist;
 722         mcache_bkt_t *bkt;
 723
 724         if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
 725                 (*cp->mc_slab_log)(0, list, FALSE);
 726
 727         /* Invoke the slab layer audit callback if auditing is enabled */
 728         if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
 729                 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
 730
 731         MCACHE_LOCK(&ccp->cc_lock);
 732         for (;;) {
 733                 /*
 734                  * If there is space in the current CPU's filled bucket, put
 735                  * the object there and return once all objects are freed.
 736                  * Note the cast to unsigned integer takes care of the case
 737                  * where the bucket layer is disabled (when cc_objs is -1).
 738                  */
 739                 if ((unsigned int)ccp->cc_objs <
 740                     (unsigned int)ccp->cc_bktsize) {
 741                         /*
 742                          * Reverse the list while we place the object into the
 743                          * bucket; this effectively causes the most recently
 744                          * freed object(s) to be reused during allocation.
 745                          */
 746                         nlist = list->obj_next;
 747                         list->obj_next = (ccp->cc_objs == 0) ? NULL :
 748                             ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
 749                         ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
 750                         ccp->cc_free++;
 751
 752                         if ((list = nlist) != NULL)
 753                                 continue;
 754
 755                         /* We are done; return to caller */
 756                         MCACHE_UNLOCK(&ccp->cc_lock);
 757
 758                         /* If there is a waiter below, notify it */
 759                         if (cp->mc_waiter_cnt > 0)
 760                                 mcache_notify(cp, MCN_RETRYALLOC);
 761                         return;
 762                 }
 763
 764                 /*
 765                  * The CPU's filled bucket is full.  If the previous filled
 766                  * bucket was empty, exchange and try again.
 767                  */
 768                 if (ccp->cc_pobjs == 0) {
 769                         mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
 770                         continue;
 771                 }
 772
 773                 /*
 774                  * If the bucket layer is disabled, free to slab.  This can
 775                  * happen either because MCF_NOCPUCACHE is set, or because
 776                  * the bucket layer is currently being resized.
 777                  */
 778                 if (ccp->cc_bktsize == 0)
 779                         break;
 780
 781                 /*
 782                  * Both of the CPU's buckets are full; try to get an empty
 783                  * bucket from the bucket layer.  Upon success, empty this
 784                  * CPU and place any full bucket into the full list.
 785                  */
 786                 bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
 787                 if (bkt != NULL) {
 788                         if (ccp->cc_pfilled != NULL)
 789                                 mcache_bkt_free(cp, &cp->mc_full,
 790                                     ccp->cc_pfilled);
 791                         mcache_cpu_refill(ccp, bkt, 0);
 792                         continue;
 793                 }
 794
 795                 /*
 796                  * We need an empty bucket to put our freed objects into
 797                  * but couldn't get an empty bucket from the bucket layer;
 798                  * attempt to allocate one.  We do not want to block for
 799                  * allocation here, and if the bucket allocation fails
 800                  * we will simply fall through to the slab layer.
 801                  */
 802                 MCACHE_UNLOCK(&ccp->cc_lock);
 803                 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
 804                 MCACHE_LOCK(&ccp->cc_lock);
 805
 806                 if (bkt != NULL) {
 807                         /*
 808                          * We have an empty bucket, but since we drop the
 809                          * CPU lock above, the cache's bucket size may have
 810                          * changed.  If so, free the bucket and try again.
 811                          */
 812                         if (ccp->cc_bktsize != btp->bt_bktsize) {
 813                                 MCACHE_UNLOCK(&ccp->cc_lock);
 814                                 mcache_free(btp->bt_cache, bkt);
 815                                 MCACHE_LOCK(&ccp->cc_lock);
 816                                 continue;
 817                         }
 818
 819                         /*
 820                          * We have an empty bucket of the right size;
 821                          * add it to the bucket layer and try again.
 822                          */
 823                         mcache_bkt_free(cp, &cp->mc_empty, bkt);
 824                         continue;
 825                 }
 826
 827                 /*
 828                  * The bucket layer has no empty buckets; free the
 829                  * object(s) directly to the slab layer.
 830                  */
 831                 break;
 832         }
 833         MCACHE_UNLOCK(&ccp->cc_lock);
 834
 835         /* If there is a waiter below, notify it */
 836         if (cp->mc_waiter_cnt > 0)
 837                 mcache_notify(cp, MCN_RETRYALLOC);
 838
 839         /* Advise the slab layer to purge the object(s) */
 840         (*cp->mc_slab_free)(cp->mc_private, list,
 841             (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
 842 }
 843
 844 /*
 845  * Cache destruction routine.
 846  */
 847 __private_extern__ void
 848 mcache_destroy(mcache_t *cp)
 849 {
 850         void **pbuf;
 851
 852         MCACHE_LIST_LOCK();
 853         LIST_REMOVE(cp, mc_list);
 854         MCACHE_LIST_UNLOCK();
 855
 856         mcache_bkt_purge(cp);
 857
 858         /*
 859          * This cache is dead; there should be no further transaction.
 860          * If it's still invoked, make sure that it induces a fault.
 861          */
 862         cp->mc_slab_alloc = NULL;
 863         cp->mc_slab_free = NULL;
 864         cp->mc_slab_audit = NULL;
 865
 866         lck_attr_free(cp->mc_bkt_lock_attr);
 867         lck_grp_free(cp->mc_bkt_lock_grp);
 868         lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
 869
 870         lck_attr_free(cp->mc_cpu_lock_attr);
 871         lck_grp_free(cp->mc_cpu_lock_grp);
 872         lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
 873
 874         lck_attr_free(cp->mc_sync_lock_attr);
 875         lck_grp_free(cp->mc_sync_lock_grp);
 876         lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
 877
 878         /*
 879          * TODO: We need to destroy the zone here, but cannot do it
 880          * because there is no such way to achieve that.  Until then
 881          * the memory allocated for the zone structure is leaked.
 882          * Once it is achievable, uncomment these lines:
 883          *
 884          *      if (cp->mc_slab_zone != NULL) {
 885          *              zdestroy(cp->mc_slab_zone);
 886          *              cp->mc_slab_zone = NULL;
 887          *      }
 888          */
 889
 890         /* Get the original address since we're about to free it */
 891         pbuf = (void **)((intptr_t)cp - sizeof (void *));
 892
 893         zfree(mcache_zone, *pbuf);
 894 }
 895
 896 /*
 897  * Internal slab allocator used as a backend for simple caches.  The current
 898  * implementation uses the zone allocator for simplicity reasons.
 899  */
 900 static unsigned int
 901 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
 902 {
 903         mcache_t *cp = arg;
 904         unsigned int need = num;
 905         size_t offset = 0;
 906         size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
 907         u_int32_t flags = cp->mc_flags;
 908         void *buf, *base, **pbuf;
 909         mcache_obj_t **list = *plist;
 910
 911         *list = NULL;
 912
 913         /*
 914          * The address of the object returned to the caller is an
 915          * offset from the 64-bit aligned base address only if the
 916          * cache's alignment requirement is neither 1 nor 8 bytes.
 917          */
 918         if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
 919                 offset = cp->mc_align;
 920
 921         for (;;) {
 922                 if (!(wait & MCR_NOSLEEP))
 923                         buf = zalloc(cp->mc_slab_zone);
 924                 else
 925                         buf = zalloc_noblock(cp->mc_slab_zone);
 926
 927                 if (buf == NULL)
 928                         break;
 929
 930                 /* Get the 64-bit aligned base address for this object */
 931                 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
 932                     sizeof (u_int64_t));
 933
 934                 /*
 935                  * Wind back a pointer size from the aligned base and
 936                  * save the original address so we can free it later.
 937                  */
 938                 pbuf = (void **)((intptr_t)base - sizeof (void *));
 939                 *pbuf = buf;
 940
 941                 /*
 942                  * If auditing is enabled, patternize the contents of
 943                  * the buffer starting from the 64-bit aligned base to
 944                  * the end of the buffer; the length is rounded up to
 945                  * the nearest 64-bit multiply; this is because we use
 946                  * 64-bit memory access to set/check the pattern.
 947                  */
 948                 if (flags & MCF_DEBUG) {
 949                         VERIFY(((intptr_t)base + rsize) <=
 950                             ((intptr_t)buf + cp->mc_chunksize));
 951                         mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
 952                 }
 953
 954                 /*
 955                  * Fix up the object's address to fulfill the cache's
 956                  * alignment requirement (if needed) and return this
 957                  * to the caller.
 958                  */
 959                 VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
 960                     ((intptr_t)buf + cp->mc_chunksize));
 961                 *list = (mcache_obj_t *)((intptr_t)base + offset);
 962
 963                 (*list)->obj_next = NULL;
 964                 list = *plist = &(*list)->obj_next;
 965
 966                 /* If we got them all, return to mcache */
 967                 if (--need == 0)
 968                         break;
 969         }
 970
 971         return (num - need);
 972 }
 973
 974 /*
 975  * Internal slab deallocator used as a backend for simple caches.
 976  */
 977 static void
 978 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
 979 {
 980         mcache_t *cp = arg;
 981         mcache_obj_t *nlist;
 982         size_t offset = 0;
 983         size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
 984         u_int32_t flags = cp->mc_flags;
 985         void *base;
 986         void **pbuf;
 987
 988         /*
 989          * The address of the object is an offset from a 64-bit
 990          * aligned base address only if the cache's alignment
 991          * requirement is neither 1 nor 8 bytes.
 992          */
 993         if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
 994                 offset = cp->mc_align;
 995
 996         for (;;) {
 997                 nlist = list->obj_next;
 998                 list->obj_next = NULL;
 999
1000                 /* Get the 64-bit aligned base address of this object */
1001                 base = (void *)((intptr_t)list - offset);
1002                 VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1003
1004                 /* Get the original address since we're about to free it */
1005                 pbuf = (void **)((intptr_t)base - sizeof (void *));
1006
1007                 if (flags & MCF_DEBUG) {
1008                         VERIFY(((intptr_t)base + rsize) <=
1009                             ((intptr_t)*pbuf + cp->mc_chunksize));
1010                         mcache_audit_free_verify(NULL, base, offset, rsize);
1011                 }
1012
1013                 /* Free it to zone */
1014                 VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
1015                     ((intptr_t)*pbuf + cp->mc_chunksize));
1016                 zfree(cp->mc_slab_zone, *pbuf);
1017
1018                 /* No more objects to free; return to mcache */
1019                 if ((list = nlist) == NULL)
1020                         break;
1021         }
1022 }
1023
1024 /*
1025  * Internal slab auditor for simple caches.
1026  */
1027 static void
1028 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1029 {
1030         mcache_t *cp = arg;
1031         size_t offset = 0;
1032         size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
1033         void *base, **pbuf;
1034
1035         /*
1036          * The address of the object returned to the caller is an
1037          * offset from the 64-bit aligned base address only if the
1038          * cache's alignment requirement is neither 1 nor 8 bytes.
1039          */
1040         if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
1041                 offset = cp->mc_align;
1042
1043         while (list != NULL) {
1044                 mcache_obj_t *next = list->obj_next;
1045
1046                 /* Get the 64-bit aligned base address of this object */
1047                 base = (void *)((intptr_t)list - offset);
1048                 VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1049
1050                 /* Get the original address */
1051                 pbuf = (void **)((intptr_t)base - sizeof (void *));
1052
1053                 VERIFY(((intptr_t)base + rsize) <=
1054                     ((intptr_t)*pbuf + cp->mc_chunksize));
1055
1056                 if (!alloc)
1057                         mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1058                 else
1059                         mcache_audit_free_verify_set(NULL, base, offset, rsize);
1060
1061                 list = list->obj_next = next;
1062         }
1063 }
1064
1065 /*
1066  * Refill the CPU's filled bucket with bkt and save the previous one.
1067  */
1068 static void
1069 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1070 {
1071         ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1072             (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1073         ASSERT(ccp->cc_bktsize > 0);
1074
1075         ccp->cc_pfilled = ccp->cc_filled;
1076         ccp->cc_pobjs = ccp->cc_objs;
1077         ccp->cc_filled = bkt;
1078         ccp->cc_objs = objs;
1079 }
1080
1081 /*
1082  * Allocate a bucket from the bucket layer.
1083  */
1084 static mcache_bkt_t *
1085 mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
1086 {
1087         mcache_bkt_t *bkt;
1088
1089         if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1090                 /*
1091                  * The bucket layer lock is held by another CPU; increase
1092                  * the contention count so that we can later resize the
1093                  * bucket size accordingly.
1094                  */
1095                 MCACHE_LOCK(&cp->mc_bkt_lock);
1096                 cp->mc_bkt_contention++;
1097         }
1098
1099         if ((bkt = blp->bl_list) != NULL) {
1100                 blp->bl_list = bkt->bkt_next;
1101                 if (--blp->bl_total < blp->bl_min)
1102                         blp->bl_min = blp->bl_total;
1103                 blp->bl_alloc++;
1104         }
1105
1106         if (btp != NULL)
1107                 *btp = cp->cache_bkttype;
1108
1109         MCACHE_UNLOCK(&cp->mc_bkt_lock);
1110
1111         return (bkt);
1112 }
1113
1114 /*
1115  * Free a bucket to the bucket layer.
1116  */
1117 static void
1118 mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1119 {
1120         MCACHE_LOCK(&cp->mc_bkt_lock);
1121
1122         bkt->bkt_next = blp->bl_list;
1123         blp->bl_list = bkt;
1124         blp->bl_total++;
1125
1126         MCACHE_UNLOCK(&cp->mc_bkt_lock);
1127 }
1128
1129 /*
1130  * Enable the bucket layer of a cache.
1131  */
1132 static void
1133 mcache_cache_bkt_enable(mcache_t *cp)
1134 {
1135         mcache_cpu_t *ccp;
1136         int cpu;
1137
1138         if (cp->mc_flags & MCF_NOCPUCACHE)
1139                 return;
1140
1141         for (cpu = 0; cpu < ncpu; cpu++) {
1142                 ccp = &cp->mc_cpu[cpu];
1143                 MCACHE_LOCK(&ccp->cc_lock);
1144                 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1145                 MCACHE_UNLOCK(&ccp->cc_lock);
1146         }
1147 }
1148
1149 /*
1150  * Purge all buckets from a cache and disable its bucket layer.
1151  */
1152 static void
1153 mcache_bkt_purge(mcache_t *cp)
1154 {
1155         mcache_cpu_t *ccp;
1156         mcache_bkt_t *bp, *pbp;
1157         mcache_bkttype_t *btp;
1158         int cpu, objs, pobjs;
1159
1160         for (cpu = 0; cpu < ncpu; cpu++) {
1161                 ccp = &cp->mc_cpu[cpu];
1162
1163                 MCACHE_LOCK(&ccp->cc_lock);
1164
1165                 btp = cp->cache_bkttype;
1166                 bp = ccp->cc_filled;
1167                 pbp = ccp->cc_pfilled;
1168                 objs = ccp->cc_objs;
1169                 pobjs = ccp->cc_pobjs;
1170                 ccp->cc_filled = NULL;
1171                 ccp->cc_pfilled = NULL;
1172                 ccp->cc_objs = -1;
1173                 ccp->cc_pobjs = -1;
1174                 ccp->cc_bktsize = 0;
1175
1176                 MCACHE_UNLOCK(&ccp->cc_lock);
1177
1178                 if (bp != NULL)
1179                         mcache_bkt_destroy(cp, btp, bp, objs);
1180                 if (pbp != NULL)
1181                         mcache_bkt_destroy(cp, btp, pbp, pobjs);
1182         }
1183
1184         /*
1185          * Updating the working set back to back essentially sets
1186          * the working set size to zero, so everything is reapable.
1187          */
1188         mcache_bkt_ws_update(cp);
1189         mcache_bkt_ws_update(cp);
1190
1191         mcache_bkt_ws_reap(cp);
1192 }
1193
1194 /*
1195  * Free one or more objects in the bucket to the slab layer,
1196  * and also free the bucket itself.
1197  */
1198 static void
1199 mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
1200     int nobjs)
1201 {
1202         if (nobjs > 0) {
1203                 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1204
1205                 if (cp->mc_flags & MCF_DEBUG) {
1206                         mcache_obj_t *o = top;
1207                         int cnt = 0;
1208
1209                         /*
1210                          * Verify that the chain of objects in the bucket is
1211                          * valid.  Any mismatch here means a mistake when the
1212                          * object(s) were freed to the CPU layer, so we panic.
1213                          */
1214                         while (o != NULL) {
1215                                 o = o->obj_next;
1216                                 ++cnt;
1217                         }
1218                         if (cnt != nobjs) {
1219                                 panic("mcache_bkt_destroy: %s cp %p corrupted "
1220                                     "list in bkt %p (nobjs %d actual %d)\n",
1221                                     cp->mc_name, (void *)cp, (void *)bkt,
1222                                     nobjs, cnt);
1223                         }
1224                 }
1225
1226                 /* Advise the slab layer to purge the object(s) */
1227                 (*cp->mc_slab_free)(cp->mc_private, top,
1228                     (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1229         }
1230         mcache_free(btp->bt_cache, bkt);
1231 }
1232
1233 /*
1234  * Update the bucket layer working set statistics.
1235  */
1236 static void
1237 mcache_bkt_ws_update(mcache_t *cp)
1238 {
1239         MCACHE_LOCK(&cp->mc_bkt_lock);
1240
1241         cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1242         cp->mc_full.bl_min = cp->mc_full.bl_total;
1243         cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1244         cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1245
1246         MCACHE_UNLOCK(&cp->mc_bkt_lock);
1247 }
1248
1249 /*
1250  * Reap all buckets that are beyond the working set.
1251  */
1252 static void
1253 mcache_bkt_ws_reap(mcache_t *cp)
1254 {
1255         long reap;
1256         mcache_bkt_t *bkt;
1257         mcache_bkttype_t *btp;
1258
1259         reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1260         while (reap-- &&
1261             (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
1262                 mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
1263
1264         reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1265         while (reap-- &&
1266             (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
1267                 mcache_bkt_destroy(cp, btp, bkt, 0);
1268 }
1269
1270 static void
1271 mcache_reap_timeout(thread_call_param_t dummy __unused,
1272     thread_call_param_t arg)
1273 {
1274         volatile UInt32 *flag = arg;
1275
1276         ASSERT(flag == &mcache_reaping);
1277
1278         *flag = 0;
1279 }
1280
1281 static void
1282 mcache_reap_done(void *flag)
1283 {
1284         uint64_t deadline, leeway;
1285
1286         clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1287             &deadline);
1288         clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1289             NSEC_PER_SEC, &leeway);
1290         thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1291             deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1292 }
1293
1294 static void
1295 mcache_reap_start(void *arg)
1296 {
1297         UInt32 *flag = arg;
1298
1299         ASSERT(flag == &mcache_reaping);
1300
1301         mcache_applyall(mcache_cache_reap);
1302         mcache_dispatch(mcache_reap_done, flag);
1303 }
1304
1305 __private_extern__ void
1306 mcache_reap(void)
1307 {
1308         UInt32 *flag = &mcache_reaping;
1309
1310         if (mcache_llock_owner == current_thread() ||
1311             !OSCompareAndSwap(0, 1, flag))
1312                 return;
1313
1314         mcache_dispatch(mcache_reap_start, flag);
1315 }
1316
1317 static void
1318 mcache_cache_reap(mcache_t *cp)
1319 {
1320         mcache_bkt_ws_reap(cp);
1321 }
1322
1323 /*
1324  * Performs period maintenance on a cache.
1325  */
1326 static void
1327 mcache_cache_update(mcache_t *cp)
1328 {
1329         int need_bkt_resize = 0;
1330         int need_bkt_reenable = 0;
1331
1332         lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1333
1334         mcache_bkt_ws_update(cp);
1335
1336         /*
1337          * Cache resize and post-purge reenable are mutually exclusive.
1338          * If the cache was previously purged, there is no point of
1339          * increasing the bucket size as there was an indication of
1340          * memory pressure on the system.
1341          */
1342         lck_mtx_lock_spin(&cp->mc_sync_lock);
1343         if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
1344                 need_bkt_reenable = 1;
1345         lck_mtx_unlock(&cp->mc_sync_lock);
1346
1347         MCACHE_LOCK(&cp->mc_bkt_lock);
1348         /*
1349          * If the contention count is greater than the threshold, and if
1350          * we are not already at the maximum bucket size, increase it.
1351          * Otherwise, if this cache was previously purged by the user
1352          * then we simply reenable it.
1353          */
1354         if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1355             (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1356             mcache_bkt_contention && !need_bkt_reenable)
1357                 need_bkt_resize = 1;
1358
1359         cp ->mc_bkt_contention_prev = cp->mc_bkt_contention;
1360         MCACHE_UNLOCK(&cp->mc_bkt_lock);
1361
1362         if (need_bkt_resize)
1363                 mcache_dispatch(mcache_cache_bkt_resize, cp);
1364         else if (need_bkt_reenable)
1365                 mcache_dispatch(mcache_cache_enable, cp);
1366 }
1367
1368 /*
1369  * Recompute a cache's bucket size.  This is an expensive operation
1370  * and should not be done frequently; larger buckets provide for a
1371  * higher transfer rate with the bucket while smaller buckets reduce
1372  * the memory consumption.
1373  */
1374 static void
1375 mcache_cache_bkt_resize(void *arg)
1376 {
1377         mcache_t *cp = arg;
1378         mcache_bkttype_t *btp = cp->cache_bkttype;
1379
1380         if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1381                 mcache_bkt_purge(cp);
1382
1383                 /*
1384                  * Upgrade to the next bucket type with larger bucket size;
1385                  * temporarily set the previous contention snapshot to a
1386                  * negative number to prevent unnecessary resize request.
1387                  */
1388                 MCACHE_LOCK(&cp->mc_bkt_lock);
1389                 cp->cache_bkttype = ++btp;
1390                 cp ->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1391                 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1392
1393                 mcache_cache_enable(cp);
1394         }
1395 }
1396
1397 /*
1398  * Reenable a previously disabled cache due to purge.
1399  */
1400 static void
1401 mcache_cache_enable(void *arg)
1402 {
1403         mcache_t *cp = arg;
1404
1405         lck_mtx_lock_spin(&cp->mc_sync_lock);
1406         cp->mc_purge_cnt = 0;
1407         cp->mc_enable_cnt = 0;
1408         lck_mtx_unlock(&cp->mc_sync_lock);
1409
1410         mcache_cache_bkt_enable(cp);
1411 }
1412
1413 static void
1414 mcache_update_timeout(__unused void *arg)
1415 {
1416         uint64_t deadline, leeway;
1417
1418         clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1419             &deadline);
1420         clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1421             NSEC_PER_SEC, &leeway);
1422         thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1423             deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1424 }
1425
1426 static void
1427 mcache_update(thread_call_param_t arg __unused,
1428     thread_call_param_t dummy __unused)
1429 {
1430         mcache_applyall(mcache_cache_update);
1431         mcache_update_timeout(NULL);
1432 }
1433
1434 static void
1435 mcache_applyall(void (*func)(mcache_t *))
1436 {
1437         mcache_t *cp;
1438
1439         MCACHE_LIST_LOCK();
1440         LIST_FOREACH(cp, &mcache_head, mc_list) {
1441                 func(cp);
1442         }
1443         MCACHE_LIST_UNLOCK();
1444 }
1445
1446 static void
1447 mcache_dispatch(void (*func)(void *), void *arg)
1448 {
1449         ASSERT(func != NULL);
1450         timeout(func, arg, hz/1000);
1451 }
1452
1453 __private_extern__ void
1454 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1455     struct timeval *base_ts)
1456 {
1457         struct timeval now, base = { 0, 0 };
1458         void *stack[MCACHE_STACK_DEPTH + 1];
1459         struct mca_trn *transaction;
1460
1461         transaction = &mca->mca_trns[mca->mca_next_trn];
1462
1463         mca->mca_addr = addr;
1464         mca->mca_cache = cp;
1465
1466         transaction->mca_thread = current_thread();
1467
1468         bzero(stack, sizeof (stack));
1469         transaction->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1470         bcopy(&stack[1], transaction->mca_stack,
1471                 sizeof (transaction->mca_stack));
1472
1473         microuptime(&now);
1474         if (base_ts != NULL)
1475                 base = *base_ts;
1476         /* tstamp is in ms relative to base_ts */
1477         transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1478         if ((now.tv_sec - base.tv_sec) > 0)
1479                 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1480
1481         mca->mca_next_trn =
1482                 (mca->mca_next_trn + 1) % mca_trn_max;
1483 }
1484
1485 __private_extern__ void
1486 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1487 {
1488         u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1489         u_int64_t *buf = (u_int64_t *)buf_arg;
1490
1491         VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1492         VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1493
1494         while (buf < buf_end)
1495                 *buf++ = pattern;
1496 }
1497
1498 __private_extern__ void *
1499 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1500 {
1501         u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1502         u_int64_t *buf;
1503
1504         VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1505         VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1506
1507         for (buf = buf_arg; buf < buf_end; buf++) {
1508                 if (*buf != pattern)
1509                         return (buf);
1510         }
1511         return (NULL);
1512 }
1513
1514 __private_extern__ void *
1515 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1516     size_t size)
1517 {
1518         u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1519         u_int64_t *buf;
1520
1521         VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1522         VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1523
1524         for (buf = buf_arg; buf < buf_end; buf++) {
1525                 if (*buf != old) {
1526                         mcache_set_pattern(old, buf_arg,
1527                             (uintptr_t)buf - (uintptr_t)buf_arg);
1528                         return (buf);
1529                 }
1530                 *buf = new;
1531         }
1532         return (NULL);
1533 }
1534
1535 __private_extern__ void
1536 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1537     size_t size)
1538 {
1539         void *addr;
1540         u_int64_t *oaddr64;
1541         mcache_obj_t *next;
1542
1543         addr = (void *)((uintptr_t)base + offset);
1544         next = ((mcache_obj_t *)addr)->obj_next;
1545
1546         /* For the "obj_next" pointer in the buffer */
1547         oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1548         *oaddr64 = MCACHE_FREE_PATTERN;
1549
1550         if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1551             (caddr_t)base, size)) != NULL) {
1552                 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1553                     (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1554                 /* NOTREACHED */
1555         }
1556         ((mcache_obj_t *)addr)->obj_next = next;
1557 }
1558
1559 __private_extern__ void
1560 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1561     size_t size)
1562 {
1563         void *addr;
1564         u_int64_t *oaddr64;
1565         mcache_obj_t *next;
1566
1567         addr = (void *)((uintptr_t)base + offset);
1568         next = ((mcache_obj_t *)addr)->obj_next;
1569
1570         /* For the "obj_next" pointer in the buffer */
1571         oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1572         *oaddr64 = MCACHE_FREE_PATTERN;
1573
1574         if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1575             MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1576                 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1577                     (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1578                 /* NOTREACHED */
1579         }
1580         ((mcache_obj_t *)addr)->obj_next = next;
1581 }
1582
#undef panic

/*
 * Format for dumping one recorded transaction: a label, the thread,
 * the saved stack depth and 16 saved PC values.
 */
#define DUMP_TRN_FMT() \
            "%s transaction thread %p saved PC stack (%d deep):\n" \
            "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
            "\t%p, %p, %p, %p, %p, %p, %p, %p\n"

/*
 * Arguments matching DUMP_TRN_FMT() for transaction slot x of the
 * audit structure named "mca" in the expanding scope; s is the label.
 * NOTE(review): reads mca_stack[0..15], i.e. assumes the saved stack
 * holds at least 16 entries -- confirm against the structure.
 */
#define DUMP_TRN_FIELDS(s, x) \
            s, \
            mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
            mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
            mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
            mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
            mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
            mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
            mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
            mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
            mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]

/*
 * Transaction slot indices.  mcache_buffer_log() fills the slot at
 * mca_next_trn and then advances mca_next_trn, so the most recently
 * recorded transaction sits one slot behind mca_next_trn and the one
 * recorded before it two slots behind.  Multiples of mca_trn_max are
 * added ahead of the subtraction so that the intermediate value never
 * goes negative, whatever the ring size.
 */
#define MCA_TRN_LAST \
            ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
#define MCA_TRN_PREV \
            ((mca->mca_next_trn + (2 * mca_trn_max) - 2) % mca_trn_max)
1604
1605 __private_extern__ char *
1606 mcache_dump_mca(mcache_audit_t *mca)
1607 {
1608         if (mca_dump_buf == NULL)
1609                 return (NULL);
1610
1611         snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1612             "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1613             DUMP_TRN_FMT()
1614             DUMP_TRN_FMT(),
1615
1616             mca, mca->mca_addr, mca->mca_cache,
1617             mca->mca_cache ? mca->mca_cache->mc_name : "?",
1618             mca->mca_next_trn,
1619
1620             DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1621             DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1622
1623         return (mca_dump_buf);
1624 }
1625
1626 __private_extern__ void
1627 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1628     int64_t expected, int64_t got)
1629 {
1630         if (mca == NULL) {
1631                 panic("mcache_audit: buffer %p modified after free at "
1632                     "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1633                     offset, got, expected);
1634                 /* NOTREACHED */
1635         }
1636
1637         panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1638             "(0x%llx instead of 0x%llx)\n%s\n",
1639             addr, offset, got, expected, mcache_dump_mca(mca));
1640         /* NOTREACHED */
1641 }
1642
1643 __private_extern__ int
1644 assfail(const char *a, const char *f, int l)
1645 {
1646         panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1647         return (0);
1648 }