bsd/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/malloc.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/kernel.h>
  75 #include <sys/sysctl.h>
  76 #include <sys/syslog.h>
  77 #include <sys/protosw.h>
  78 #include <sys/domain.h>
  79 #include <sys/queue.h>
  80 #include <sys/proc.h>
  81
  82 #include <kern/kern_types.h>
  83 #include <kern/simple_lock.h>
  84 #include <kern/queue.h>
  85 #include <kern/sched_prim.h>
  86 #include <kern/cpu_number.h>
  87 #include <kern/zalloc.h>
  88
  89 #include <libkern/OSAtomic.h>
  90 #include <libkern/libkern.h>
  91
  92 #include <IOKit/IOMapper.h>
  93
  94 #include <machine/limits.h>
  95 #include <machine/machine_routines.h>
  96
  97 #if CONFIG_MACF_NET
  98 #include <security/mac_framework.h>
  99 #endif /* MAC_NET */
 100
 101 #include <sys/mcache.h>
 102
 103 /*
 104  * MBUF IMPLEMENTATION NOTES.
 105  *
 106  * There is a total of 5 per-CPU caches:
 107  *
 108  * MC_MBUF:
 109  *      This is a cache of rudimentary objects of MSIZE in size; each
 110  *      object represents an mbuf structure.  This cache preserves only
 111  *      the m_type field of the mbuf during its transactions.
 112  *
 113  * MC_CL:
 114  *      This is a cache of rudimentary objects of MCLBYTES in size; each
 115  *      object represents a mcluster structure.  This cache does not
 116  *      preserve the contents of the objects during its transactions.
 117  *
 118  * MC_BIGCL:
 119  *      This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 120  *      object represents a mbigcluster structure.  This cache does not
 121  *      preserve the contents of the objects during its transaction.
 122  *
 123  * MC_MBUF_CL:
 124  *      This is a cache of mbufs each having a cluster attached to it.
 125  *      It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 126  *      fields of the mbuf related to the external cluster are preserved
 127  *      during transactions.
 128  *
 129  * MC_MBUF_BIGCL:
 130  *      This is a cache of mbufs each having a big cluster attached to it.
 131  *      It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 132  *      fields of the mbuf related to the external cluster are preserved
 133  *      during transactions.
 134  *
 135  * OBJECT ALLOCATION:
 136  *
 137  * Allocation requests are handled first at the per-CPU (mcache) layer
 138  * before falling back to the slab layer.  Performance is optimal when
 139  * the request is satisfied at the CPU layer because global data/lock
 140  * never gets accessed.  When the slab layer is entered for allocation,
 141  * the slab freelist will be checked first for available objects before
 142  * the VM backing store is invoked.  Slab layer operations are serialized
 143  * for all of the caches as the mbuf global lock is held most of the time.
 144  * Allocation paths are different depending on the class of objects:
 145  *
 146  * a. Rudimentary object:
 147  *
 148  *      { m_get_common(), m_clattach(), m_mclget(),
 149  *        m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 150  *        composite object allocation }
 151  *                      |       ^
 152  *                      |       |
 153  *                      |       +-----------------------+
 154  *                      v                               |
 155  *         mcache_alloc/mcache_alloc_ext()      mbuf_slab_audit()
 156  *                      |                               ^
 157  *                      v                               |
 158  *                 [CPU cache] -------> (found?) -------+
 159  *                      |                               |
 160  *                      v                               |
 161  *               mbuf_slab_alloc()                      |
 162  *                      |                               |
 163  *                      v                               |
 164  *      +---------> [freelist] -------> (found?) -------+
 165  *      |               |
 166  *      |               v
 167  *      |           m_clalloc()
 168  *      |               |
 169  *      |               v
 170  *      +---<<---- kmem_mb_alloc()
 171  *
 172  * b. Composite object:
 173  *
 174  *      { m_getpackets_internal(), m_allocpacket_internal() }
 175  *                      |       ^
 176  *                      |       |
 177  *                      |       +------ (done) ---------+
 178  *                      v                               |
 179  *         mcache_alloc/mcache_alloc_ext()      mbuf_cslab_audit()
 180  *                      |                               ^
 181  *                      v                               |
 182  *                 [CPU cache] -------> (found?) -------+
 183  *                      |                               |
 184  *                      v                               |
 185  *               mbuf_cslab_alloc()                     |
 186  *                      |                               |
 187  *                      v                               |
 188  *                  [freelist] -------> (found?) -------+
 189  *                      |                               |
 190  *                      v                               |
 191  *              (rudimentary object)                    |
 192  *         mcache_alloc/mcache_alloc_ext() ------>>-----+
 193  *
 194  * Auditing notes: If auditing is enabled, buffers will be subjected to
 195  * integrity checks by the audit routine.  This is done by verifying their
 196  * contents against DEADBEEF (free) pattern before returning them to caller.
 197  * As part of this step, the routine will also record the transaction and
 198  * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 199  * also restore any constructed data structure fields if necessary.
 200  *
 201  * OBJECT DEALLOCATION:
 202  *
 203  * Freeing an object simply involves placing it into the CPU cache; this
 204  * pollutes the cache to benefit subsequent allocations.  The slab layer
 205  * will only be entered if the object is to be purged out of the cache.
 206  * During normal operations, this happens only when the CPU layer resizes
 207  * its bucket while it's adjusting to the allocation load.  Deallocation
 208  * paths are different depending on the class of objects:
 209  *
 210  * a. Rudimentary object:
 211  *
 212  *      { m_free(), m_freem_list(), composite object deallocation }
 213  *                      |       ^
 214  *                      |       |
 215  *                      |       +------ (done) ---------+
 216  *                      v                               |
 217  *         mcache_free/mcache_free_ext()                |
 218  *                      |                               |
 219  *                      v                               |
 220  *              mbuf_slab_audit()                       |
 221  *                      |                               |
 222  *                      v                               |
 223  *                 [CPU cache] ---> (not purging?) -----+
 224  *                      |                               |
 225  *                      v                               |
 226  *               mbuf_slab_free()                       |
 227  *                      |                               |
 228  *                      v                               |
 229  *                  [freelist] ----------->>------------+
 230  *       (objects never get purged to VM)
 231  *
 232  * b. Composite object:
 233  *
 234  *      { m_free(), m_freem_list() }
 235  *                      |       ^
 236  *                      |       |
 237  *                      |       +------ (done) ---------+
 238  *                      v                               |
 239  *         mcache_free/mcache_free_ext()                |
 240  *                      |                               |
 241  *                      v                               |
 242  *              mbuf_cslab_audit()                      |
 243  *                      |                               |
 244  *                      v                               |
 245  *                 [CPU cache] ---> (not purging?) -----+
 246  *                      |                               |
 247  *                      v                               |
 248  *               mbuf_cslab_free()                      |
 249  *                      |                               |
 250  *                      v                               |
 251  *                  [freelist] ---> (not purging?) -----+
 252  *                      |                               |
 253  *                      v                               |
 254  *              (rudimentary object)                    |
 255  *         mcache_free/mcache_free_ext() ------->>------+
 256  *
 257  * Auditing notes: If auditing is enabled, the audit routine will save
 258  * any constructed data structure fields (if necessary) before filling the
 259  * contents of the buffers with DEADBEEF (free) pattern and recording the
 260  * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 261  * expected to contain the free pattern.
 262  *
 263  * DEBUGGING:
 264  *
 265  * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 266  * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 267  * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 268  * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 269  * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 270  * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 271  *
 272  * Each object is associated with exactly one mcache_audit_t structure that
 273  * contains the information related to its last buffer transaction.  Given
 274  * an address of an object, the audit structure can be retrieved by finding
 275  * the position of the object relevant to the base address of the cluster:
 276  *
 277  *      +------------+                  +=============+
 278  *      | mbuf addr  |                  | mclaudit[i] |
 279  *      +------------+                  +=============+
 280  *            |                         | cl_audit[0] |
 281  *      i = MTOBG(addr)                 +-------------+
 282  *            |                 +-----> | cl_audit[1] | -----> mcache_audit_t
 283  *      b = BGTOM(i)            |       +-------------+
 284  *            |                 |       |     ...     |
 285  *      x = MCLIDX(b, addr)     |       +-------------+
 286  *            |                 |       | cl_audit[7] |
 287  *            +-----------------+       +-------------+
 288  *               (e.g. x == 1)
 289  *
 290  * The mclaudit[] array is allocated at initialization time, but its contents
 291  * get populated when the corresponding cluster is created.  Because a page
 292  * can be turned into NMBPBG number of mbufs, we preserve enough space for the
 293  * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 294  * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 295  * remaining entries unused.  For 16KB cluster, only one entry from the first
 296  * page is allocated and used for the entire object.
 297  */
 298
 299 /* TODO: should be in header file */
 300 /* kernel translater */
 301 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
 302 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
 303 extern vm_map_t mb_map;         /* special map */
 304
 305 /* Global lock */
 306 decl_lck_mtx_data(static, mbuf_mlock_data);
 307 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
 308 static lck_attr_t *mbuf_mlock_attr;
 309 static lck_grp_t *mbuf_mlock_grp;
 310 static lck_grp_attr_t *mbuf_mlock_grp_attr;
 311
 312 /* Back-end (common) layer */
 313 static void *mbuf_worker_run;   /* wait channel for worker thread */
 314 static int mbuf_worker_ready;   /* worker thread is runnable */
 315 static int mbuf_expand_mcl;     /* number of cluster creation requets */
 316 static int mbuf_expand_big;     /* number of big cluster creation requests */
 317 static int mbuf_expand_16k;     /* number of 16KB cluster creation requests */
 318 static int ncpu;                /* number of CPUs */
 319 static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
 320 static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
 321 static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
 322 static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
 323 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
 324 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
 325 static unsigned int mb_normalized; /* number of packets "normalized" */
 326
 327 #define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
 328 #define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */
 329
 330 typedef enum {
 331         MC_MBUF = 0,    /* Regular mbuf */
 332         MC_CL,          /* Cluster */
 333         MC_BIGCL,       /* Large (4KB) cluster */
 334         MC_16KCL,       /* Jumbo (16KB) cluster */
 335         MC_MBUF_CL,     /* mbuf + cluster */
 336         MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
 337         MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
 338 } mbuf_class_t;
 339
 340 #define MBUF_CLASS_MIN          MC_MBUF
 341 #define MBUF_CLASS_MAX          MC_MBUF_16KCL
 342 #define MBUF_CLASS_LAST         MC_16KCL
 343 #define MBUF_CLASS_VALID(c) \
 344         ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
 345 #define MBUF_CLASS_COMPOSITE(c) \
 346         ((int)(c) > MBUF_CLASS_LAST)
 347
 348
 349 /*
 350  * mbuf specific mcache allocation request flags.
 351  */
 352 #define MCR_COMP        MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
 353
 354 /*
 355  * Per-cluster slab structure.
 356  *
 357  * A slab is a cluster control structure that contains one or more object
 358  * chunks; the available chunks are chained in the slab's freelist (sl_head).
 359  * Each time a chunk is taken out of the slab, the slab's reference count
 360  * gets incremented.  When all chunks have been taken out, the empty slab
 361  * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 362  * returned to a slab causes the slab's reference count to be decremented;
 363  * it also causes the slab to be reinserted back to class's slab list, if
 364  * it's not already done.
 365  *
 366  * Compartmentalizing of the object chunks into slabs allows us to easily
 367  * merge one or more slabs together when the adjacent slabs are idle, as
 368  * well as to convert or move a slab from one class to another; e.g. the
 369  * mbuf cluster slab can be converted to a regular cluster slab when all
 370  * mbufs in the slab have been freed.
 371  *
 372  * A slab may also span across multiple clusters for chunks larger than
 373  * a cluster's size.  In this case, only the slab of the first cluster is
 374  * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 375  * that they are part of the larger slab.
 376  *
 377  * Each slab controls a page of memory.
 378  */
 379 typedef struct mcl_slab {
 380         struct mcl_slab *sl_next;       /* neighboring slab */
 381         u_int8_t        sl_class;       /* controlling mbuf class */
 382         int8_t          sl_refcnt;      /* outstanding allocations */
 383         int8_t          sl_chunks;      /* chunks (bufs) in this slab */
 384         u_int16_t       sl_flags;       /* slab flags (see below) */
 385         u_int16_t       sl_len;         /* slab length */
 386         void            *sl_base;       /* base of allocated memory */
 387         void            *sl_head;       /* first free buffer */
 388         TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
 389 } mcl_slab_t;
 390
 391 #define SLF_MAPPED      0x0001          /* backed by a mapped page */
 392 #define SLF_PARTIAL     0x0002          /* part of another slab */
 393 #define SLF_DETACHED    0x0004          /* not in slab freelist */
 394
 395 /*
 396  * The array of slabs are broken into groups of arrays per 1MB of kernel
 397  * memory to reduce the footprint.  Each group is allocated on demand
 398  * whenever a new piece of memory mapped in from the VM crosses the 1MB
 399  * boundary.
 400  */
 401 #define NSLABSPMB       ((1 << MBSHIFT) >> PGSHIFT)     /* 256 slabs/grp */
 402
 403 typedef struct mcl_slabg {
 404         mcl_slab_t      slg_slab[NSLABSPMB];    /* group of slabs */
 405 } mcl_slabg_t;
 406
 407 /*
 408  * Number of slabs needed to control a 16KB cluster object.
 409  */
 410 #define NSLABSP16KB     (M16KCLBYTES >> PGSHIFT)
 411
 412 /*
 413  * Per-cluster audit structure.
 414  */
 415 typedef struct {
 416         mcache_audit_t  *cl_audit[NMBPBG];      /* array of audits */
 417 } mcl_audit_t;
 418
 419 /*
 420  * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
 421  * and m_ext structures.  If auditing is enabled, we allocate a shadow
 422  * mbuf structure of this size inside each audit structure, and the
 423  * contents of the real mbuf gets copied into it when the mbuf is freed.
 424  * This allows us to pattern-fill the mbuf for integrity check, and to
 425  * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
 426  * Note that we don't save the contents of clusters when they are freed;
 427  * we simply pattern-fill them.
 428  */
 429 #define AUDIT_CONTENTS_SIZE     ((MSIZE - MHLEN) + sizeof (_m_ext_t))
 430
 431 /*
 432  * mbuf specific mcache audit flags
 433  */
 434 #define MB_INUSE        0x01    /* object has not been returned to slab */
 435 #define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
 436 #define MB_SCVALID      0x04    /* object has valid saved contents */
 437
 438 /*
 439  * Each of the following two arrays hold up to nmbclusters elements.
 440  */
 441 static mcl_audit_t *mclaudit;   /* array of cluster audit information */
 442 static unsigned int maxclaudit; /* max # of entries in audit table */
 443 static mcl_slabg_t **slabstbl;  /* cluster slabs table */
 444 static unsigned int maxslabgrp; /* max # of entries in slabs table */
 445 static unsigned int slabgrp;    /* # of entries in slabs table */
 446
 447 /* Globals */
 448 int nclusters;                  /* # of clusters for non-jumbo (legacy) sizes */
 449 int njcl;                       /* # of clusters for jumbo sizes */
 450 int njclbytes;                  /* size of a jumbo cluster */
 451 union mbigcluster *mbutl;       /* first mapped cluster address */
 452 union mbigcluster *embutl;      /* ending virtual address of mclusters */
 453 int _max_linkhdr;               /* largest link-level header */
 454 int _max_protohdr;              /* largest protocol header */
 455 int max_hdr;                    /* largest link+protocol header */
 456 int max_datalen;                /* MHLEN - max_hdr */
 457
 458 static boolean_t mclverify;     /* debug: pattern-checking */
 459 static boolean_t mcltrace;      /* debug: stack tracing */
 460 static boolean_t mclfindleak;   /* debug: leak detection */
 461 static boolean_t mclexpleak;    /* debug: expose leak info to user space */
 462
 463 /* mbuf leak detection variables */
 464 static struct mleak_table mleak_table;
 465 static mleak_stat_t *mleak_stat;
 466
 467 #define MLEAK_STAT_SIZE(n) \
 468         ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
 469
 470 struct mallocation {
 471         mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
 472         u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
 473         u_int32_t count;        /* How many objects were requested */
 474         u_int64_t hitcount;     /* for determining hash effectiveness */
 475 };
 476
 477 struct mtrace {
 478         u_int64_t       collisions;
 479         u_int64_t       hitcount;
 480         u_int64_t       allocs;
 481         u_int64_t       depth;
 482         uintptr_t       addr[MLEAK_STACK_DEPTH];
 483 };
 484
 485 /* Size must be a power of two for the zhash to be able to just mask off bits */
 486 #define MLEAK_ALLOCATION_MAP_NUM        512
 487 #define MLEAK_TRACE_MAP_NUM             256
 488
 489 /*
 490  * Sample factor for how often to record a trace.  This is overwritable
 491  * by the boot-arg mleak_sample_factor.
 492  */
 493 #define MLEAK_SAMPLE_FACTOR             500
 494
 495 /*
 496  * Number of top leakers recorded.
 497  */
 498 #define MLEAK_NUM_TRACES                5
 499
 500 #define MB_LEAK_SPACING_64 "                    "
 501 #define MB_LEAK_SPACING_32 "            "
 502
 503
 504 #define MB_LEAK_HDR_32  "\n\
 505     trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
 506     ----------  ----------  ----------  ----------  ---------- \n\
 507 "
 508
 509 #define MB_LEAK_HDR_64  "\n\
 510     trace [1]           trace [2]           trace [3]       \
 511         trace [4]           trace [5]      \n\
 512     ------------------  ------------------  ------------------  \
 513     ------------------  ------------------ \n\
 514 "
 515
 516 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
 517 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
 518
 519 /* Hashmaps of allocations and their corresponding traces */
 520 static struct mallocation *mleak_allocations;
 521 static struct mtrace *mleak_traces;
 522 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
 523
 524 /* Lock to protect mleak tables from concurrent modification */
 525 decl_lck_mtx_data(static, mleak_lock_data);
 526 static lck_mtx_t *mleak_lock = &mleak_lock_data;
 527 static lck_attr_t *mleak_lock_attr;
 528 static lck_grp_t *mleak_lock_grp;
 529 static lck_grp_attr_t *mleak_lock_grp_attr;
 530
 531 extern u_int32_t high_sb_max;
 532
 533 /* TODO: should be in header file */
 534 int do_reclaim = 0;
 535
 536 /* The minimum number of objects that are allocated, to start. */
 537 #define MINCL           32
 538 #define MINBIGCL        (MINCL >> 1)
 539 #define MIN16KCL        (MINCL >> 2)
 540
 541 /* Low watermarks (only map in pages once free counts go below) */
 542 #define MBIGCL_LOWAT    MINBIGCL
 543 #define M16KCL_LOWAT    MIN16KCL
 544
 545 typedef struct {
 546         mbuf_class_t    mtbl_class;     /* class type */
 547         mcache_t        *mtbl_cache;    /* mcache for this buffer class */
 548         TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
 549         mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist */
 550         mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
 551         u_int32_t       mtbl_maxsize;   /* maximum buffer size */
 552         int             mtbl_minlimit;  /* minimum allowed */
 553         int             mtbl_maxlimit;  /* maximum allowed */
 554         u_int32_t       mtbl_wantpurge; /* purge during next reclaim */
 555 } mbuf_table_t;
 556
 557 #define m_class(c)      mbuf_table[c].mtbl_class
 558 #define m_cache(c)      mbuf_table[c].mtbl_cache
 559 #define m_slablist(c)   mbuf_table[c].mtbl_slablist
 560 #define m_cobjlist(c)   mbuf_table[c].mtbl_cobjlist
 561 #define m_maxsize(c)    mbuf_table[c].mtbl_maxsize
 562 #define m_minlimit(c)   mbuf_table[c].mtbl_minlimit
 563 #define m_maxlimit(c)   mbuf_table[c].mtbl_maxlimit
 564 #define m_wantpurge(c)  mbuf_table[c].mtbl_wantpurge
 565 #define m_cname(c)      mbuf_table[c].mtbl_stats->mbcl_cname
 566 #define m_size(c)       mbuf_table[c].mtbl_stats->mbcl_size
 567 #define m_total(c)      mbuf_table[c].mtbl_stats->mbcl_total
 568 #define m_active(c)     mbuf_table[c].mtbl_stats->mbcl_active
 569 #define m_infree(c)     mbuf_table[c].mtbl_stats->mbcl_infree
 570 #define m_slab_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_slab_cnt
 571 #define m_alloc_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
 572 #define m_free_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_free_cnt
 573 #define m_notified(c)   mbuf_table[c].mtbl_stats->mbcl_notified
 574 #define m_purge_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_purge_cnt
 575 #define m_fail_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_fail_cnt
 576 #define m_ctotal(c)     mbuf_table[c].mtbl_stats->mbcl_ctotal
 577
 578 static mbuf_table_t mbuf_table[] = {
 579         /*
 580          * The caches for mbufs, regular clusters and big clusters.
 581          */
 582         { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
 583             NULL, NULL, 0, 0, 0, 0 },
 584         { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
 585             NULL, NULL, 0, 0, 0, 0 },
 586         { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
 587             NULL, NULL, 0, 0, 0, 0 },
 588         { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
 589             NULL, NULL, 0, 0, 0, 0 },
 590         /*
 591          * The following are special caches; they serve as intermediate
 592          * caches backed by the above rudimentary caches.  Each object
 593          * in the cache is an mbuf with a cluster attached to it.  Unlike
 594          * the above caches, these intermediate caches do not directly
 595          * deal with the slab structures; instead, the constructed
 596          * cached elements are simply stored in the freelists.
 597          */
 598         { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
 599         { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
 600         { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
 601 };
 602
 603 #define NELEM(a)        (sizeof (a) / sizeof ((a)[0]))
 604
 605 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
 606 static int mb_waiters;                  /* number of waiters */
 607
 608 #define MB_WDT_MAXTIME  10              /* # of secs before watchdog panic */
 609 static struct timeval mb_wdtstart;      /* watchdog start timestamp */
 610 static char *mbuf_dump_buf;
 611
 612 #define MBUF_DUMP_BUF_SIZE      2048
 613
 614 /*
 615  * mbuf watchdog is enabled by default on embedded platforms.  It is
 616  * also toggeable via the kern.ipc.mb_watchdog sysctl.
 617  */
 618 #if CONFIG_EMBEDDED
 619 static unsigned int mb_watchdog = 1;
 620 #else
 621 static unsigned int mb_watchdog = 0;
 622 #endif /* CONFIG_EMBEDDED */
 623
 624 /* The following are used to serialize m_clalloc() */
 625 static boolean_t mb_clalloc_busy;
 626 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
 627 static int mb_clalloc_waiters;
 628
 629 static void mbuf_mtypes_sync(boolean_t);
 630 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
 631 static void mbuf_stat_sync(void);
 632 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
 633 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
 634 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
 635 static char *mbuf_dump(void);
 636 static void mbuf_table_init(void);
 637 static inline void m_incref(struct mbuf *);
 638 static inline u_int32_t m_decref(struct mbuf *);
 639 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
 640 static void mbuf_worker_thread_init(void);
 641 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
 642 static void slab_free(mbuf_class_t, mcache_obj_t *);
 643 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
 644     unsigned int, int);
 645 static void mbuf_slab_free(void *, mcache_obj_t *, int);
 646 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
 647 static void mbuf_slab_notify(void *, u_int32_t);
 648 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
 649     unsigned int);
 650 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
 651 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
 652     unsigned int, int);
 653 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
 654 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
 655 static int freelist_populate(mbuf_class_t, unsigned int, int);
 656 static void freelist_init(mbuf_class_t);
 657 static boolean_t mbuf_cached_above(mbuf_class_t, int);
 658 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
 659 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
 660 static int m_howmany(int, size_t);
 661 static void mbuf_worker_thread(void);
 662 static void mbuf_watchdog(void);
 663 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
 664
 665 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
 666     size_t, unsigned int);
 667 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
 668 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
 669 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
 670     boolean_t);
 671 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
 672 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
 673 static void mcl_audit_mcheck_panic(struct mbuf *);
 674 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
 675
 676 static void mleak_activate(void);
 677 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
 678 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
 679 static void mleak_free(mcache_obj_t *);
 680 static void mleak_sort_traces(void);
 681 static void mleak_update_stats(void);
 682
 683 static mcl_slab_t *slab_get(void *);
 684 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
 685     void *, void *, unsigned int, int, int);
 686 static void slab_insert(mcl_slab_t *, mbuf_class_t);
 687 static void slab_remove(mcl_slab_t *, mbuf_class_t);
 688 static boolean_t slab_inrange(mcl_slab_t *, void *);
 689 static void slab_nextptr_panic(mcl_slab_t *, void *);
 690 static void slab_detach(mcl_slab_t *);
 691 static boolean_t slab_is_detached(mcl_slab_t *);
 692
 693 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
 694 static struct mbuf *m_split0(struct mbuf *, int, int, int);
 695
 696 /* flags for m_copyback0 */
 697 #define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
 698 #define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
 699 #define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
 700 #define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
 701
 702 /*
 703  * This flag is set for all mbufs that come out of and into the composite
 704  * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 705  * are marked with such a flag have clusters attached to them, and will be
 706  * treated differently when they are freed; instead of being placed back
 707  * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 708  * are placed back into the appropriate composite cache's freelist, and the
 709  * actual freeing is deferred until the composite objects are purged.  At
 710  * such a time, this flag will be cleared from the mbufs and the objects
 711  * will be freed into their own separate freelists.
 712  */
 713 #define EXTF_COMPOSITE  0x1
 714
 715 /*
 716  * This flag indicates that the external cluster is read-only, i.e. it is
 717  * or was referred to by more than one mbufs.  Once set, this flag is never
 718  * cleared.
 719  */
 720 #define EXTF_READONLY   0x2
 721 #define EXTF_MASK       (EXTF_COMPOSITE | EXTF_READONLY)
 722
 723 #define MEXT_RFA(m)             ((m)->m_ext.ext_refflags)
 724 #define MEXT_REF(m)             (MEXT_RFA(m)->refcnt)
 725 #define MEXT_FLAGS(m)           (MEXT_RFA(m)->flags)
 726 #define MBUF_IS_COMPOSITE(m)    \
 727         (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
 728
 729 /*
 730  * Macros used to verify the integrity of the mbuf.
 731  */
 732 #define _MCHECK(m) {                                                    \
 733         if ((m)->m_type != MT_FREE) {                                   \
 734                 if (mclaudit == NULL)                                   \
 735                         panic("MCHECK: m_type=%d m=%p",                 \
 736                             (u_int16_t)(m)->m_type, m);                 \
 737                 else                                                    \
 738                         mcl_audit_mcheck_panic(m);                      \
 739         }                                                               \
 740 }
 741
 742 #define MBUF_IN_MAP(addr)                                               \
 743         ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
 744
 745 #define MRANGE(addr) {                                                  \
 746         if (!MBUF_IN_MAP(addr))                                         \
 747                 panic("MRANGE: address out of range 0x%p", addr);       \
 748 }
 749
 750 /*
 751  * Macro version of mtod.
 752  */
 753 #define MTOD(m, t)      ((t)((m)->m_data))
 754
 755 /*
 756  * Macros to obtain (4KB) cluster index and base cluster address.
 757  */
 758
 759 #define MTOBG(x)        (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
 760 #define BGTOM(x)        ((union mbigcluster *)(mbutl + (x)))
 761
 762 /*
 763  * Macro to find the mbuf index relative to a base.
 764  */
 765 #define MCLIDX(c, m)    (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
 766
 767 /*
 768  * Same thing for 2KB cluster index.
 769  */
 770 #define CLBGIDX(c, m)   (((char *)(m) - (char *)(c)) >> MCLSHIFT)
 771
 772 /*
 773  * Macros used during mbuf and cluster initialization.
 774  */
 775 #define MBUF_INIT(m, pkthdr, type) {                                    \
 776         _MCHECK(m);                                                     \
 777         (m)->m_next = (m)->m_nextpkt = NULL;                            \
 778         (m)->m_len = 0;                                                 \
 779         (m)->m_type = type;                                             \
 780         if ((pkthdr) == 0) {                                            \
 781                 (m)->m_data = (m)->m_dat;                               \
 782                 (m)->m_flags = 0;                                       \
 783         } else {                                                        \
 784                 (m)->m_data = (m)->m_pktdat;                            \
 785                 (m)->m_flags = M_PKTHDR;                                \
 786                 (m)->m_pkthdr.rcvif = NULL;                             \
 787                 (m)->m_pkthdr.len = 0;                                  \
 788                 (m)->m_pkthdr.header = NULL;                            \
 789                 (m)->m_pkthdr.csum_flags = 0;                           \
 790                 (m)->m_pkthdr.csum_data = 0;                            \
 791                 (m)->m_pkthdr.tso_segsz = 0;                            \
 792                 (m)->m_pkthdr.vlan_tag = 0;                             \
 793                 (m)->m_pkthdr.socket_id = 0;                            \
 794                 (m)->m_pkthdr.vt_nrecs = 0;                             \
 795                 (m)->m_pkthdr.aux_flags = 0;                            \
 796                 m_tag_init(m);                                          \
 797                 m_service_class_init(m);                                \
 798         }                                                               \
 799 }
 800
 801 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {            \
 802         (m)->m_data = (m)->m_ext.ext_buf = (buf);                       \
 803         (m)->m_flags |= M_EXT;                                          \
 804         (m)->m_ext.ext_size = (size);                                   \
 805         (m)->m_ext.ext_free = (free);                                   \
 806         (m)->m_ext.ext_arg = (arg);                                     \
 807         (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =    \
 808             &(m)->m_ext.ext_refs;                                       \
 809         MEXT_RFA(m) = (rfa);                                            \
 810         MEXT_REF(m) = (ref);                                            \
 811         MEXT_FLAGS(m) = (flag);                                         \
 812 }
 813
 814 #define MBUF_CL_INIT(m, buf, rfa, ref, flag)    \
 815         MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
 816
 817 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
 818         MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
 819
 820 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
 821         MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
 822
 823 /*
 824  * Macro to convert BSD malloc sleep flag to mcache's
 825  */
 826 #define MSLEEPF(f)      ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
 827
 828 /*
 829  * The structure that holds all mbuf class statistics exportable via sysctl.
 830  * Similar to mbstat structure, the mb_stat structure is protected by the
 831  * global mbuf lock.  It contains additional information about the classes
 832  * that allows for a more accurate view of the state of the allocator.
 833  */
 834 struct mb_stat *mb_stat;
 835 struct omb_stat *omb_stat;      /* For backwards compatibility */
 836
 837 #define MB_STAT_SIZE(n) \
 838         ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
 839 #define OMB_STAT_SIZE(n) \
 840         ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
 841
 842 /*
 843  * The legacy structure holding all of the mbuf allocation statistics.
 844  * The actual statistics used by the kernel are stored in the mbuf_table
 845  * instead, and are updated atomically while the global mbuf lock is held.
 846  * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 847  * Unlike before, the kernel no longer relies on the contents of mbstat for
 848  * its operations (e.g. cluster expansion) because the structure is exposed
 849  * to outside and could possibly be modified, therefore making it unsafe.
 850  * With the exception of the mbstat.m_mtypes array (see below), all of the
 851  * statistics are updated as they change.
 852  */
 853 struct mbstat mbstat;
 854
 855 #define MBSTAT_MTYPES_MAX \
 856         (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
 857
 858 /*
 859  * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 860  * atomically and stored in a per-CPU structure which is lock-free; this is
 861  * done in order to avoid writing to the global mbstat data structure which
 862  * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 863  * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 864  * array and returned to the application.  Any updates for types greater or
 865  * equal than MT_MAX would be done atomically to the mbstat; this slows down
 866  * performance but is okay since the kernel uses only up to MT_MAX-1 while
 867  * anything beyond that (up to type 255) is considered a corner case.
 868  */
 869 typedef struct {
 870         unsigned int    cpu_mtypes[MT_MAX];
 871 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
 872
 873 typedef struct {
 874         mtypes_cpu_t    mbs_cpu[1];
 875 } mbuf_mtypes_t;
 876
 877 static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
 878
 879 #define MBUF_MTYPES_SIZE(n) \
 880         ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
 881
 882 #define MTYPES_CPU(p) \
 883         ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
 884
 885 #define mtype_stat_add(type, n) {                                       \
 886         if ((unsigned)(type) < MT_MAX) {                                \
 887                 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);            \
 888                 atomic_add_32(&mbs->cpu_mtypes[type], n);               \
 889         } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
 890                 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
 891         }                                                               \
 892 }
 893
 894 #define mtype_stat_sub(t, n)    mtype_stat_add(t, -(n))
 895 #define mtype_stat_inc(t)       mtype_stat_add(t, 1)
 896 #define mtype_stat_dec(t)       mtype_stat_sub(t, 1)
 897
 898 static void
 899 mbuf_mtypes_sync(boolean_t locked)
 900 {
 901         int m, n;
 902         mtypes_cpu_t mtc;
 903
 904         if (locked)
 905                 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
 906
 907         bzero(&mtc, sizeof (mtc));
 908         for (m = 0; m < ncpu; m++) {
 909                 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
 910                 mtypes_cpu_t temp;
 911
 912                 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
 913                     sizeof (temp.cpu_mtypes));
 914
 915                 for (n = 0; n < MT_MAX; n++)
 916                         mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
 917         }
 918         if (!locked)
 919                 lck_mtx_lock(mbuf_mlock);
 920         for (n = 0; n < MT_MAX; n++)
 921                 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
 922         if (!locked)
 923                 lck_mtx_unlock(mbuf_mlock);
 924 }
 925
 926 static int
 927 mbstat_sysctl SYSCTL_HANDLER_ARGS
 928 {
 929 #pragma unused(oidp, arg1, arg2)
 930         mbuf_mtypes_sync(FALSE);
 931
 932         return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
 933 }
 934
 935 static void
 936 mbuf_stat_sync(void)
 937 {
 938         mb_class_stat_t *sp;
 939         mcache_cpu_t *ccp;
 940         mcache_t *cp;
 941         int k, m, bktsize;
 942
 943         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
 944
 945         for (k = 0; k < NELEM(mbuf_table); k++) {
 946                 cp = m_cache(k);
 947                 ccp = &cp->mc_cpu[0];
 948                 bktsize = ccp->cc_bktsize;
 949                 sp = mbuf_table[k].mtbl_stats;
 950
 951                 if (cp->mc_flags & MCF_NOCPUCACHE)
 952                         sp->mbcl_mc_state = MCS_DISABLED;
 953                 else if (cp->mc_purge_cnt > 0)
 954                         sp->mbcl_mc_state = MCS_PURGING;
 955                 else if (bktsize == 0)
 956                         sp->mbcl_mc_state = MCS_OFFLINE;
 957                 else
 958                         sp->mbcl_mc_state = MCS_ONLINE;
 959
 960                 sp->mbcl_mc_cached = 0;
 961                 for (m = 0; m < ncpu; m++) {
 962                         ccp = &cp->mc_cpu[m];
 963                         if (ccp->cc_objs > 0)
 964                                 sp->mbcl_mc_cached += ccp->cc_objs;
 965                         if (ccp->cc_pobjs > 0)
 966                                 sp->mbcl_mc_cached += ccp->cc_pobjs;
 967                 }
 968                 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
 969                 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
 970                     sp->mbcl_infree;
 971
 972                 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
 973                 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
 974                 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
 975
 976                 /* Calculate total count specific to each class */
 977                 sp->mbcl_ctotal = sp->mbcl_total;
 978                 switch (m_class(k)) {
 979                 case MC_MBUF:
 980                         /* Deduct mbufs used in composite caches */
 981                         sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
 982                             m_total(MC_MBUF_BIGCL));
 983                         break;
 984
 985                 case MC_CL:
 986                         /* Deduct clusters used in composite cache */
 987                         sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
 988                         break;
 989
 990                 case MC_BIGCL:
 991                         /* Deduct clusters used in composite cache */
 992                         sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
 993                         break;
 994
 995                 case MC_16KCL:
 996                         /* Deduct clusters used in composite cache */
 997                         sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
 998                         break;
 999
1000                 default:
1001                         break;
1002                 }
1003         }
1004 }
1005
1006 static int
1007 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1008 {
1009 #pragma unused(oidp, arg1, arg2)
1010         void *statp;
1011         int k, statsz, proc64 = proc_is64bit(req->p);
1012
1013         lck_mtx_lock(mbuf_mlock);
1014         mbuf_stat_sync();
1015
1016         if (!proc64) {
1017                 struct omb_class_stat *oc;
1018                 struct mb_class_stat *c;
1019
1020                 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1021                 oc = &omb_stat->mbs_class[0];
1022                 c = &mb_stat->mbs_class[0];
1023                 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1024                         (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1025                             "%s", c->mbcl_cname);
1026                         oc->mbcl_size = c->mbcl_size;
1027                         oc->mbcl_total = c->mbcl_total;
1028                         oc->mbcl_active = c->mbcl_active;
1029                         oc->mbcl_infree = c->mbcl_infree;
1030                         oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1031                         oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1032                         oc->mbcl_free_cnt = c->mbcl_free_cnt;
1033                         oc->mbcl_notified = c->mbcl_notified;
1034                         oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1035                         oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1036                         oc->mbcl_ctotal = c->mbcl_ctotal;
1037                         oc->mbcl_mc_state = c->mbcl_mc_state;
1038                         oc->mbcl_mc_cached = c->mbcl_mc_cached;
1039                         oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1040                         oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1041                         oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1042                 }
1043                 statp = omb_stat;
1044                 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1045         } else {
1046                 statp = mb_stat;
1047                 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1048         }
1049
1050         lck_mtx_unlock(mbuf_mlock);
1051
1052         return (SYSCTL_OUT(req, statp, statsz));
1053 }
1054
1055 static int
1056 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1057 {
1058 #pragma unused(oidp, arg1, arg2)
1059         int i;
1060
1061         /* Ensure leak tracing turned on */
1062         if (!mclfindleak || !mclexpleak)
1063                 return (ENXIO);
1064
1065         lck_mtx_lock(mleak_lock);
1066         mleak_update_stats();
1067         i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1068         lck_mtx_unlock(mleak_lock);
1069
1070         return (i);
1071 }
1072
1073 static int
1074 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1075 {
1076 #pragma unused(oidp, arg1, arg2)
1077         int i = 0;
1078
1079         /* Ensure leak tracing turned on */
1080         if (!mclfindleak || !mclexpleak)
1081                 return (ENXIO);
1082
1083         lck_mtx_lock(mleak_lock);
1084         i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1085         lck_mtx_unlock(mleak_lock);
1086
1087         return (i);
1088 }
1089
1090 static inline void
1091 m_incref(struct mbuf *m)
1092 {
1093         UInt32 old, new;
1094         volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1095
1096         do {
1097                 old = *addr;
1098                 new = old + 1;
1099                 ASSERT(new != 0);
1100         } while (!OSCompareAndSwap(old, new, addr));
1101
1102         /*
1103          * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1104          * we don't clear the flag when the refcount goes back to 1
1105          * to simplify code calling m_mclhasreference().
1106          */
1107         if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1108                 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1109 }
1110
1111 static inline u_int32_t
1112 m_decref(struct mbuf *m)
1113 {
1114         UInt32 old, new;
1115         volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1116
1117         do {
1118                 old = *addr;
1119                 new = old - 1;
1120                 ASSERT(old != 0);
1121         } while (!OSCompareAndSwap(old, new, addr));
1122
1123         return (new);
1124 }
1125
1126 static void
1127 mbuf_table_init(void)
1128 {
1129         unsigned int b, c, s;
1130         int m;
1131
1132         MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1133             M_TEMP, M_WAITOK | M_ZERO);
1134         VERIFY(omb_stat != NULL);
1135
1136         MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1137             M_TEMP, M_WAITOK | M_ZERO);
1138         VERIFY(mb_stat != NULL);
1139
1140         mb_stat->mbs_cnt = NELEM(mbuf_table);
1141         for (m = 0; m < NELEM(mbuf_table); m++)
1142                 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1143
1144 #if CONFIG_MBUF_JUMBO
1145         /*
1146          * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1147          * this only on platforms where jumbo cluster pool is enabled.
1148          */
1149         njcl = nmbclusters / 3;
1150         njclbytes = M16KCLBYTES;
1151 #endif /* CONFIG_MBUF_JUMBO */
1152
1153         /*
1154          * nclusters holds both the 2KB and 4KB pools, so ensure it's
1155          * a multiple of 4KB clusters.
1156          */
1157         nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1158         if (njcl > 0) {
1159                 /*
1160                  * Each jumbo cluster takes 8 2KB clusters, so make
1161                  * sure that the pool size is evenly divisible by 8;
1162                  * njcl is in 2KB unit, hence treated as such.
1163                  */
1164                 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1165
1166                 /* Update nclusters with rounded down value of njcl */
1167                 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1168         }
1169
1170         /*
1171          * njcl is valid only on platforms with 16KB jumbo clusters, where
1172          * it is configured to 1/3 of the pool size.  On these platforms,
1173          * the remaining is used for 2KB and 4KB clusters.  On platforms
1174          * without 16KB jumbo clusters, the entire pool is used for both
1175          * 2KB and 4KB clusters.  A 4KB cluster can either be splitted into
1176          * 16 mbufs, or into 2 2KB clusters.
1177          *
1178          *  +---+---+------------ ... -----------+------- ... -------+
1179          *  | c | b |              s             |        njcl       |
1180          *  +---+---+------------ ... -----------+------- ... -------+
1181          *
1182          * 1/32th of the shared region is reserved for pure 2KB and 4KB
1183          * clusters (1/64th each.)
1184          */
1185         c = P2ROUNDDOWN((nclusters >> 6), 2);           /* in 2KB unit */
1186         b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1187         s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */
1188
1189         /*
1190          * 1/64th (c) is reserved for 2KB clusters.
1191          */
1192         m_minlimit(MC_CL) = c;
1193         m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
1194         m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1195         (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1196
1197         /*
1198          * Another 1/64th (b) of the map is reserved for 4KB clusters.
1199          * It cannot be turned into 2KB clusters or mbufs.
1200          */
1201         m_minlimit(MC_BIGCL) = b;
1202         m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
1203         m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1204         (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1205
1206         /*
1207          * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1208          */
1209         m_minlimit(MC_MBUF) = 0;
1210         m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
1211         m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1212         (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1213
1214         /*
1215          * Set limits for the composite classes.
1216          */
1217         m_minlimit(MC_MBUF_CL) = 0;
1218         m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1219         m_maxsize(MC_MBUF_CL) = MCLBYTES;
1220         m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1221         (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1222
1223         m_minlimit(MC_MBUF_BIGCL) = 0;
1224         m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1225         m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1226         m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1227         (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1228
1229         /*
1230          * And for jumbo classes.
1231          */
1232         m_minlimit(MC_16KCL) = 0;
1233         m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
1234         m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1235         (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1236
1237         m_minlimit(MC_MBUF_16KCL) = 0;
1238         m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1239         m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1240         m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1241         (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1242
1243         /*
1244          * Initialize the legacy mbstat structure.
1245          */
1246         bzero(&mbstat, sizeof (mbstat));
1247         mbstat.m_msize = m_maxsize(MC_MBUF);
1248         mbstat.m_mclbytes = m_maxsize(MC_CL);
1249         mbstat.m_minclsize = MINCLSIZE;
1250         mbstat.m_mlen = MLEN;
1251         mbstat.m_mhlen = MHLEN;
1252         mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1253 }
1254
1255 #if defined(__LP64__)
1256 typedef struct ncl_tbl {
1257         uint64_t nt_maxmem;     /* memory (sane) size */
1258         uint32_t nt_mbpool;     /* mbuf pool size */
1259 } ncl_tbl_t;
1260
1261 /* Non-server */
1262 static ncl_tbl_t ncl_table[] = {
1263         { (1ULL << GBSHIFT)       /*  1 GB */,  (64 << MBSHIFT)  /*  64 MB */ },
1264         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1265         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1266         { 0, 0 }
1267 };
1268
1269 /* Server */
1270 static ncl_tbl_t ncl_table_srv[] = {
1271         { (1ULL << GBSHIFT)       /*  1 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1272         { (1ULL << (GBSHIFT + 2)) /*  4 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1273         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (160 << MBSHIFT) /* 160 MB */ },
1274         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (192 << MBSHIFT) /* 192 MB */ },
1275         { (1ULL << (GBSHIFT + 5)) /* 32 GB */,  (256 << MBSHIFT) /* 256 MB */ },
1276         { (1ULL << (GBSHIFT + 6)) /* 64 GB */,  (384 << MBSHIFT) /* 384 MB */ },
1277         { 0, 0 }
1278 };
1279 #endif /* __LP64__ */
1280
1281 __private_extern__ unsigned int
1282 mbuf_default_ncl(int server, uint64_t mem)
1283 {
1284 #if !defined(__LP64__)
1285 #pragma unused(server)
1286         unsigned int n;
1287         /*
1288          * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1289          */
1290         if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1291                 n = 32768;
1292 #else
1293         unsigned int n, i;
1294         ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1295         /*
1296          * 64-bit kernel (mbuf pool size based on table).
1297          */
1298         n = tbl[0].nt_mbpool;
1299         for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1300                 if (mem < tbl[i].nt_maxmem)
1301                         break;
1302                 n = tbl[i].nt_mbpool;
1303         }
1304         n >>= MCLSHIFT;
1305 #endif /* !__LP64__ */
1306         return (n);
1307 }
1308
1309 __private_extern__ void
1310 mbinit(void)
1311 {
1312         unsigned int m;
1313         unsigned int initmcl = 0;
1314         void *buf;
1315         thread_t thread = THREAD_NULL;
1316
1317         /*
1318          * These MBUF_ values must be equal to their private counterparts.
1319          */
1320         _CASSERT(MBUF_EXT == M_EXT);
1321         _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1322         _CASSERT(MBUF_EOR == M_EOR);
1323         _CASSERT(MBUF_LOOP == M_LOOP);
1324         _CASSERT(MBUF_BCAST == M_BCAST);
1325         _CASSERT(MBUF_MCAST == M_MCAST);
1326         _CASSERT(MBUF_FRAG == M_FRAG);
1327         _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1328         _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1329         _CASSERT(MBUF_PROMISC == M_PROMISC);
1330         _CASSERT(MBUF_HASFCS == M_HASFCS);
1331
1332         _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1333         _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1334         _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1335         _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1336         _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1337         _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1338         _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1339         _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1340         _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1341         _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1342         _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1343         _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1344         _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1345         _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1346         _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1347
1348         _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1349         _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1350         _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_TCP_SUM16);
1351         _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1352         _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1353         _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1354         _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1355         _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1356         _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1357         _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1358         _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1359         _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1360         _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1361
1362         _CASSERT(MBUF_WAITOK == M_WAIT);
1363         _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1364         _CASSERT(MBUF_COPYALL == M_COPYALL);
1365
1366         _CASSERT(MBUF_PKTAUXF_INET_RESOLVE_RTR == MAUXF_INET_RESOLVE_RTR);
1367         _CASSERT(MBUF_PKTAUXF_INET6_RESOLVE_RTR == MAUXF_INET6_RESOLVE_RTR);
1368
1369         _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1370         _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1371         _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1372         _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1373         _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1374         _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1375         _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1376         _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1377         _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1378         _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1379
1380         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1381         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1382         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1383         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1384
1385         if (nmbclusters == 0)
1386                 nmbclusters = NMBCLUSTERS;
1387
1388         /* This should be a sane (at least even) value by now */
1389         VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1390
1391         /* Setup the mbuf table */
1392         mbuf_table_init();
1393
1394         /* Global lock for common layer */
1395         mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1396         mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1397         mbuf_mlock_attr = lck_attr_alloc_init();
1398         lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1399
1400         /*
1401          * Allocate cluster slabs table:
1402          *
1403          *      maxslabgrp = (N * 2048) / (1024 * 1024)
1404          *
1405          * Where N is nmbclusters rounded up to the nearest 512.  This yields
1406          * mcl_slab_g_t units, each one representing a MB of memory.
1407          */
1408         maxslabgrp =
1409             (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1410         MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1411             M_TEMP, M_WAITOK | M_ZERO);
1412         VERIFY(slabstbl != NULL);
1413
1414         /*
1415          * Allocate audit structures, if needed:
1416          *
1417          *      maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1418          *
1419          * This yields mcl_audit_t units, each one representing a page.
1420          */
1421         PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1422         mbuf_debug |= mcache_getflags();
1423         if (mbuf_debug & MCF_DEBUG) {
1424                 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1425                 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1426                     M_TEMP, M_WAITOK | M_ZERO);
1427                 VERIFY(mclaudit != NULL);
1428
1429                 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1430                     AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1431                 VERIFY(mcl_audit_con_cache != NULL);
1432         }
1433         mclverify = (mbuf_debug & MCF_VERIFY);
1434         mcltrace = (mbuf_debug & MCF_TRACE);
1435         mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1436         mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1437
1438         /* Enable mbuf leak logging, with a lock to protect the tables */
1439
1440         mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1441         mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1442         mleak_lock_attr = lck_attr_alloc_init();
1443         lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1444
1445         mleak_activate();
1446
1447         /* Calculate the number of pages assigned to the cluster pool */
1448         mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1449         MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1450             M_TEMP, M_WAITOK);
1451         VERIFY(mcl_paddr != NULL);
1452
1453         /* Register with the I/O Bus mapper */
1454         mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1455         bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1456
1457         embutl = (union mbigcluster *)
1458             ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1459         VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1460
1461         /* Prime up the freelist */
1462         PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1463         if (initmcl != 0) {
1464                 initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
1465                 if (initmcl > m_maxlimit(MC_BIGCL))
1466                         initmcl = m_maxlimit(MC_BIGCL);
1467         }
1468         if (initmcl < m_minlimit(MC_BIGCL))
1469                 initmcl = m_minlimit(MC_BIGCL);
1470
1471         lck_mtx_lock(mbuf_mlock);
1472
1473         /*
1474          * For classes with non-zero minimum limits, populate their freelists
1475          * so that m_total(class) is at least m_minlimit(class).
1476          */
1477         VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1478         freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1479         VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1480         freelist_init(m_class(MC_CL));
1481
1482         for (m = 0; m < NELEM(mbuf_table); m++) {
1483                 /* Make sure we didn't miss any */
1484                 VERIFY(m_minlimit(m_class(m)) == 0 ||
1485                     m_total(m_class(m)) >= m_minlimit(m_class(m)));
1486         }
1487
1488         lck_mtx_unlock(mbuf_mlock);
1489
1490         (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1491             NULL, &thread);
1492         thread_deallocate(thread);
1493
1494         ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1495             0, 0, MCR_SLEEP);
1496
1497         /* Create the cache for each class */
1498         for (m = 0; m < NELEM(mbuf_table); m++) {
1499                 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1500                 u_int32_t flags;
1501
1502                 flags = mbuf_debug;
1503                 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1504                     m_class(m) == MC_MBUF_16KCL) {
1505                         allocfunc = mbuf_cslab_alloc;
1506                         freefunc = mbuf_cslab_free;
1507                         auditfunc = mbuf_cslab_audit;
1508                         logfunc = mleak_logger;
1509                 } else {
1510                         allocfunc = mbuf_slab_alloc;
1511                         freefunc = mbuf_slab_free;
1512                         auditfunc = mbuf_slab_audit;
1513                         logfunc = mleak_logger;
1514                 }
1515
1516                 /*
1517                  * Disable per-CPU caches for jumbo classes if there
1518                  * is no jumbo cluster pool available in the system.
1519                  * The cache itself is still created (but will never
1520                  * be populated) since it simplifies the code.
1521                  */
1522                 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1523                     njcl == 0)
1524                         flags |= MCF_NOCPUCACHE;
1525
1526                 if (!mclfindleak)
1527                         flags |= MCF_NOLEAKLOG;
1528
1529                 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1530                     allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1531                     (void *)(uintptr_t)m, flags, MCR_SLEEP);
1532         }
1533
1534         /*
1535          * Allocate structure for per-CPU statistics that's aligned
1536          * on the CPU cache boundary; this code assumes that we never
1537          * uninitialize this framework, since the original address
1538          * before alignment is not saved.
1539          */
1540         ncpu = ml_get_max_cpus();
1541         MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1542             M_TEMP, M_WAITOK);
1543         VERIFY(buf != NULL);
1544
1545         mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1546         bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1547
1548         /*
1549          * Set the max limit on sb_max to be 1/16 th of the size of
1550          * memory allocated for mbuf clusters.
1551          */
1552         high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1553         if (high_sb_max < sb_max) {
1554                 /* sb_max is too large for this configuration, scale it down */
1555                 if (high_sb_max > (1 << MBSHIFT)) {
1556                         /* We have atleast 16 M of mbuf pool */
1557                         sb_max = high_sb_max;
1558                 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1559                         /*
1560                          * If we have more than 1M of mbufpool, cap the size of
1561                          * max sock buf at 1M
1562                          */
1563                         sb_max = high_sb_max = (1 << MBSHIFT);
1564                 } else {
1565                         sb_max = high_sb_max;
1566                 }
1567         }
1568
1569         /* allocate space for mbuf_dump_buf */
1570         MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1571         VERIFY(mbuf_dump_buf != NULL);
1572
1573         printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
1574             (nmbclusters << MCLSHIFT) >> MBSHIFT,
1575             (nclusters << MCLSHIFT) >> MBSHIFT,
1576             (njcl << MCLSHIFT) >> MBSHIFT);
1577 }
1578
1579 /*
1580  * Obtain a slab of object(s) from the class's freelist.
1581  */
1582 static mcache_obj_t *
1583 slab_alloc(mbuf_class_t class, int wait)
1584 {
1585         mcl_slab_t *sp;
1586         mcache_obj_t *buf;
1587
1588         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1589
1590         VERIFY(class != MC_16KCL || njcl > 0);
1591
1592         /* This should always be NULL for us */
1593         VERIFY(m_cobjlist(class) == NULL);
1594
1595         /*
1596          * Treat composite objects as having longer lifespan by using
1597          * a slab from the reverse direction, in hoping that this could
1598          * reduce the probability of fragmentation for slabs that hold
1599          * more than one buffer chunks (e.g. mbuf slabs).  For other
1600          * slabs, this probably doesn't make much of a difference.
1601          */
1602         if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1603                 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1604         else
1605                 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1606
1607         if (sp == NULL) {
1608                 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1609                 /* The slab list for this class is empty */
1610                 return (NULL);
1611         }
1612
1613         VERIFY(m_infree(class) > 0);
1614         VERIFY(!slab_is_detached(sp));
1615         VERIFY(sp->sl_class == class &&
1616             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1617         buf = sp->sl_head;
1618         VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1619
1620         if (class == MC_MBUF) {
1621                 sp->sl_head = buf->obj_next;
1622                 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1623         } else if (class == MC_CL) {
1624                 sp->sl_head = buf->obj_next;
1625                 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1626         } else {
1627                 sp->sl_head = NULL;
1628         }
1629         if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1630                 slab_nextptr_panic(sp, sp->sl_head);
1631                 /* In case sl_head is in the map but not in the slab */
1632                 VERIFY(slab_inrange(sp, sp->sl_head));
1633                 /* NOTREACHED */
1634         }
1635
1636         /* Increment slab reference */
1637         sp->sl_refcnt++;
1638
1639         if (mclaudit != NULL) {
1640                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1641                 mca->mca_uflags = 0;
1642                 /* Save contents on mbuf objects only */
1643                 if (class == MC_MBUF)
1644                         mca->mca_uflags |= MB_SCVALID;
1645         }
1646
1647         if (class == MC_CL) {
1648                 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1649                 /*
1650                  * A 2K cluster slab can have at most NCLPBG references.
1651                  */
1652                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1653                     sp->sl_chunks == NCLPBG &&
1654                     sp->sl_len == m_maxsize(MC_BIGCL));
1655                 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1656         } else if (class == MC_BIGCL) {
1657                 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1658                     m_infree(MC_MBUF_BIGCL);
1659                 /*
1660                  * A 4K cluster slab can have at most 1 reference.
1661                  */
1662                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1663                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1664         } else if (class == MC_16KCL) {
1665                 mcl_slab_t *nsp;
1666                 int k;
1667
1668                 --m_infree(MC_16KCL);
1669                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1670                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1671                 /*
1672                  * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1673                  * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1674                  * most 1 reference.
1675                  */
1676                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1677                         nsp = nsp->sl_next;
1678                         /* Next slab must already be present */
1679                         VERIFY(nsp != NULL);
1680                         nsp->sl_refcnt++;
1681                         VERIFY(!slab_is_detached(nsp));
1682                         VERIFY(nsp->sl_class == MC_16KCL &&
1683                             nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1684                             nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1685                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1686                             nsp->sl_head == NULL);
1687                 }
1688         } else {
1689                 VERIFY(class == MC_MBUF);
1690                 --m_infree(MC_MBUF);
1691                 /*
1692                  * If auditing is turned on, this check is
1693                  * deferred until later in mbuf_slab_audit().
1694                  */
1695                 if (mclaudit == NULL)
1696                         _MCHECK((struct mbuf *)buf);
1697                 /*
1698                  * Since we have incremented the reference count above,
1699                  * an mbuf slab (formerly a 4KB cluster slab that was cut
1700                  * up into mbufs) must have a reference count between 1
1701                  * and NMBPBG at this point.
1702                  */
1703                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1704                     sp->sl_chunks == NMBPBG &&
1705                     sp->sl_len == m_maxsize(MC_BIGCL));
1706                 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1707         }
1708
1709         /* If empty, remove this slab from the class's freelist */
1710         if (sp->sl_head == NULL) {
1711                 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1712                 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1713                 slab_remove(sp, class);
1714         }
1715
1716         return (buf);
1717 }
1718
1719 /*
1720  * Place a slab of object(s) back into a class's slab list.
1721  */
1722 static void
1723 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1724 {
1725         mcl_slab_t *sp;
1726
1727         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1728
1729         VERIFY(class != MC_16KCL || njcl > 0);
1730         VERIFY(buf->obj_next == NULL);
1731         sp = slab_get(buf);
1732         VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1733             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1734
1735         /* Decrement slab reference */
1736         sp->sl_refcnt--;
1737
1738         if (class == MC_CL) {
1739                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1740                 /*
1741                  * A slab that has been splitted for 2KB clusters can have
1742                  * at most 1 outstanding reference at this point.
1743                  */
1744                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1745                     sp->sl_chunks == NCLPBG &&
1746                     sp->sl_len == m_maxsize(MC_BIGCL));
1747                 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1748                     (slab_is_detached(sp) && sp->sl_head == NULL));
1749         } else if (class == MC_BIGCL) {
1750                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1751                 /*
1752                  * A 4KB cluster slab can have at most 1 reference
1753                  * which must be 0 at this point.
1754                  */
1755                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1756                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1757                 VERIFY(slab_is_detached(sp));
1758         } else if (class == MC_16KCL) {
1759                 mcl_slab_t *nsp;
1760                 int k;
1761                 /*
1762                  * A 16KB cluster takes NSLABSP16KB slabs, all must
1763                  * now have 0 reference.
1764                  */
1765                 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1766                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1767                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1768                 VERIFY(slab_is_detached(sp));
1769                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1770                         nsp = nsp->sl_next;
1771                         /* Next slab must already be present */
1772                         VERIFY(nsp != NULL);
1773                         nsp->sl_refcnt--;
1774                         VERIFY(slab_is_detached(nsp));
1775                         VERIFY(nsp->sl_class == MC_16KCL &&
1776                             (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1777                             nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1778                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1779                             nsp->sl_head == NULL);
1780                 }
1781         } else {
1782                 /*
1783                  * A slab that has been splitted for mbufs has at most NMBPBG
1784                  * reference counts.  Since we have decremented one reference
1785                  * above, it must now be between 0 and NMBPBG-1.
1786                  */
1787                 VERIFY(class == MC_MBUF);
1788                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1789                     sp->sl_chunks == NMBPBG &&
1790                     sp->sl_len == m_maxsize(MC_BIGCL));
1791                 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1792                     (slab_is_detached(sp) && sp->sl_head == NULL));
1793         }
1794
1795         /*
1796          * When auditing is enabled, ensure that the buffer still
1797          * contains the free pattern.  Otherwise it got corrupted
1798          * while at the CPU cache layer.
1799          */
1800         if (mclaudit != NULL) {
1801                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1802                 if (mclverify) {
1803                         mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1804                 }
1805                 mca->mca_uflags &= ~MB_SCVALID;
1806         }
1807
1808         if (class == MC_CL) {
1809                 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1810                 buf->obj_next = sp->sl_head;
1811         } else if (class == MC_BIGCL) {
1812                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1813                     m_infree(MC_MBUF_BIGCL);
1814         } else if (class == MC_16KCL) {
1815                 ++m_infree(MC_16KCL);
1816         } else {
1817                 ++m_infree(MC_MBUF);
1818                 buf->obj_next = sp->sl_head;
1819         }
1820         sp->sl_head = buf;
1821
1822         /*
1823          * If a slab has been splitted to either one which holds 2KB clusters,
1824          * or one which holds mbufs, turn it back to one which holds a 4KB
1825          * cluster.
1826          */
1827         if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1828             m_total(class) > m_minlimit(class) &&
1829             m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1830                 int i = NMBPBG;
1831
1832                 m_total(MC_BIGCL)++;
1833                 mbstat.m_bigclusters = m_total(MC_BIGCL);
1834                 m_total(MC_MBUF) -= NMBPBG;
1835                 mbstat.m_mbufs = m_total(MC_MBUF);
1836                 m_infree(MC_MBUF) -= NMBPBG;
1837                 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1838
1839                 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1840                 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1841
1842                 while (i--) {
1843                         struct mbuf *m = sp->sl_head;
1844                         VERIFY(m != NULL);
1845                         sp->sl_head = m->m_next;
1846                         m->m_next = NULL;
1847                 }
1848                 VERIFY(sp->sl_head == NULL);
1849
1850                 /* Remove the slab from the mbuf class's slab list */
1851                 slab_remove(sp, class);
1852
1853                 /* Reinitialize it as a 4KB cluster slab */
1854                 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1855                     sp->sl_len, 0, 1);
1856
1857                 if (mclverify) {
1858                         mcache_set_pattern(MCACHE_FREE_PATTERN,
1859                             (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1860                 }
1861                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1862                     m_infree(MC_MBUF_BIGCL);
1863
1864                 VERIFY(slab_is_detached(sp));
1865                 /* And finally switch class */
1866                 class = MC_BIGCL;
1867         } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1868             m_total(class) > m_minlimit(class) &&
1869             m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1870                 int i = NCLPBG;
1871
1872                 m_total(MC_BIGCL)++;
1873                 mbstat.m_bigclusters = m_total(MC_BIGCL);
1874                 m_total(MC_CL) -= NCLPBG;
1875                 mbstat.m_clusters = m_total(MC_CL);
1876                 m_infree(MC_CL) -= NCLPBG;
1877                 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1878                 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1879
1880                 while (i--) {
1881                         union mcluster *c = sp->sl_head;
1882                         VERIFY(c != NULL);
1883                         sp->sl_head = c->mcl_next;
1884                         c->mcl_next = NULL;
1885                 }
1886                 VERIFY(sp->sl_head == NULL);
1887
1888                 /* Remove the slab from the 2KB cluster class's slab list */
1889                 slab_remove(sp, class);
1890
1891                 /* Reinitialize it as a 4KB cluster slab */
1892                 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1893                     sp->sl_len, 0, 1);
1894
1895                 if (mclverify) {
1896                         mcache_set_pattern(MCACHE_FREE_PATTERN,
1897                             (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1898                 }
1899                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1900                     m_infree(MC_MBUF_BIGCL);
1901
1902                 VERIFY(slab_is_detached(sp));
1903                 /* And finally switch class */
1904                 class = MC_BIGCL;
1905         }
1906
1907         /* Reinsert the slab to the class's slab list */
1908         if (slab_is_detached(sp))
1909                 slab_insert(sp, class);
1910 }
1911
1912 /*
1913  * Common allocator for rudimentary objects called by the CPU cache layer
1914  * during an allocation request whenever there is no available element in the
1915  * bucket layer.  It returns one or more elements from the appropriate global
1916  * freelist.  If the freelist is empty, it will attempt to populate it and
1917  * retry the allocation.
1918  */
1919 static unsigned int
1920 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1921 {
1922         mbuf_class_t class = (mbuf_class_t)arg;
1923         unsigned int need = num;
1924         mcache_obj_t **list = *plist;
1925
1926         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1927         ASSERT(need > 0);
1928
1929         lck_mtx_lock(mbuf_mlock);
1930
1931         for (;;) {
1932                 if ((*list = slab_alloc(class, wait)) != NULL) {
1933                         (*list)->obj_next = NULL;
1934                         list = *plist = &(*list)->obj_next;
1935
1936                         if (--need == 0) {
1937                                 /*
1938                                  * If the number of elements in freelist has
1939                                  * dropped below low watermark, asynchronously
1940                                  * populate the freelist now rather than doing
1941                                  * it later when we run out of elements.
1942                                  */
1943                                 if (!mbuf_cached_above(class, wait) &&
1944                                     m_infree(class) < m_total(class) >> 5) {
1945                                         (void) freelist_populate(class, 1,
1946                                             M_DONTWAIT);
1947                                 }
1948                                 break;
1949                         }
1950                 } else {
1951                         VERIFY(m_infree(class) == 0 || class == MC_CL);
1952
1953                         (void) freelist_populate(class, 1,
1954                             (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1955
1956                         if (m_infree(class) > 0)
1957                                 continue;
1958
1959                         /* Check if there's anything at the cache layer */
1960                         if (mbuf_cached_above(class, wait))
1961                                 break;
1962
1963                         /* watchdog checkpoint */
1964                         mbuf_watchdog();
1965
1966                         /* We have nothing and cannot block; give up */
1967                         if (wait & MCR_NOSLEEP) {
1968                                 if (!(wait & MCR_TRYHARD)) {
1969                                         m_fail_cnt(class)++;
1970                                         mbstat.m_drops++;
1971                                         break;
1972                                 }
1973                         }
1974
1975                         /*
1976                          * If the freelist is still empty and the caller is
1977                          * willing to be blocked, sleep on the wait channel
1978                          * until an element is available.  Otherwise, if
1979                          * MCR_TRYHARD is set, do our best to satisfy the
1980                          * request without having to go to sleep.
1981                          */
1982                         if (mbuf_worker_ready &&
1983                             mbuf_sleep(class, need, wait))
1984                                 break;
1985
1986                         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1987                 }
1988         }
1989
1990         m_alloc_cnt(class) += num - need;
1991         lck_mtx_unlock(mbuf_mlock);
1992
1993         return (num - need);
1994 }
1995
1996 /*
1997  * Common de-allocator for rudimentary objects called by the CPU cache
1998  * layer when one or more elements need to be returned to the appropriate
1999  * global freelist.
2000  */
2001 static void
2002 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2003 {
2004         mbuf_class_t class = (mbuf_class_t)arg;
2005         mcache_obj_t *nlist;
2006         unsigned int num = 0;
2007         int w;
2008
2009         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2010
2011         lck_mtx_lock(mbuf_mlock);
2012
2013         for (;;) {
2014                 nlist = list->obj_next;
2015                 list->obj_next = NULL;
2016                 slab_free(class, list);
2017                 ++num;
2018                 if ((list = nlist) == NULL)
2019                         break;
2020         }
2021         m_free_cnt(class) += num;
2022
2023         if ((w = mb_waiters) > 0)
2024                 mb_waiters = 0;
2025
2026         lck_mtx_unlock(mbuf_mlock);
2027
2028         if (w != 0)
2029                 wakeup(mb_waitchan);
2030 }
2031
2032 /*
2033  * Common auditor for rudimentary objects called by the CPU cache layer
2034  * during an allocation or free request.  For the former, this is called
2035  * after the objects are obtained from either the bucket or slab layer
2036  * and before they are returned to the caller.  For the latter, this is
2037  * called immediately during free and before placing the objects into
2038  * the bucket or slab layer.
2039  */
2040 static void
2041 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2042 {
2043         mbuf_class_t class = (mbuf_class_t)arg;
2044         mcache_audit_t *mca;
2045
2046         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2047
2048         while (list != NULL) {
2049                 lck_mtx_lock(mbuf_mlock);
2050                 mca = mcl_audit_buf2mca(class, list);
2051
2052                 /* Do the sanity checks */
2053                 if (class == MC_MBUF) {
2054                         mcl_audit_mbuf(mca, list, FALSE, alloc);
2055                         ASSERT(mca->mca_uflags & MB_SCVALID);
2056                 } else {
2057                         mcl_audit_cluster(mca, list, m_maxsize(class),
2058                             alloc, TRUE);
2059                         ASSERT(!(mca->mca_uflags & MB_SCVALID));
2060                 }
2061                 /* Record this transaction */
2062                 if (mcltrace)
2063                         mcache_buffer_log(mca, list, m_cache(class));
2064
2065                 if (alloc)
2066                         mca->mca_uflags |= MB_INUSE;
2067                 else
2068                         mca->mca_uflags &= ~MB_INUSE;
2069                 /* Unpair the object (unconditionally) */
2070                 mca->mca_uptr = NULL;
2071                 lck_mtx_unlock(mbuf_mlock);
2072
2073                 list = list->obj_next;
2074         }
2075 }
2076
2077 /*
2078  * Common notify routine for all caches.  It is called by mcache when
2079  * one or more objects get freed.  We use this indication to trigger
2080  * the wakeup of any sleeping threads so that they can retry their
2081  * allocation requests.
2082  */
2083 static void
2084 mbuf_slab_notify(void *arg, u_int32_t reason)
2085 {
2086         mbuf_class_t class = (mbuf_class_t)arg;
2087         int w;
2088
2089         ASSERT(MBUF_CLASS_VALID(class));
2090
2091         if (reason != MCN_RETRYALLOC)
2092                 return;
2093
2094         lck_mtx_lock(mbuf_mlock);
2095         if ((w = mb_waiters) > 0) {
2096                 m_notified(class)++;
2097                 mb_waiters = 0;
2098         }
2099         lck_mtx_unlock(mbuf_mlock);
2100
2101         if (w != 0)
2102                 wakeup(mb_waitchan);
2103 }
2104
2105 /*
2106  * Obtain object(s) from the composite class's freelist.
2107  */
2108 static unsigned int
2109 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2110 {
2111         unsigned int need = num;
2112         mcl_slab_t *sp, *clsp, *nsp;
2113         struct mbuf *m;
2114         mcache_obj_t **list = *plist;
2115         void *cl;
2116
2117         VERIFY(need > 0);
2118         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2119         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2120
2121         /* Get what we can from the freelist */
2122         while ((*list = m_cobjlist(class)) != NULL) {
2123                 MRANGE(*list);
2124
2125                 m = (struct mbuf *)*list;
2126                 sp = slab_get(m);
2127                 cl = m->m_ext.ext_buf;
2128                 clsp = slab_get(cl);
2129                 VERIFY(m->m_flags == M_EXT && cl != NULL);
2130                 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2131
2132                 if (class == MC_MBUF_CL) {
2133                         VERIFY(clsp->sl_refcnt >= 1 &&
2134                             clsp->sl_refcnt <= NCLPBG);
2135                 } else {
2136                         VERIFY(clsp->sl_refcnt == 1);
2137                 }
2138
2139                 if (class == MC_MBUF_16KCL) {
2140                         int k;
2141                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2142                                 nsp = nsp->sl_next;
2143                                 /* Next slab must already be present */
2144                                 VERIFY(nsp != NULL);
2145                                 VERIFY(nsp->sl_refcnt == 1);
2146                         }
2147                 }
2148
2149                 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2150                     !MBUF_IN_MAP(m_cobjlist(class))) {
2151                         slab_nextptr_panic(sp, m_cobjlist(class));
2152                         /* NOTREACHED */
2153                 }
2154                 (*list)->obj_next = NULL;
2155                 list = *plist = &(*list)->obj_next;
2156
2157                 if (--need == 0)
2158                         break;
2159         }
2160         m_infree(class) -= (num - need);
2161
2162         return (num - need);
2163 }
2164
2165 /*
2166  * Place object(s) back into a composite class's freelist.
2167  */
2168 static unsigned int
2169 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2170 {
2171         mcache_obj_t *o, *tail;
2172         unsigned int num = 0;
2173         struct mbuf *m, *ms;
2174         mcache_audit_t *mca = NULL;
2175         mcache_obj_t *ref_list = NULL;
2176         mcl_slab_t *clsp, *nsp;
2177         void *cl;
2178         mbuf_class_t cl_class;
2179
2180         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2181         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2182         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2183
2184         if (class == MC_MBUF_CL) {
2185                 cl_class = MC_CL;
2186         } else if (class == MC_MBUF_BIGCL) {
2187                 cl_class = MC_BIGCL;
2188         } else {
2189                 VERIFY(class == MC_MBUF_16KCL);
2190                 cl_class = MC_16KCL;
2191         }
2192
2193         o = tail = list;
2194
2195         while ((m = ms = (struct mbuf *)o) != NULL) {
2196                 mcache_obj_t *rfa, *nexto = o->obj_next;
2197
2198                 /* Do the mbuf sanity checks */
2199                 if (mclaudit != NULL) {
2200                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2201                         if (mclverify) {
2202                                 mcache_audit_free_verify(mca, m, 0,
2203                                     m_maxsize(MC_MBUF));
2204                         }
2205                         ms = (struct mbuf *)mca->mca_contents;
2206                 }
2207
2208                 /* Do the cluster sanity checks */
2209                 cl = ms->m_ext.ext_buf;
2210                 clsp = slab_get(cl);
2211                 if (mclverify) {
2212                         size_t size = m_maxsize(cl_class);
2213                         mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2214                             (mcache_obj_t *)cl), cl, 0, size);
2215                 }
2216                 VERIFY(ms->m_type == MT_FREE);
2217                 VERIFY(ms->m_flags == M_EXT);
2218                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2219                 if (cl_class == MC_CL) {
2220                         VERIFY(clsp->sl_refcnt >= 1 &&
2221                             clsp->sl_refcnt <= NCLPBG);
2222                 } else {
2223                         VERIFY(clsp->sl_refcnt == 1);
2224                 }
2225                 if (cl_class == MC_16KCL) {
2226                         int k;
2227                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2228                                 nsp = nsp->sl_next;
2229                                 /* Next slab must already be present */
2230                                 VERIFY(nsp != NULL);
2231                                 VERIFY(nsp->sl_refcnt == 1);
2232                         }
2233                 }
2234
2235                 /*
2236                  * If we're asked to purge, restore the actual mbuf using
2237                  * contents of the shadow structure (if auditing is enabled)
2238                  * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2239                  * about to free it and the attached cluster into their caches.
2240                  */
2241                 if (purged) {
2242                         /* Restore constructed mbuf fields */
2243                         if (mclaudit != NULL)
2244                                 mcl_audit_restore_mbuf(m, mca, TRUE);
2245
2246                         MEXT_REF(m) = 0;
2247                         MEXT_FLAGS(m) = 0;
2248
2249                         rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2250                         rfa->obj_next = ref_list;
2251                         ref_list = rfa;
2252                         MEXT_RFA(m) = NULL;
2253
2254                         m->m_type = MT_FREE;
2255                         m->m_flags = m->m_len = 0;
2256                         m->m_next = m->m_nextpkt = NULL;
2257
2258                         /* Save mbuf fields and make auditing happy */
2259                         if (mclaudit != NULL)
2260                                 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2261
2262                         VERIFY(m_total(class) > 0);
2263                         m_total(class)--;
2264
2265                         /* Free the mbuf */
2266                         o->obj_next = NULL;
2267                         slab_free(MC_MBUF, o);
2268
2269                         /* And free the cluster */
2270                         ((mcache_obj_t *)cl)->obj_next = NULL;
2271                         if (class == MC_MBUF_CL)
2272                                 slab_free(MC_CL, cl);
2273                         else if (class == MC_MBUF_BIGCL)
2274                                 slab_free(MC_BIGCL, cl);
2275                         else
2276                                 slab_free(MC_16KCL, cl);
2277                 }
2278
2279                 ++num;
2280                 tail = o;
2281                 o = nexto;
2282         }
2283
2284         if (!purged) {
2285                 tail->obj_next = m_cobjlist(class);
2286                 m_cobjlist(class) = list;
2287                 m_infree(class) += num;
2288         } else if (ref_list != NULL) {
2289                 mcache_free_ext(ref_cache, ref_list);
2290         }
2291
2292         return (num);
2293 }
2294
2295 /*
2296  * Common allocator for composite objects called by the CPU cache layer
2297  * during an allocation request whenever there is no available element in
2298  * the bucket layer.  It returns one or more composite elements from the
2299  * appropriate global freelist.  If the freelist is empty, it will attempt
2300  * to obtain the rudimentary objects from their caches and construct them
2301  * into composite mbuf + cluster objects.
2302  */
2303 static unsigned int
2304 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2305     int wait)
2306 {
2307         mbuf_class_t class = (mbuf_class_t)arg;
2308         mbuf_class_t cl_class = 0;
2309         unsigned int num = 0, cnum = 0, want = needed;
2310         mcache_obj_t *ref_list = NULL;
2311         mcache_obj_t *mp_list = NULL;
2312         mcache_obj_t *clp_list = NULL;
2313         mcache_obj_t **list;
2314         struct ext_ref *rfa;
2315         struct mbuf *m;
2316         void *cl;
2317
2318         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2319         ASSERT(needed > 0);
2320
2321         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2322
2323         /* There should not be any slab for this class */
2324         VERIFY(m_slab_cnt(class) == 0 &&
2325             m_slablist(class).tqh_first == NULL &&
2326             m_slablist(class).tqh_last == NULL);
2327
2328         lck_mtx_lock(mbuf_mlock);
2329
2330         /* Try using the freelist first */
2331         num = cslab_alloc(class, plist, needed);
2332         list = *plist;
2333         if (num == needed) {
2334                 m_alloc_cnt(class) += num;
2335                 lck_mtx_unlock(mbuf_mlock);
2336                 return (needed);
2337         }
2338
2339         lck_mtx_unlock(mbuf_mlock);
2340
2341         /*
2342          * We could not satisfy the request using the freelist alone;
2343          * allocate from the appropriate rudimentary caches and use
2344          * whatever we can get to construct the composite objects.
2345          */
2346         needed -= num;
2347
2348         /*
2349          * Mark these allocation requests as coming from a composite cache.
2350          * Also, if the caller is willing to be blocked, mark the request
2351          * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2352          * slab layer waiting for the individual object when one or more
2353          * of the already-constructed composite objects are available.
2354          */
2355         wait |= MCR_COMP;
2356         if (!(wait & MCR_NOSLEEP))
2357                 wait |= MCR_FAILOK;
2358
2359         /* allocate mbufs */
2360         needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2361         if (needed == 0) {
2362                 ASSERT(mp_list == NULL);
2363                 goto fail;
2364         }
2365
2366         /* allocate clusters */
2367         if (class == MC_MBUF_CL) {
2368                 cl_class = MC_CL;
2369         } else if (class == MC_MBUF_BIGCL) {
2370                 cl_class = MC_BIGCL;
2371         } else {
2372                 VERIFY(class == MC_MBUF_16KCL);
2373                 cl_class = MC_16KCL;
2374         }
2375         needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2376         if (needed == 0) {
2377                 ASSERT(clp_list == NULL);
2378                 goto fail;
2379         }
2380
2381         needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2382         if (needed == 0) {
2383                 ASSERT(ref_list == NULL);
2384                 goto fail;
2385         }
2386
2387         /*
2388          * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
2389          * overs will get freed accordingly before we return to caller.
2390          */
2391         for (cnum = 0; cnum < needed; cnum++) {
2392                 struct mbuf *ms;
2393
2394                 m = ms = (struct mbuf *)mp_list;
2395                 mp_list = mp_list->obj_next;
2396
2397                 cl = clp_list;
2398                 clp_list = clp_list->obj_next;
2399                 ((mcache_obj_t *)cl)->obj_next = NULL;
2400
2401                 rfa = (struct ext_ref *)ref_list;
2402                 ref_list = ref_list->obj_next;
2403                 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2404
2405                 /*
2406                  * If auditing is enabled, construct the shadow mbuf
2407                  * in the audit structure instead of in the actual one.
2408                  * mbuf_cslab_audit() will take care of restoring the
2409                  * contents after the integrity check.
2410                  */
2411                 if (mclaudit != NULL) {
2412                         mcache_audit_t *mca, *cl_mca;
2413
2414                         lck_mtx_lock(mbuf_mlock);
2415                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2416                         ms = ((struct mbuf *)mca->mca_contents);
2417                         cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2418
2419                         /*
2420                          * Pair them up.  Note that this is done at the time
2421                          * the mbuf+cluster objects are constructed.  This
2422                          * information should be treated as "best effort"
2423                          * debugging hint since more than one mbufs can refer
2424                          * to a cluster.  In that case, the cluster might not
2425                          * be freed along with the mbuf it was paired with.
2426                          */
2427                         mca->mca_uptr = cl_mca;
2428                         cl_mca->mca_uptr = mca;
2429
2430                         ASSERT(mca->mca_uflags & MB_SCVALID);
2431                         ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2432                         lck_mtx_unlock(mbuf_mlock);
2433
2434                         /* Technically, they are in the freelist */
2435                         if (mclverify) {
2436                                 size_t size;
2437
2438                                 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2439                                     m_maxsize(MC_MBUF));
2440
2441                                 if (class == MC_MBUF_CL)
2442                                         size = m_maxsize(MC_CL);
2443                                 else if (class == MC_MBUF_BIGCL)
2444                                         size = m_maxsize(MC_BIGCL);
2445                                 else
2446                                         size = m_maxsize(MC_16KCL);
2447
2448                                 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2449                                     size);
2450                         }
2451                 }
2452
2453                 MBUF_INIT(ms, 0, MT_FREE);
2454                 if (class == MC_MBUF_16KCL) {
2455                         MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2456                 } else if (class == MC_MBUF_BIGCL) {
2457                         MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2458                 } else {
2459                         MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2460                 }
2461                 VERIFY(ms->m_flags == M_EXT);
2462                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2463
2464                 *list = (mcache_obj_t *)m;
2465                 (*list)->obj_next = NULL;
2466                 list = *plist = &(*list)->obj_next;
2467         }
2468
2469 fail:
2470         /*
2471          * Free up what's left of the above.
2472          */
2473         if (mp_list != NULL)
2474                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2475         if (clp_list != NULL)
2476                 mcache_free_ext(m_cache(cl_class), clp_list);
2477         if (ref_list != NULL)
2478                 mcache_free_ext(ref_cache, ref_list);
2479
2480         lck_mtx_lock(mbuf_mlock);
2481         if (num > 0 || cnum > 0) {
2482                 m_total(class) += cnum;
2483                 VERIFY(m_total(class) <= m_maxlimit(class));
2484                 m_alloc_cnt(class) += num + cnum;
2485         }
2486         if ((num + cnum) < want)
2487                 m_fail_cnt(class) += (want - (num + cnum));
2488         lck_mtx_unlock(mbuf_mlock);
2489
2490         return (num + cnum);
2491 }
2492
2493 /*
2494  * Common de-allocator for composite objects called by the CPU cache
2495  * layer when one or more elements need to be returned to the appropriate
2496  * global freelist.
2497  */
2498 static void
2499 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2500 {
2501         mbuf_class_t class = (mbuf_class_t)arg;
2502         unsigned int num;
2503         int w;
2504
2505         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2506
2507         lck_mtx_lock(mbuf_mlock);
2508
2509         num = cslab_free(class, list, purged);
2510         m_free_cnt(class) += num;
2511
2512         if ((w = mb_waiters) > 0)
2513                 mb_waiters = 0;
2514
2515         lck_mtx_unlock(mbuf_mlock);
2516
2517         if (w != 0)
2518                 wakeup(mb_waitchan);
2519 }
2520
2521 /*
2522  * Common auditor for composite objects called by the CPU cache layer
2523  * during an allocation or free request.  For the former, this is called
2524  * after the objects are obtained from either the bucket or slab layer
2525  * and before they are returned to the caller.  For the latter, this is
2526  * called immediately during free and before placing the objects into
2527  * the bucket or slab layer.
2528  */
2529 static void
2530 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2531 {
2532         mbuf_class_t class = (mbuf_class_t)arg;
2533         mcache_audit_t *mca;
2534         struct mbuf *m, *ms;
2535         mcl_slab_t *clsp, *nsp;
2536         size_t size;
2537         void *cl;
2538
2539         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2540
2541         while ((m = ms = (struct mbuf *)list) != NULL) {
2542                 lck_mtx_lock(mbuf_mlock);
2543                 /* Do the mbuf sanity checks and record its transaction */
2544                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2545                 mcl_audit_mbuf(mca, m, TRUE, alloc);
2546                 if (mcltrace)
2547                         mcache_buffer_log(mca, m, m_cache(class));
2548
2549                 if (alloc)
2550                         mca->mca_uflags |= MB_COMP_INUSE;
2551                 else
2552                         mca->mca_uflags &= ~MB_COMP_INUSE;
2553
2554                 /*
2555                  * Use the shadow mbuf in the audit structure if we are
2556                  * freeing, since the contents of the actual mbuf has been
2557                  * pattern-filled by the above call to mcl_audit_mbuf().
2558                  */
2559                 if (!alloc && mclverify)
2560                         ms = (struct mbuf *)mca->mca_contents;
2561
2562                 /* Do the cluster sanity checks and record its transaction */
2563                 cl = ms->m_ext.ext_buf;
2564                 clsp = slab_get(cl);
2565                 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2566                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2567                 if (class == MC_MBUF_CL)
2568                         VERIFY(clsp->sl_refcnt >= 1 &&
2569                             clsp->sl_refcnt <= NCLPBG);
2570                 else
2571                         VERIFY(clsp->sl_refcnt == 1);
2572
2573                 if (class == MC_MBUF_16KCL) {
2574                         int k;
2575                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2576                                 nsp = nsp->sl_next;
2577                                 /* Next slab must already be present */
2578                                 VERIFY(nsp != NULL);
2579                                 VERIFY(nsp->sl_refcnt == 1);
2580                         }
2581                 }
2582
2583                 mca = mcl_audit_buf2mca(MC_CL, cl);
2584                 if (class == MC_MBUF_CL)
2585                         size = m_maxsize(MC_CL);
2586                 else if (class == MC_MBUF_BIGCL)
2587                         size = m_maxsize(MC_BIGCL);
2588                 else
2589                         size = m_maxsize(MC_16KCL);
2590                 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2591                 if (mcltrace)
2592                         mcache_buffer_log(mca, cl, m_cache(class));
2593
2594                 if (alloc)
2595                         mca->mca_uflags |= MB_COMP_INUSE;
2596                 else
2597                         mca->mca_uflags &= ~MB_COMP_INUSE;
2598                 lck_mtx_unlock(mbuf_mlock);
2599
2600                 list = list->obj_next;
2601         }
2602 }
2603
2604 /*
2605  * Allocate some number of mbuf clusters and place on cluster freelist.
2606  */
2607 static int
2608 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2609 {
2610         int i;
2611         vm_size_t size = 0;
2612         int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2613         vm_offset_t page = 0;
2614         mcache_audit_t *mca_list = NULL;
2615         mcache_obj_t *con_list = NULL;
2616         mcl_slab_t *sp;
2617
2618         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2619             bufsize == m_maxsize(MC_16KCL));
2620
2621         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2622
2623         /*
2624          * Multiple threads may attempt to populate the cluster map one
2625          * after another.  Since we drop the lock below prior to acquiring
2626          * the physical page(s), our view of the cluster map may no longer
2627          * be accurate, and we could end up over-committing the pages beyond
2628          * the maximum allowed for each class.  To prevent it, this entire
2629          * operation (including the page mapping) is serialized.
2630          */
2631         while (mb_clalloc_busy) {
2632                 mb_clalloc_waiters++;
2633                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2634                     (PZERO-1), "m_clalloc", NULL);
2635                 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2636         }
2637
2638         /* We are busy now; tell everyone else to go away */
2639         mb_clalloc_busy = TRUE;
2640
2641         /*
2642          * Honor the caller's wish to block or not block.  We have a way
2643          * to grow the pool asynchronously using the mbuf worker thread.
2644          */
2645         i = m_howmany(num, bufsize);
2646         if (i == 0 || (wait & M_DONTWAIT))
2647                 goto out;
2648
2649         lck_mtx_unlock(mbuf_mlock);
2650
2651         size = round_page(i * bufsize);
2652         page = kmem_mb_alloc(mb_map, size, large_buffer);
2653
2654         /*
2655          * If we did ask for "n" 16KB physically contiguous chunks
2656          * and didn't get them, then please try again without this
2657          * restriction.
2658          */
2659         if (large_buffer && page == 0)
2660                 page = kmem_mb_alloc(mb_map, size, 0);
2661
2662         if (page == 0) {
2663                 if (bufsize == m_maxsize(MC_BIGCL)) {
2664                         /* Try for 1 page if failed, only 4KB request */
2665                         size = NBPG;
2666                         page = kmem_mb_alloc(mb_map, size, 0);
2667                 }
2668
2669                 if (page == 0) {
2670                         lck_mtx_lock(mbuf_mlock);
2671                         goto out;
2672                 }
2673         }
2674
2675         VERIFY(IS_P2ALIGNED(page, NBPG));
2676         numpages = size / NBPG;
2677
2678         /* If auditing is enabled, allocate the audit structures now */
2679         if (mclaudit != NULL) {
2680                 int needed;
2681
2682                 /*
2683                  * Yes, I realize this is a waste of memory for clusters
2684                  * that never get transformed into mbufs, as we may end
2685                  * up with NMBPBG-1 unused audit structures per cluster.
2686                  * But doing so tremendously simplifies the allocation
2687                  * strategy, since at this point we are not holding the
2688                  * mbuf lock and the caller is okay to be blocked.
2689                  */
2690                 if (bufsize == m_maxsize(MC_BIGCL)) {
2691                         needed = numpages * NMBPBG;
2692
2693                         i = mcache_alloc_ext(mcl_audit_con_cache,
2694                             &con_list, needed, MCR_SLEEP);
2695
2696                         VERIFY(con_list != NULL && i == needed);
2697                 } else {
2698                         needed = numpages / NSLABSP16KB;
2699                 }
2700
2701                 i = mcache_alloc_ext(mcache_audit_cache,
2702                     (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2703
2704                 VERIFY(mca_list != NULL && i == needed);
2705         }
2706
2707         lck_mtx_lock(mbuf_mlock);
2708
2709         for (i = 0; i < numpages; i++, page += NBPG) {
2710                 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2711                 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2712
2713                 /*
2714                  * In the case of no mapper being available the following
2715                  * code noops and returns the input page; if there is a
2716                  * mapper the appropriate I/O page is returned.
2717                  */
2718                 VERIFY(offset < mcl_pages);
2719                 if (mcl_paddr_base) {
2720                     bzero((void *)(uintptr_t) page, page_size);
2721                     new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2722                 }
2723                 mcl_paddr[offset] = new_page << PGSHIFT;
2724
2725                 /* Pattern-fill this fresh page */
2726                 if (mclverify) {
2727                         mcache_set_pattern(MCACHE_FREE_PATTERN,
2728                             (caddr_t)page, NBPG);
2729                 }
2730                 if (bufsize == m_maxsize(MC_BIGCL)) {
2731                         union mbigcluster *mbc = (union mbigcluster *)page;
2732
2733                         /* One for the entire page */
2734                         sp = slab_get(mbc);
2735                         if (mclaudit != NULL) {
2736                                 mcl_audit_init(mbc, &mca_list, &con_list,
2737                                     AUDIT_CONTENTS_SIZE, NMBPBG);
2738                         }
2739                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2740                         slab_init(sp, MC_BIGCL, SLF_MAPPED,
2741                             mbc, mbc, bufsize, 0, 1);
2742
2743                         /* Insert this slab */
2744                         slab_insert(sp, MC_BIGCL);
2745
2746                         /* Update stats now since slab_get() drops the lock */
2747                         mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2748                             m_infree(MC_MBUF_BIGCL);
2749                         mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2750                         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2751                 } else if ((i % NSLABSP16KB) == 0) {
2752                         union m16kcluster *m16kcl = (union m16kcluster *)page;
2753                         mcl_slab_t *nsp;
2754                         int k;
2755
2756                         VERIFY(njcl > 0);
2757                         /* One for the entire 16KB */
2758                         sp = slab_get(m16kcl);
2759                         if (mclaudit != NULL)
2760                                 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2761
2762                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2763                         slab_init(sp, MC_16KCL, SLF_MAPPED,
2764                             m16kcl, m16kcl, bufsize, 0, 1);
2765
2766                         /*
2767                          * 2nd-Nth page's slab is part of the first one,
2768                          * where N is NSLABSP16KB.
2769                          */
2770                         for (k = 1; k < NSLABSP16KB; k++) {
2771                                 nsp = slab_get(((union mbigcluster *)page) + k);
2772                                 VERIFY(nsp->sl_refcnt == 0 &&
2773                                     nsp->sl_flags == 0);
2774                                 slab_init(nsp, MC_16KCL,
2775                                     SLF_MAPPED | SLF_PARTIAL,
2776                                     m16kcl, NULL, 0, 0, 0);
2777                         }
2778
2779                         /* Insert this slab */
2780                         slab_insert(sp, MC_16KCL);
2781
2782                         /* Update stats now since slab_get() drops the lock */
2783                         m_infree(MC_16KCL)++;
2784                         m_total(MC_16KCL)++;
2785                         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2786                 }
2787         }
2788         VERIFY(mca_list == NULL && con_list == NULL);
2789
2790         /* We're done; let others enter */
2791         mb_clalloc_busy = FALSE;
2792         if (mb_clalloc_waiters > 0) {
2793                 mb_clalloc_waiters = 0;
2794                 wakeup(mb_clalloc_waitchan);
2795         }
2796
2797         if (bufsize == m_maxsize(MC_BIGCL))
2798                 return (numpages);
2799
2800         VERIFY(bufsize == m_maxsize(MC_16KCL));
2801         return (numpages / NSLABSP16KB);
2802
2803 out:
2804         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2805
2806         /* We're done; let others enter */
2807         mb_clalloc_busy = FALSE;
2808         if (mb_clalloc_waiters > 0) {
2809                 mb_clalloc_waiters = 0;
2810                 wakeup(mb_clalloc_waitchan);
2811         }
2812
2813         /*
2814          * When non-blocking we kick a thread if we have to grow the
2815          * pool or if the number of free clusters is less than requested.
2816          */
2817         if (bufsize == m_maxsize(MC_BIGCL)) {
2818                 if (i > 0) {
2819                         /*
2820                          * Remember total number of 4KB clusters needed
2821                          * at this time.
2822                          */
2823                         i += m_total(MC_BIGCL);
2824                         if (i > mbuf_expand_big) {
2825                                 mbuf_expand_big = i;
2826                                 if (mbuf_worker_ready)
2827                                         wakeup((caddr_t)&mbuf_worker_run);
2828                         }
2829                 }
2830
2831                 if (m_infree(MC_BIGCL) >= num)
2832                         return (1);
2833         } else {
2834                 if (i > 0) {
2835                         /*
2836                          * Remember total number of 16KB clusters needed
2837                          * at this time.
2838                          */
2839                         i += m_total(MC_16KCL);
2840                         if (i > mbuf_expand_16k) {
2841                                 mbuf_expand_16k = i;
2842                                 if (mbuf_worker_ready)
2843                                         wakeup((caddr_t)&mbuf_worker_run);
2844                         }
2845                 }
2846
2847                 if (m_infree(MC_16KCL) >= num)
2848                         return (1);
2849         }
2850         return (0);
2851 }
2852
2853 /*
2854  * Populate the global freelist of the corresponding buffer class.
2855  */
2856 static int
2857 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2858 {
2859         mcache_obj_t *o = NULL;
2860         int i, numpages = 0, count;
2861
2862         VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2863             class == MC_16KCL);
2864
2865         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2866
2867         switch (class) {
2868         case MC_MBUF:
2869         case MC_CL:
2870         case MC_BIGCL:
2871                 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2872                 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2873
2874                 /* Respect the 4KB clusters minimum limit */
2875                 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2876                     m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2877                         if (class != MC_BIGCL || (wait & MCR_COMP))
2878                                 return (0);
2879                 }
2880                 if (class == MC_BIGCL)
2881                         return (i != 0);
2882                 break;
2883
2884         case MC_16KCL:
2885                 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2886                 /* NOTREACHED */
2887
2888         default:
2889                 VERIFY(0);
2890                 /* NOTREACHED */
2891         }
2892
2893         VERIFY(class == MC_MBUF || class == MC_CL);
2894
2895         /* how many objects will we cut the page into? */
2896         int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2897
2898         for (count = 0; count < numpages; count++) {
2899
2900                 /* respect totals, minlimit, maxlimit */
2901                 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2902                     m_total(class) >= m_maxlimit(class))
2903                         break;
2904
2905                 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2906                         break;
2907
2908                 struct mbuf *m = (struct mbuf *)o;
2909                 union mcluster *c = (union mcluster *)o;
2910                 mcl_slab_t *sp = slab_get(o);
2911                 mcache_audit_t *mca = NULL;
2912
2913                 VERIFY(slab_is_detached(sp) &&
2914                     (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2915
2916                 /*
2917                  * Make sure that the cluster is unmolested
2918                  * while in freelist
2919                  */
2920                 if (mclverify) {
2921                         mca = mcl_audit_buf2mca(MC_BIGCL, o);
2922                         mcache_audit_free_verify(mca, o, 0,
2923                             m_maxsize(MC_BIGCL));
2924                 }
2925
2926                 /* Reinitialize it as an mbuf or 2K slab */
2927                 slab_init(sp, class, sp->sl_flags,
2928                     sp->sl_base, NULL, sp->sl_len, 0, numobj);
2929
2930                 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2931                 VERIFY(sp->sl_head == NULL);
2932
2933                 VERIFY(m_total(MC_BIGCL) > 0);
2934                 m_total(MC_BIGCL)--;
2935                 mbstat.m_bigclusters = m_total(MC_BIGCL);
2936
2937                 m_total(class) += numobj;
2938                 m_infree(class) += numobj;
2939
2940                 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2941                 VERIFY(m_total(class) <= m_maxlimit(class));
2942
2943                 i = numobj;
2944                 if (class == MC_MBUF) {
2945                         mbstat.m_mbufs = m_total(MC_MBUF);
2946                         mtype_stat_add(MT_FREE, NMBPBG);
2947                         while (i--) {
2948                                 /*
2949                                  * If auditing is enabled, construct the
2950                                  * shadow mbuf in the audit structure
2951                                  * instead of the actual one.
2952                                  * mbuf_slab_audit() will take care of
2953                                  * restoring the contents after the
2954                                  * integrity check.
2955                                  */
2956                                 if (mclaudit != NULL) {
2957                                         struct mbuf *ms;
2958                                         mca = mcl_audit_buf2mca(MC_MBUF,
2959                                             (mcache_obj_t *)m);
2960                                         ms = ((struct mbuf *)
2961                                             mca->mca_contents);
2962                                         ms->m_type = MT_FREE;
2963                                 } else {
2964                                         m->m_type = MT_FREE;
2965                                 }
2966                                 m->m_next = sp->sl_head;
2967                                 sp->sl_head = (void *)m++;
2968                         }
2969                 } else { /* MC_CL */
2970                         mbstat.m_clfree =
2971                             m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2972                         mbstat.m_clusters = m_total(MC_CL);
2973                         while (i--) {
2974                                 c->mcl_next = sp->sl_head;
2975                                 sp->sl_head = (void *)c++;
2976                         }
2977                 }
2978
2979                 /* Insert into the mbuf or 2k slab list */
2980                 slab_insert(sp, class);
2981
2982                 if ((i = mb_waiters) > 0)
2983                         mb_waiters = 0;
2984                 if (i != 0)
2985                         wakeup(mb_waitchan);
2986         }
2987         return (count != 0);
2988 }
2989
2990 /*
2991  * For each class, initialize the freelist to hold m_minlimit() objects.
2992  */
2993 static void
2994 freelist_init(mbuf_class_t class)
2995 {
2996         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2997
2998         VERIFY(class == MC_CL || class == MC_BIGCL);
2999         VERIFY(m_total(class) == 0);
3000         VERIFY(m_minlimit(class) > 0);
3001
3002         while (m_total(class) < m_minlimit(class))
3003                 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3004
3005         VERIFY(m_total(class) >= m_minlimit(class));
3006 }
3007
3008 /*
3009  * (Inaccurately) check if it might be worth a trip back to the
3010  * mcache layer due the availability of objects there.  We'll
3011  * end up back here if there's nothing up there.
3012  */
3013 static boolean_t
3014 mbuf_cached_above(mbuf_class_t class, int wait)
3015 {
3016         switch (class) {
3017         case MC_MBUF:
3018                 if (wait & MCR_COMP)
3019                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3020                             !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3021                 break;
3022
3023         case MC_CL:
3024                 if (wait & MCR_COMP)
3025                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3026                 break;
3027
3028         case MC_BIGCL:
3029                 if (wait & MCR_COMP)
3030                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3031                 break;
3032
3033         case MC_16KCL:
3034                 if (wait & MCR_COMP)
3035                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3036                 break;
3037
3038         case MC_MBUF_CL:
3039         case MC_MBUF_BIGCL:
3040         case MC_MBUF_16KCL:
3041                 break;
3042
3043         default:
3044                 VERIFY(0);
3045                 /* NOTREACHED */
3046         }
3047
3048         return (!mcache_bkt_isempty(m_cache(class)));
3049 }
3050
3051 /*
3052  * If possible, convert constructed objects to raw ones.
3053  */
3054 static boolean_t
3055 mbuf_steal(mbuf_class_t class, unsigned int num)
3056 {
3057         mcache_obj_t *top = NULL;
3058         mcache_obj_t **list = &top;
3059         unsigned int tot = 0;
3060
3061         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3062
3063         switch (class) {
3064         case MC_MBUF:
3065         case MC_CL:
3066         case MC_BIGCL:
3067         case MC_16KCL:
3068                 return (FALSE);
3069
3070         case MC_MBUF_CL:
3071         case MC_MBUF_BIGCL:
3072         case MC_MBUF_16KCL:
3073                 /* Get the required number of constructed objects if possible */
3074                 if (m_infree(class) > m_minlimit(class)) {
3075                         tot = cslab_alloc(class, &list,
3076                             MIN(num, m_infree(class)));
3077                 }
3078
3079                 /* And destroy them to get back the raw objects */
3080                 if (top != NULL)
3081                         (void) cslab_free(class, top, 1);
3082                 break;
3083
3084         default:
3085                 VERIFY(0);
3086                 /* NOTREACHED */
3087         }
3088
3089         return (tot == num);
3090 }
3091
3092 static void
3093 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3094 {
3095         int m, bmap = 0;
3096
3097         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3098
3099         VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3100         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3101         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3102
3103         /*
3104          * This logic can be made smarter; for now, simply mark
3105          * all other related classes as potential victims.
3106          */
3107         switch (class) {
3108         case MC_MBUF:
3109                 m_wantpurge(MC_CL)++;
3110                 m_wantpurge(MC_BIGCL)++;
3111                 m_wantpurge(MC_MBUF_CL)++;
3112                 m_wantpurge(MC_MBUF_BIGCL)++;
3113                 break;
3114
3115         case MC_CL:
3116                 m_wantpurge(MC_MBUF)++;
3117                 m_wantpurge(MC_BIGCL)++;
3118                 m_wantpurge(MC_MBUF_BIGCL)++;
3119                 if (!comp)
3120                         m_wantpurge(MC_MBUF_CL)++;
3121                 break;
3122
3123         case MC_BIGCL:
3124                 m_wantpurge(MC_MBUF)++;
3125                 m_wantpurge(MC_CL)++;
3126                 m_wantpurge(MC_MBUF_CL)++;
3127                 if (!comp)
3128                         m_wantpurge(MC_MBUF_BIGCL)++;
3129                 break;
3130
3131         case MC_16KCL:
3132                 if (!comp)
3133                         m_wantpurge(MC_MBUF_16KCL)++;
3134                 break;
3135
3136         default:
3137                 VERIFY(0);
3138                 /* NOTREACHED */
3139         }
3140
3141         /*
3142          * Run through each marked class and check if we really need to
3143          * purge (and therefore temporarily disable) the per-CPU caches
3144          * layer used by the class.  If so, remember the classes since
3145          * we are going to drop the lock below prior to purging.
3146          */
3147         for (m = 0; m < NELEM(mbuf_table); m++) {
3148                 if (m_wantpurge(m) > 0) {
3149                         m_wantpurge(m) = 0;
3150                         /*
3151                          * Try hard to steal the required number of objects
3152                          * from the freelist of other mbuf classes.  Only
3153                          * purge and disable the per-CPU caches layer when
3154                          * we don't have enough; it's the last resort.
3155                          */
3156                         if (!mbuf_steal(m, num))
3157                                 bmap |= (1 << m);
3158                 }
3159         }
3160
3161         lck_mtx_unlock(mbuf_mlock);
3162
3163         if (bmap != 0) {
3164                 /* drain is performed in pfslowtimo(), to avoid deadlocks */
3165                 do_reclaim = 1;
3166
3167                 /* Sigh; we have no other choices but to ask mcache to purge */
3168                 for (m = 0; m < NELEM(mbuf_table); m++) {
3169                         if ((bmap & (1 << m)) &&
3170                             mcache_purge_cache(m_cache(m))) {
3171                                 lck_mtx_lock(mbuf_mlock);
3172                                 m_purge_cnt(m)++;
3173                                 mbstat.m_drain++;
3174                                 lck_mtx_unlock(mbuf_mlock);
3175                         }
3176                 }
3177         } else {
3178                 /*
3179                  * Request mcache to reap extra elements from all of its caches;
3180                  * note that all reaps are serialized and happen only at a fixed
3181                  * interval.
3182                  */
3183                 mcache_reap();
3184         }
3185         lck_mtx_lock(mbuf_mlock);
3186 }
3187
3188 static inline struct mbuf *
3189 m_get_common(int wait, short type, int hdr)
3190 {
3191         struct mbuf *m;
3192         int mcflags = MSLEEPF(wait);
3193
3194         /* Is this due to a non-blocking retry?  If so, then try harder */
3195         if (mcflags & MCR_NOSLEEP)
3196                 mcflags |= MCR_TRYHARD;
3197
3198         m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3199         if (m != NULL) {
3200                 MBUF_INIT(m, hdr, type);
3201                 mtype_stat_inc(type);
3202                 mtype_stat_dec(MT_FREE);
3203 #if CONFIG_MACF_NET
3204                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3205                         m_free(m);
3206                         return (NULL);
3207                 }
3208 #endif /* MAC_NET */
3209         }
3210         return (m);
3211 }
3212
3213 /*
3214  * Space allocation routines; these are also available as macros
3215  * for critical paths.
3216  */
3217 #define _M_GET(wait, type)      m_get_common(wait, type, 0)
3218 #define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
3219 #define _M_RETRY(wait, type)    _M_GET(wait, type)
3220 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3221 #define _MGET(m, how, type)     ((m) = _M_GET(how, type))
3222 #define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))
3223
3224 struct mbuf *
3225 m_get(int wait, int type)
3226 {
3227         return (_M_GET(wait, type));
3228 }
3229
3230 struct mbuf *
3231 m_gethdr(int wait, int type)
3232 {
3233         return (_M_GETHDR(wait, type));
3234 }
3235
3236 struct mbuf *
3237 m_retry(int wait, int type)
3238 {
3239         return (_M_RETRY(wait, type));
3240 }
3241
3242 struct mbuf *
3243 m_retryhdr(int wait, int type)
3244 {
3245         return (_M_RETRYHDR(wait, type));
3246 }
3247
3248 struct mbuf *
3249 m_getclr(int wait, int type)
3250 {
3251         struct mbuf *m;
3252
3253         _MGET(m, wait, type);
3254         if (m != NULL)
3255                 bzero(MTOD(m, caddr_t), MLEN);
3256         return (m);
3257 }
3258
3259 struct mbuf *
3260 m_free(struct mbuf *m)
3261 {
3262         struct mbuf *n = m->m_next;
3263
3264         if (m->m_type == MT_FREE)
3265                 panic("m_free: freeing an already freed mbuf");
3266
3267         /* Free the aux data and tags if there is any */
3268         if (m->m_flags & M_PKTHDR) {
3269                 m_tag_delete_chain(m, NULL);
3270         }
3271
3272         if (m->m_flags & M_EXT) {
3273                 u_int32_t refcnt;
3274                 u_int32_t composite;
3275
3276                 refcnt = m_decref(m);
3277                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3278                 if (refcnt == 0 && !composite) {
3279                         if (m->m_ext.ext_free == NULL) {
3280                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3281                         } else if (m->m_ext.ext_free == m_bigfree) {
3282                                 mcache_free(m_cache(MC_BIGCL),
3283                                     m->m_ext.ext_buf);
3284                         } else if (m->m_ext.ext_free == m_16kfree) {
3285                                 mcache_free(m_cache(MC_16KCL),
3286                                     m->m_ext.ext_buf);
3287                         } else {
3288                                 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3289                                     m->m_ext.ext_size, m->m_ext.ext_arg);
3290                         }
3291                         mcache_free(ref_cache, MEXT_RFA(m));
3292                         MEXT_RFA(m) = NULL;
3293                 } else if (refcnt == 0 && composite) {
3294                         VERIFY(m->m_type != MT_FREE);
3295
3296                         mtype_stat_dec(m->m_type);
3297                         mtype_stat_inc(MT_FREE);
3298
3299                         m->m_type = MT_FREE;
3300                         m->m_flags = M_EXT;
3301                         m->m_len = 0;
3302                         m->m_next = m->m_nextpkt = NULL;
3303
3304                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3305
3306                         /* "Free" into the intermediate cache */
3307                         if (m->m_ext.ext_free == NULL) {
3308                                 mcache_free(m_cache(MC_MBUF_CL), m);
3309                         } else if (m->m_ext.ext_free == m_bigfree) {
3310                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3311                         } else {
3312                                 VERIFY(m->m_ext.ext_free == m_16kfree);
3313                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3314                         }
3315                         return (n);
3316                 }
3317         }
3318
3319         if (m->m_type != MT_FREE) {
3320                 mtype_stat_dec(m->m_type);
3321                 mtype_stat_inc(MT_FREE);
3322         }
3323
3324         m->m_type = MT_FREE;
3325         m->m_flags = m->m_len = 0;
3326         m->m_next = m->m_nextpkt = NULL;
3327
3328         mcache_free(m_cache(MC_MBUF), m);
3329
3330         return (n);
3331 }
3332
3333 __private_extern__ struct mbuf *
3334 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3335     void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3336     int wait)
3337 {
3338         struct ext_ref *rfa = NULL;
3339
3340         if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3341                 return (NULL);
3342
3343         if (m->m_flags & M_EXT) {
3344                 u_int32_t refcnt;
3345                 u_int32_t composite;
3346
3347                 refcnt = m_decref(m);
3348                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3349                 if (refcnt == 0 && !composite) {
3350                         if (m->m_ext.ext_free == NULL) {
3351                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3352                         } else if (m->m_ext.ext_free == m_bigfree) {
3353                                 mcache_free(m_cache(MC_BIGCL),
3354                                     m->m_ext.ext_buf);
3355                         } else if (m->m_ext.ext_free == m_16kfree) {
3356                                 mcache_free(m_cache(MC_16KCL),
3357                                     m->m_ext.ext_buf);
3358                         } else {
3359                                 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3360                                     m->m_ext.ext_size, m->m_ext.ext_arg);
3361                         }
3362                         /* Re-use the reference structure */
3363                         rfa = MEXT_RFA(m);
3364                 } else if (refcnt == 0 && composite) {
3365                         VERIFY(m->m_type != MT_FREE);
3366
3367                         mtype_stat_dec(m->m_type);
3368                         mtype_stat_inc(MT_FREE);
3369
3370                         m->m_type = MT_FREE;
3371                         m->m_flags = M_EXT;
3372                         m->m_len = 0;
3373                         m->m_next = m->m_nextpkt = NULL;
3374
3375                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3376
3377                         /* "Free" into the intermediate cache */
3378                         if (m->m_ext.ext_free == NULL) {
3379                                 mcache_free(m_cache(MC_MBUF_CL), m);
3380                         } else if (m->m_ext.ext_free == m_bigfree) {
3381                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3382                         } else {
3383                                 VERIFY(m->m_ext.ext_free == m_16kfree);
3384                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3385                         }
3386                         /*
3387                          * Allocate a new mbuf, since we didn't divorce
3388                          * the composite mbuf + cluster pair above.
3389                          */
3390                         if ((m = _M_GETHDR(wait, type)) == NULL)
3391                                 return (NULL);
3392                 }
3393         }
3394
3395         if (rfa == NULL &&
3396             (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3397                 m_free(m);
3398                 return (NULL);
3399         }
3400
3401         MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3402
3403         return (m);
3404 }
3405
3406 /*
3407  * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3408  * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3409  */
3410 struct mbuf *
3411 m_getcl(int wait, int type, int flags)
3412 {
3413         struct mbuf *m;
3414         int mcflags = MSLEEPF(wait);
3415         int hdr = (flags & M_PKTHDR);
3416
3417         /* Is this due to a non-blocking retry?  If so, then try harder */
3418         if (mcflags & MCR_NOSLEEP)
3419                 mcflags |= MCR_TRYHARD;
3420
3421         m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3422         if (m != NULL) {
3423                 u_int32_t flag;
3424                 struct ext_ref *rfa;
3425                 void *cl;
3426
3427                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3428                 cl = m->m_ext.ext_buf;
3429                 rfa = MEXT_RFA(m);
3430
3431                 ASSERT(cl != NULL && rfa != NULL);
3432                 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3433
3434                 flag = MEXT_FLAGS(m);
3435
3436                 MBUF_INIT(m, hdr, type);
3437                 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3438
3439                 mtype_stat_inc(type);
3440                 mtype_stat_dec(MT_FREE);
3441 #if CONFIG_MACF_NET
3442                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3443                         m_freem(m);
3444                         return (NULL);
3445                 }
3446 #endif /* MAC_NET */
3447         }
3448         return (m);
3449 }
3450
3451 /* m_mclget() add an mbuf cluster to a normal mbuf */
3452 struct mbuf *
3453 m_mclget(struct mbuf *m, int wait)
3454 {
3455         struct ext_ref *rfa;
3456
3457         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3458                 return (m);
3459
3460         m->m_ext.ext_buf = m_mclalloc(wait);
3461         if (m->m_ext.ext_buf != NULL) {
3462                 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3463         } else {
3464                 mcache_free(ref_cache, rfa);
3465         }
3466         return (m);
3467 }
3468
3469 /* Allocate an mbuf cluster */
3470 caddr_t
3471 m_mclalloc(int wait)
3472 {
3473         int mcflags = MSLEEPF(wait);
3474
3475         /* Is this due to a non-blocking retry?  If so, then try harder */
3476         if (mcflags & MCR_NOSLEEP)
3477                 mcflags |= MCR_TRYHARD;
3478
3479         return (mcache_alloc(m_cache(MC_CL), mcflags));
3480 }
3481
3482 /* Free an mbuf cluster */
3483 void
3484 m_mclfree(caddr_t p)
3485 {
3486         mcache_free(m_cache(MC_CL), p);
3487 }
3488
3489 /*
3490  * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3491  * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3492  */
3493 int
3494 m_mclhasreference(struct mbuf *m)
3495 {
3496         if (!(m->m_flags & M_EXT))
3497                 return (0);
3498
3499         ASSERT(MEXT_RFA(m) != NULL);
3500
3501         return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3502 }
3503
3504 __private_extern__ caddr_t
3505 m_bigalloc(int wait)
3506 {
3507         int mcflags = MSLEEPF(wait);
3508
3509         /* Is this due to a non-blocking retry?  If so, then try harder */
3510         if (mcflags & MCR_NOSLEEP)
3511                 mcflags |= MCR_TRYHARD;
3512
3513         return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3514 }
3515
3516 __private_extern__ void
3517 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3518 {
3519         mcache_free(m_cache(MC_BIGCL), p);
3520 }
3521
3522 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3523 __private_extern__ struct mbuf *
3524 m_mbigget(struct mbuf *m, int wait)
3525 {
3526         struct ext_ref *rfa;
3527
3528         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3529                 return (m);
3530
3531         m->m_ext.ext_buf =  m_bigalloc(wait);
3532         if (m->m_ext.ext_buf != NULL) {
3533                 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3534         } else {
3535                 mcache_free(ref_cache, rfa);
3536         }
3537         return (m);
3538 }
3539
3540 __private_extern__ caddr_t
3541 m_16kalloc(int wait)
3542 {
3543         int mcflags = MSLEEPF(wait);
3544
3545         /* Is this due to a non-blocking retry?  If so, then try harder */
3546         if (mcflags & MCR_NOSLEEP)
3547                 mcflags |= MCR_TRYHARD;
3548
3549         return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3550 }
3551
3552 __private_extern__ void
3553 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3554 {
3555         mcache_free(m_cache(MC_16KCL), p);
3556 }
3557
3558 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3559 __private_extern__ struct mbuf *
3560 m_m16kget(struct mbuf *m, int wait)
3561 {
3562         struct ext_ref *rfa;
3563
3564         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3565                 return (m);
3566
3567         m->m_ext.ext_buf =  m_16kalloc(wait);
3568         if (m->m_ext.ext_buf != NULL) {
3569                 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3570         } else {
3571                 mcache_free(ref_cache, rfa);
3572         }
3573         return (m);
3574 }
3575
3576 /*
3577  * "Move" mbuf pkthdr from "from" to "to".
3578  * "from" must have M_PKTHDR set, and "to" must be empty.
3579  */
3580 void
3581 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3582 {
3583         /* We will be taking over the tags of 'to' */
3584         if (to->m_flags & M_PKTHDR)
3585                 m_tag_delete_chain(to, NULL);
3586         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
3587         m_tag_init(from);                       /* purge tags from src */
3588         m_service_class_init(from);             /* reset svc class from src */
3589         from->m_pkthdr.aux_flags = 0;           /* clear aux flags from src */
3590         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3591         if ((to->m_flags & M_EXT) == 0)
3592                 to->m_data = to->m_pktdat;
3593 }
3594
3595 /*
3596  * Duplicate "from"'s mbuf pkthdr in "to".
3597  * "from" must have M_PKTHDR set, and "to" must be empty.
3598  * In particular, this does a deep copy of the packet tags.
3599  */
3600 static int
3601 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3602 {
3603         if (to->m_flags & M_PKTHDR)
3604                 m_tag_delete_chain(to, NULL);
3605         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3606         if ((to->m_flags & M_EXT) == 0)
3607                 to->m_data = to->m_pktdat;
3608         to->m_pkthdr = from->m_pkthdr;
3609         m_tag_init(to);
3610         return (m_tag_copy_chain(to, from, how));
3611 }
3612
3613 void
3614 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3615 {
3616         to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3617         to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3618         to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3619 }
3620
3621 /*
3622  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
3623  * if wantall is not set, return whatever number were available.  Set up the
3624  * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3625  * are chained on the m_nextpkt field.  Any packets requested beyond this
3626  * are chained onto the last packet header's m_next field.  The size of
3627  * the cluster is controlled by the parameter bufsize.
3628  */
3629 __private_extern__ struct mbuf *
3630 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3631     int wait, int wantall, size_t bufsize)
3632 {
3633         struct mbuf *m;
3634         struct mbuf **np, *top;
3635         unsigned int pnum, needed = *num_needed;
3636         mcache_obj_t *mp_list = NULL;
3637         int mcflags = MSLEEPF(wait);
3638         u_int32_t flag;
3639         struct ext_ref *rfa;
3640         mcache_t *cp;
3641         void *cl;
3642
3643         ASSERT(bufsize == m_maxsize(MC_CL) ||
3644             bufsize == m_maxsize(MC_BIGCL) ||
3645             bufsize == m_maxsize(MC_16KCL));
3646
3647         /*
3648          * Caller must first check for njcl because this
3649          * routine is internal and not exposed/used via KPI.
3650          */
3651         VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3652
3653         top = NULL;
3654         np = &top;
3655         pnum = 0;
3656
3657         /*
3658          * The caller doesn't want all the requested buffers; only some.
3659          * Try hard to get what we can, but don't block.  This effectively
3660          * overrides MCR_SLEEP, since this thread will not go to sleep
3661          * if we can't get all the buffers.
3662          */
3663         if (!wantall || (mcflags & MCR_NOSLEEP))
3664                 mcflags |= MCR_TRYHARD;
3665
3666         /* Allocate the composite mbuf + cluster elements from the cache */
3667         if (bufsize == m_maxsize(MC_CL))
3668                 cp = m_cache(MC_MBUF_CL);
3669         else if (bufsize == m_maxsize(MC_BIGCL))
3670                 cp = m_cache(MC_MBUF_BIGCL);
3671         else
3672                 cp = m_cache(MC_MBUF_16KCL);
3673         needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3674
3675         for (pnum = 0; pnum < needed; pnum++) {
3676                 m = (struct mbuf *)mp_list;
3677                 mp_list = mp_list->obj_next;
3678
3679                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3680                 cl = m->m_ext.ext_buf;
3681                 rfa = MEXT_RFA(m);
3682
3683                 ASSERT(cl != NULL && rfa != NULL);
3684                 VERIFY(MBUF_IS_COMPOSITE(m));
3685
3686                 flag = MEXT_FLAGS(m);
3687
3688                 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3689                 if (bufsize == m_maxsize(MC_16KCL)) {
3690                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3691                 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3692                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3693                 } else {
3694                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
3695                 }
3696
3697                 if (num_with_pkthdrs > 0) {
3698                         --num_with_pkthdrs;
3699 #if CONFIG_MACF_NET
3700                         if (mac_mbuf_label_init(m, wait) != 0) {
3701                                 m_freem(m);
3702                                 break;
3703                         }
3704 #endif /* MAC_NET */
3705                 }
3706
3707                 *np = m;
3708                 if (num_with_pkthdrs > 0)
3709                         np = &m->m_nextpkt;
3710                 else
3711                         np = &m->m_next;
3712         }
3713         ASSERT(pnum != *num_needed || mp_list == NULL);
3714         if (mp_list != NULL)
3715                 mcache_free_ext(cp, mp_list);
3716
3717         if (pnum > 0) {
3718                 mtype_stat_add(MT_DATA, pnum);
3719                 mtype_stat_sub(MT_FREE, pnum);
3720         }
3721
3722         if (wantall && (pnum != *num_needed)) {
3723                 if (top != NULL)
3724                         m_freem_list(top);
3725                 return (NULL);
3726         }
3727
3728         if (pnum > *num_needed) {
3729                 printf("%s: File a radar related to <rdar://10146739>. \
3730                         needed = %u, pnum = %u, num_needed = %u \n",
3731                         __func__, needed, pnum, *num_needed);
3732         }
3733
3734         *num_needed = pnum;
3735         return (top);
3736 }
3737
3738 /*
3739  * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
3740  * wantall is not set, return whatever number were available.  The size of
3741  * each mbuf in the list is controlled by the parameter packetlen.  Each
3742  * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
3743  * in the chain is called a segment.  If maxsegments is not null and the
3744  * value pointed to is not null, this specify the maximum number of segments
3745  * for a chain of mbufs.  If maxsegments is zero or the value pointed to
3746  * is zero the caller does not have any restriction on the number of segments.
3747  * The actual  number of segments of a mbuf chain is return in the value
3748  * pointed to by maxsegments.
3749  */
3750 __private_extern__ struct mbuf *
3751 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3752     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3753 {
3754         struct mbuf **np, *top, *first = NULL;
3755         size_t bufsize, r_bufsize;
3756         unsigned int num = 0;
3757         unsigned int nsegs = 0;
3758         unsigned int needed, resid;
3759         int mcflags = MSLEEPF(wait);
3760         mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3761         mcache_t *cp = NULL, *rcp = NULL;
3762
3763         if (*numlist == 0)
3764                 return (NULL);
3765
3766         top = NULL;
3767         np = &top;
3768
3769         if (wantsize == 0) {
3770                 if (packetlen <= MINCLSIZE) {
3771                         bufsize = packetlen;
3772                 } else if (packetlen > m_maxsize(MC_CL)) {
3773                         /* Use 4KB if jumbo cluster pool isn't available */
3774                         if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3775                                 bufsize = m_maxsize(MC_BIGCL);
3776                         else
3777                                 bufsize = m_maxsize(MC_16KCL);
3778                 } else {
3779                         bufsize = m_maxsize(MC_CL);
3780                 }
3781         } else if (wantsize == m_maxsize(MC_CL) ||
3782             wantsize == m_maxsize(MC_BIGCL) ||
3783             (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3784                 bufsize = wantsize;
3785         } else {
3786                 return (NULL);
3787         }
3788
3789         if (bufsize <= MHLEN) {
3790                 nsegs = 1;
3791         } else if (bufsize <= MINCLSIZE) {
3792                 if (maxsegments != NULL && *maxsegments == 1) {
3793                         bufsize = m_maxsize(MC_CL);
3794                         nsegs = 1;
3795                 } else {
3796                         nsegs = 2;
3797                 }
3798         } else if (bufsize == m_maxsize(MC_16KCL)) {
3799                 VERIFY(njcl > 0);
3800                 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3801         } else if (bufsize == m_maxsize(MC_BIGCL)) {
3802                 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3803         } else {
3804                 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3805         }
3806         if (maxsegments != NULL) {
3807                 if (*maxsegments && nsegs > *maxsegments) {
3808                         *maxsegments = nsegs;
3809                         return (NULL);
3810                 }
3811                 *maxsegments = nsegs;
3812         }
3813
3814         /*
3815          * The caller doesn't want all the requested buffers; only some.
3816          * Try hard to get what we can, but don't block.  This effectively
3817          * overrides MCR_SLEEP, since this thread will not go to sleep
3818          * if we can't get all the buffers.
3819          */
3820         if (!wantall || (mcflags & MCR_NOSLEEP))
3821                 mcflags |= MCR_TRYHARD;
3822
3823         /*
3824          * Simple case where all elements in the lists/chains are mbufs.
3825          * Unless bufsize is greater than MHLEN, each segment chain is made
3826          * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
3827          * of 2 mbufs; the second one is used for the residual data, i.e.
3828          * the remaining data that cannot fit into the first mbuf.
3829          */
3830         if (bufsize <= MINCLSIZE) {
3831                 /* Allocate the elements in one shot from the mbuf cache */
3832                 ASSERT(bufsize <= MHLEN || nsegs == 2);
3833                 cp = m_cache(MC_MBUF);
3834                 needed = mcache_alloc_ext(cp, &mp_list,
3835                     (*numlist) * nsegs, mcflags);
3836
3837                 /*
3838                  * The number of elements must be even if we are to use an
3839                  * mbuf (instead of a cluster) to store the residual data.
3840                  * If we couldn't allocate the requested number of mbufs,
3841                  * trim the number down (if it's odd) in order to avoid
3842                  * creating a partial segment chain.
3843                  */
3844                 if (bufsize > MHLEN && (needed & 0x1))
3845                         needed--;
3846
3847                 while (num < needed) {
3848                         struct mbuf *m;
3849
3850                         m = (struct mbuf *)mp_list;
3851                         mp_list = mp_list->obj_next;
3852                         ASSERT(m != NULL);
3853
3854                         MBUF_INIT(m, 1, MT_DATA);
3855 #if CONFIG_MACF_NET
3856                         if (mac_init_mbuf(m, wait) != 0) {
3857                                 m_free(m);
3858                                 break;
3859                         }
3860 #endif /* MAC_NET */
3861                         num++;
3862                         if (bufsize > MHLEN) {
3863                                 /* A second mbuf for this segment chain */
3864                                 m->m_next = (struct mbuf *)mp_list;
3865                                 mp_list = mp_list->obj_next;
3866                                 ASSERT(m->m_next != NULL);
3867
3868                                 MBUF_INIT(m->m_next, 0, MT_DATA);
3869                                 num++;
3870                         }
3871                         *np = m;
3872                         np = &m->m_nextpkt;
3873                 }
3874                 ASSERT(num != *numlist || mp_list == NULL);
3875
3876                 if (num > 0) {
3877                         mtype_stat_add(MT_DATA, num);
3878                         mtype_stat_sub(MT_FREE, num);
3879                 }
3880                 num /= nsegs;
3881
3882                 /* We've got them all; return to caller */
3883                 if (num == *numlist)
3884                         return (top);
3885
3886                 goto fail;
3887         }
3888
3889         /*
3890          * Complex cases where elements are made up of one or more composite
3891          * mbufs + cluster, depending on packetlen.  Each N-segment chain can
3892          * be illustrated as follows:
3893          *
3894          * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3895          *
3896          * Every composite mbuf + cluster element comes from the intermediate
3897          * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
3898          * the last composite element will come from the MC_MBUF_CL cache,
3899          * unless the residual data is larger than 2KB where we use the
3900          * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
3901          * data is defined as extra data beyond the first element that cannot
3902          * fit into the previous element, i.e. there is no residual data if
3903          * the chain only has 1 segment.
3904          */
3905         r_bufsize = bufsize;
3906         resid = packetlen > bufsize ? packetlen % bufsize : 0;
3907         if (resid > 0) {
3908                 /* There is residual data; figure out the cluster size */
3909                 if (wantsize == 0 && packetlen > MINCLSIZE) {
3910                         /*
3911                          * Caller didn't request that all of the segments
3912                          * in the chain use the same cluster size; use the
3913                          * smaller of the cluster sizes.
3914                          */
3915                         if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3916                                 r_bufsize = m_maxsize(MC_16KCL);
3917                         else if (resid > m_maxsize(MC_CL))
3918                                 r_bufsize = m_maxsize(MC_BIGCL);
3919                         else
3920                                 r_bufsize = m_maxsize(MC_CL);
3921                 } else {
3922                         /* Use the same cluster size as the other segments */
3923                         resid = 0;
3924                 }
3925         }
3926
3927         needed = *numlist;
3928         if (resid > 0) {
3929                 /*
3930                  * Attempt to allocate composite mbuf + cluster elements for
3931                  * the residual data in each chain; record the number of such
3932                  * elements that can be allocated so that we know how many
3933                  * segment chains we can afford to create.
3934                  */
3935                 if (r_bufsize <= m_maxsize(MC_CL))
3936                         rcp = m_cache(MC_MBUF_CL);
3937                 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3938                         rcp = m_cache(MC_MBUF_BIGCL);
3939                 else
3940                         rcp = m_cache(MC_MBUF_16KCL);
3941                 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3942
3943                 if (needed == 0)
3944                         goto fail;
3945
3946                 /* This is temporarily reduced for calculation */
3947                 ASSERT(nsegs > 1);
3948                 nsegs--;
3949         }
3950
3951         /*
3952          * Attempt to allocate the rest of the composite mbuf + cluster
3953          * elements for the number of segment chains that we need.
3954          */
3955         if (bufsize <= m_maxsize(MC_CL))
3956                 cp = m_cache(MC_MBUF_CL);
3957         else if (bufsize <= m_maxsize(MC_BIGCL))
3958                 cp = m_cache(MC_MBUF_BIGCL);
3959         else
3960                 cp = m_cache(MC_MBUF_16KCL);
3961         needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3962
3963         /* Round it down to avoid creating a partial segment chain */
3964         needed = (needed / nsegs) * nsegs;
3965         if (needed == 0)
3966                 goto fail;
3967
3968         if (resid > 0) {
3969                 /*
3970                  * We're about to construct the chain(s); take into account
3971                  * the number of segments we have created above to hold the
3972                  * residual data for each chain, as well as restore the
3973                  * original count of segments per chain.
3974                  */
3975                 ASSERT(nsegs > 0);
3976                 needed += needed / nsegs;
3977                 nsegs++;
3978         }
3979
3980         for (;;) {
3981                 struct mbuf *m;
3982                 u_int32_t flag;
3983                 struct ext_ref *rfa;
3984                 void *cl;
3985                 int pkthdr;
3986
3987                 ++num;
3988                 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3989                         m = (struct mbuf *)mp_list;
3990                         mp_list = mp_list->obj_next;
3991                 } else {
3992                         m = (struct mbuf *)rmp_list;
3993                         rmp_list = rmp_list->obj_next;
3994                 }
3995                 ASSERT(m != NULL);
3996                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3997                 VERIFY(m->m_ext.ext_free == NULL ||
3998                     m->m_ext.ext_free == m_bigfree ||
3999                     m->m_ext.ext_free == m_16kfree);
4000
4001                 cl = m->m_ext.ext_buf;
4002                 rfa = MEXT_RFA(m);
4003
4004                 ASSERT(cl != NULL && rfa != NULL);
4005                 VERIFY(MBUF_IS_COMPOSITE(m));
4006
4007                 flag = MEXT_FLAGS(m);
4008
4009                 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4010                 if (pkthdr)
4011                         first = m;
4012                 MBUF_INIT(m, pkthdr, MT_DATA);
4013                 if (m->m_ext.ext_free == m_16kfree) {
4014                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4015                 } else if (m->m_ext.ext_free == m_bigfree) {
4016                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4017                 } else {
4018                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4019                 }
4020 #if CONFIG_MACF_NET
4021                 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4022                         --num;
4023                         m_freem(m);
4024                         break;
4025                 }
4026 #endif /* MAC_NET */
4027
4028                 *np = m;
4029                 if ((num % nsegs) == 0)
4030                         np = &first->m_nextpkt;
4031                 else
4032                         np = &m->m_next;
4033
4034                 if (num == needed)
4035                         break;
4036         }
4037
4038         if (num > 0) {
4039                 mtype_stat_add(MT_DATA, num);
4040                 mtype_stat_sub(MT_FREE, num);
4041         }
4042
4043         num /= nsegs;
4044
4045         /* We've got them all; return to caller */
4046         if (num == *numlist) {
4047                 ASSERT(mp_list == NULL && rmp_list == NULL);
4048                 return (top);
4049         }
4050
4051 fail:
4052         /* Free up what's left of the above */
4053         if (mp_list != NULL)
4054                 mcache_free_ext(cp, mp_list);
4055         if (rmp_list != NULL)
4056                 mcache_free_ext(rcp, rmp_list);
4057         if (wantall && top != NULL) {
4058                 m_freem(top);
4059                 return (NULL);
4060         }
4061         *numlist = num;
4062         return (top);
4063 }
4064
4065 /*
4066  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4067  * packets on receive ring.
4068  */
4069 __private_extern__ struct mbuf *
4070 m_getpacket_how(int wait)
4071 {
4072         unsigned int num_needed = 1;
4073
4074         return (m_getpackets_internal(&num_needed, 1, wait, 1,
4075             m_maxsize(MC_CL)));
4076 }
4077
4078 /*
4079  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4080  * packets on receive ring.
4081  */
4082 struct mbuf *
4083 m_getpacket(void)
4084 {
4085         unsigned int num_needed = 1;
4086
4087         return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4088             m_maxsize(MC_CL)));
4089 }
4090
4091 /*
4092  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4093  * if this can't be met, return whatever number were available.  Set up the
4094  * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4095  * are chained on the m_nextpkt field.  Any packets requested beyond this are
4096  * chained onto the last packet header's m_next field.
4097  */
4098 struct mbuf *
4099 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4100 {
4101         unsigned int n = num_needed;
4102
4103         return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4104             m_maxsize(MC_CL)));
4105 }
4106
4107 /*
4108  * Return a list of mbuf hdrs set up as packet hdrs chained together
4109  * on the m_nextpkt field
4110  */
4111 struct mbuf *
4112 m_getpackethdrs(int num_needed, int how)
4113 {
4114         struct mbuf *m;
4115         struct mbuf **np, *top;
4116
4117         top = NULL;
4118         np = &top;
4119
4120         while (num_needed--) {
4121                 m = _M_RETRYHDR(how, MT_DATA);
4122                 if (m == NULL)
4123                         break;
4124
4125                 *np = m;
4126                 np = &m->m_nextpkt;
4127         }
4128
4129         return (top);
4130 }
4131
4132 /*
4133  * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
4134  * for mbufs packets freed.  Used by the drivers.
4135  */
4136 int
4137 m_freem_list(struct mbuf *m)
4138 {
4139         struct mbuf *nextpkt;
4140         mcache_obj_t *mp_list = NULL;
4141         mcache_obj_t *mcl_list = NULL;
4142         mcache_obj_t *mbc_list = NULL;
4143         mcache_obj_t *m16k_list = NULL;
4144         mcache_obj_t *m_mcl_list = NULL;
4145         mcache_obj_t *m_mbc_list = NULL;
4146         mcache_obj_t *m_m16k_list = NULL;
4147         mcache_obj_t *ref_list = NULL;
4148         int pktcount = 0;
4149         int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4150
4151         while (m != NULL) {
4152                 pktcount++;
4153
4154                 nextpkt = m->m_nextpkt;
4155                 m->m_nextpkt = NULL;
4156
4157                 while (m != NULL) {
4158                         struct mbuf *next = m->m_next;
4159                         mcache_obj_t *o, *rfa;
4160                         u_int32_t refcnt, composite;
4161
4162                         if (m->m_type == MT_FREE)
4163                                 panic("m_free: freeing an already freed mbuf");
4164
4165                         if (m->m_type != MT_FREE)
4166                                 mt_free++;
4167
4168                         if (m->m_flags & M_PKTHDR) {
4169                                 m_tag_delete_chain(m, NULL);
4170                         }
4171
4172                         if (!(m->m_flags & M_EXT))
4173                                 goto simple_free;
4174
4175                         o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4176                         refcnt = m_decref(m);
4177                         composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4178                         if (refcnt == 0 && !composite) {
4179                                 if (m->m_ext.ext_free == NULL) {
4180                                         o->obj_next = mcl_list;
4181                                         mcl_list = o;
4182                                 } else if (m->m_ext.ext_free == m_bigfree) {
4183                                         o->obj_next = mbc_list;
4184                                         mbc_list = o;
4185                                 } else if (m->m_ext.ext_free == m_16kfree) {
4186                                         o->obj_next = m16k_list;
4187                                         m16k_list = o;
4188                                 } else {
4189                                         (*(m->m_ext.ext_free))((caddr_t)o,
4190                                             m->m_ext.ext_size,
4191                                             m->m_ext.ext_arg);
4192                                 }
4193                                 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4194                                 rfa->obj_next = ref_list;
4195                                 ref_list = rfa;
4196                                 MEXT_RFA(m) = NULL;
4197                         } else if (refcnt == 0 && composite) {
4198                                 VERIFY(m->m_type != MT_FREE);
4199                                 /*
4200                                  * Amortize the costs of atomic operations
4201                                  * by doing them at the end, if possible.
4202                                  */
4203                                 if (m->m_type == MT_DATA)
4204                                         mt_data++;
4205                                 else if (m->m_type == MT_HEADER)
4206                                         mt_header++;
4207                                 else if (m->m_type == MT_SONAME)
4208                                         mt_soname++;
4209                                 else if (m->m_type == MT_TAG)
4210                                         mt_tag++;
4211                                 else
4212                                         mtype_stat_dec(m->m_type);
4213
4214                                 m->m_type = MT_FREE;
4215                                 m->m_flags = M_EXT;
4216                                 m->m_len = 0;
4217                                 m->m_next = m->m_nextpkt = NULL;
4218
4219                                 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4220
4221                                 /* "Free" into the intermediate cache */
4222                                 o = (mcache_obj_t *)m;
4223                                 if (m->m_ext.ext_free == NULL) {
4224                                         o->obj_next = m_mcl_list;
4225                                         m_mcl_list = o;
4226                                 } else if (m->m_ext.ext_free == m_bigfree) {
4227                                         o->obj_next = m_mbc_list;
4228                                         m_mbc_list = o;
4229                                 } else {
4230                                         VERIFY(m->m_ext.ext_free == m_16kfree);
4231                                         o->obj_next = m_m16k_list;
4232                                         m_m16k_list = o;
4233                                 }
4234                                 m = next;
4235                                 continue;
4236                         }
4237 simple_free:
4238                         /*
4239                          * Amortize the costs of atomic operations
4240                          * by doing them at the end, if possible.
4241                          */
4242                         if (m->m_type == MT_DATA)
4243                                 mt_data++;
4244                         else if (m->m_type == MT_HEADER)
4245                                 mt_header++;
4246                         else if (m->m_type == MT_SONAME)
4247                                 mt_soname++;
4248                         else if (m->m_type == MT_TAG)
4249                                 mt_tag++;
4250                         else if (m->m_type != MT_FREE)
4251                                 mtype_stat_dec(m->m_type);
4252
4253                         m->m_type = MT_FREE;
4254                         m->m_flags = m->m_len = 0;
4255                         m->m_next = m->m_nextpkt = NULL;
4256
4257                         ((mcache_obj_t *)m)->obj_next = mp_list;
4258                         mp_list = (mcache_obj_t *)m;
4259
4260                         m = next;
4261                 }
4262
4263                 m = nextpkt;
4264         }
4265
4266         if (mt_free > 0)
4267                 mtype_stat_add(MT_FREE, mt_free);
4268         if (mt_data > 0)
4269                 mtype_stat_sub(MT_DATA, mt_data);
4270         if (mt_header > 0)
4271                 mtype_stat_sub(MT_HEADER, mt_header);
4272         if (mt_soname > 0)
4273                 mtype_stat_sub(MT_SONAME, mt_soname);
4274         if (mt_tag > 0)
4275                 mtype_stat_sub(MT_TAG, mt_tag);
4276
4277         if (mp_list != NULL)
4278                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4279         if (mcl_list != NULL)
4280                 mcache_free_ext(m_cache(MC_CL), mcl_list);
4281         if (mbc_list != NULL)
4282                 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4283         if (m16k_list != NULL)
4284                 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4285         if (m_mcl_list != NULL)
4286                 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4287         if (m_mbc_list != NULL)
4288                 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4289         if (m_m16k_list != NULL)
4290                 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4291         if (ref_list != NULL)
4292                 mcache_free_ext(ref_cache, ref_list);
4293
4294         return (pktcount);
4295 }
4296
4297 void
4298 m_freem(struct mbuf *m)
4299 {
4300         while (m != NULL)
4301                 m = m_free(m);
4302 }
4303
4304 /*
4305  * Mbuffer utility routines.
4306  */
4307
4308 /*
4309  * Compute the amount of space available before the current start
4310  * of data in an mbuf.
4311  */
4312 int
4313 m_leadingspace(struct mbuf *m)
4314 {
4315         if (m->m_flags & M_EXT) {
4316                 if (MCLHASREFERENCE(m))
4317                         return (0);
4318                 return (m->m_data - m->m_ext.ext_buf);
4319         }
4320         if (m->m_flags & M_PKTHDR)
4321                 return (m->m_data - m->m_pktdat);
4322         return (m->m_data - m->m_dat);
4323 }
4324
4325 /*
4326  * Compute the amount of space available after the end of data in an mbuf.
4327  */
4328 int
4329 m_trailingspace(struct mbuf *m)
4330 {
4331         if (m->m_flags & M_EXT) {
4332                 if (MCLHASREFERENCE(m))
4333                         return (0);
4334                 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4335                     (m->m_data + m->m_len));
4336         }
4337         return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4338 }
4339
4340 /*
4341  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4342  * copy junk along.  Does not adjust packet header length.
4343  */
4344 struct mbuf *
4345 m_prepend(struct mbuf *m, int len, int how)
4346 {
4347         struct mbuf *mn;
4348
4349         _MGET(mn, how, m->m_type);
4350         if (mn == NULL) {
4351                 m_freem(m);
4352                 return (NULL);
4353         }
4354         if (m->m_flags & M_PKTHDR) {
4355                 M_COPY_PKTHDR(mn, m);
4356                 m->m_flags &= ~M_PKTHDR;
4357         }
4358         mn->m_next = m;
4359         m = mn;
4360         if (len < MHLEN)
4361                 MH_ALIGN(m, len);
4362         m->m_len = len;
4363         return (m);
4364 }
4365
4366 /*
4367  * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4368  * chain, copy junk along, and adjust length.
4369  */
4370 struct mbuf *
4371 m_prepend_2(struct mbuf *m, int len, int how)
4372 {
4373         if (M_LEADINGSPACE(m) >= len) {
4374                 m->m_data -= len;
4375                 m->m_len += len;
4376         } else {
4377                 m = m_prepend(m, len, how);
4378         }
4379         if ((m) && (m->m_flags & M_PKTHDR))
4380                 m->m_pkthdr.len += len;
4381         return (m);
4382 }
4383
4384 /*
4385  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4386  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
4387  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4388  */
4389 int MCFail;
4390
4391 struct mbuf *
4392 m_copym(struct mbuf *m, int off0, int len, int wait)
4393 {
4394         struct mbuf *n, *mhdr = NULL, **np;
4395         int off = off0;
4396         struct mbuf *top;
4397         int copyhdr = 0;
4398
4399         if (off < 0 || len < 0)
4400                 panic("m_copym: invalid offset %d or len %d", off, len);
4401
4402         if (off == 0 && (m->m_flags & M_PKTHDR)) {
4403                 mhdr = m;
4404                 copyhdr = 1;
4405         }
4406
4407         while (off >= m->m_len) {
4408                 if (m->m_next == NULL)
4409                         panic("m_copym: invalid mbuf chain");
4410                 off -= m->m_len;
4411                 m = m->m_next;
4412         }
4413         np = &top;
4414         top = NULL;
4415
4416         while (len > 0) {
4417                 if (m == NULL) {
4418                         if (len != M_COPYALL)
4419                                 panic("m_copym: len != M_COPYALL");
4420                         break;
4421                 }
4422
4423                 n = _M_RETRY(wait, m->m_type);
4424                 *np = n;
4425
4426                 if (n == NULL)
4427                         goto nospace;
4428
4429                 if (copyhdr != 0) {
4430                         M_COPY_PKTHDR(n, mhdr);
4431                         if (len == M_COPYALL)
4432                                 n->m_pkthdr.len -= off0;
4433                         else
4434                                 n->m_pkthdr.len = len;
4435                         copyhdr = 0;
4436                 }
4437                 if (len == M_COPYALL) {
4438                         if (MIN(len, (m->m_len - off)) == len) {
4439                                 printf("m->m_len %d - off %d = %d, %d\n",
4440                                     m->m_len, off, m->m_len - off,
4441                                     MIN(len, (m->m_len - off)));
4442                         }
4443                 }
4444                 n->m_len = MIN(len, (m->m_len - off));
4445                 if (n->m_len == M_COPYALL) {
4446                         printf("n->m_len == M_COPYALL, fixing\n");
4447                         n->m_len = MHLEN;
4448                 }
4449                 if (m->m_flags & M_EXT) {
4450                         n->m_ext = m->m_ext;
4451                         m_incref(m);
4452                         n->m_data = m->m_data + off;
4453                         n->m_flags |= M_EXT;
4454                 } else {
4455                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4456                             (unsigned)n->m_len);
4457                 }
4458                 if (len != M_COPYALL)
4459                         len -= n->m_len;
4460                 off = 0;
4461                 m = m->m_next;
4462                 np = &n->m_next;
4463         }
4464
4465         if (top == NULL)
4466                 MCFail++;
4467
4468         return (top);
4469 nospace:
4470
4471         m_freem(top);
4472         MCFail++;
4473         return (NULL);
4474 }
4475
4476 /*
4477  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4478  * within this routine also, the last mbuf and offset accessed are passed
4479  * out and can be passed back in to avoid having to rescan the entire mbuf
4480  * list (normally hung off of the socket)
4481  */
4482 struct mbuf *
4483 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4484     struct mbuf **m_lastm, int *m_off)
4485 {
4486         struct mbuf *n, **np = NULL;
4487         int off = off0, len = len0;
4488         struct mbuf *top = NULL;
4489         int mcflags = MSLEEPF(wait);
4490         int copyhdr = 0;
4491         int type = 0;
4492         mcache_obj_t *list = NULL;
4493         int needed = 0;
4494
4495         if (off == 0 && (m->m_flags & M_PKTHDR))
4496                 copyhdr = 1;
4497
4498         if (*m_lastm != NULL) {
4499                 m = *m_lastm;
4500                 off = *m_off;
4501         } else {
4502                 while (off >= m->m_len) {
4503                         off -= m->m_len;
4504                         m = m->m_next;
4505                 }
4506         }
4507
4508         n = m;
4509         while (len > 0) {
4510                 needed++;
4511                 ASSERT(n != NULL);
4512                 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4513                 n = n->m_next;
4514         }
4515         needed++;
4516         len = len0;
4517
4518         /*
4519          * If the caller doesn't want to be put to sleep, mark it with
4520          * MCR_TRYHARD so that we may reclaim buffers from other places
4521          * before giving up.
4522          */
4523         if (mcflags & MCR_NOSLEEP)
4524                 mcflags |= MCR_TRYHARD;
4525
4526         if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4527             mcflags) != needed)
4528                 goto nospace;
4529
4530         needed = 0;
4531         while (len > 0) {
4532                 n = (struct mbuf *)list;
4533                 list = list->obj_next;
4534                 ASSERT(n != NULL && m != NULL);
4535
4536                 type = (top == NULL) ? MT_HEADER : m->m_type;
4537                 MBUF_INIT(n, (top == NULL), type);
4538 #if CONFIG_MACF_NET
4539                 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4540                         mtype_stat_inc(MT_HEADER);
4541                         mtype_stat_dec(MT_FREE);
4542                         m_free(n);
4543                         goto nospace;
4544                 }
4545 #endif /* MAC_NET */
4546
4547                 if (top == NULL) {
4548                         top = n;
4549                         np = &top->m_next;
4550                         continue;
4551                 } else {
4552                         needed++;
4553                         *np = n;
4554                 }
4555
4556                 if (copyhdr) {
4557                         M_COPY_PKTHDR(n, m);
4558                         n->m_pkthdr.len = len;
4559                         copyhdr = 0;
4560                 }
4561                 n->m_len = MIN(len, (m->m_len - off));
4562
4563                 if (m->m_flags & M_EXT) {
4564                         n->m_ext = m->m_ext;
4565                         m_incref(m);
4566                         n->m_data = m->m_data + off;
4567                         n->m_flags |= M_EXT;
4568                 } else {
4569                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4570                             (unsigned)n->m_len);
4571                 }
4572                 len -= n->m_len;
4573
4574                 if (len == 0) {
4575                         if ((off + n->m_len) == m->m_len) {
4576                                 *m_lastm = m->m_next;
4577                                 *m_off  = 0;
4578                         } else {
4579                                 *m_lastm = m;
4580                                 *m_off  = off + n->m_len;
4581                         }
4582                         break;
4583                 }
4584                 off = 0;
4585                 m = m->m_next;
4586                 np = &n->m_next;
4587         }
4588
4589         mtype_stat_inc(MT_HEADER);
4590         mtype_stat_add(type, needed);
4591         mtype_stat_sub(MT_FREE, needed + 1);
4592
4593         ASSERT(list == NULL);
4594         return (top);
4595
4596 nospace:
4597         if (list != NULL)
4598                 mcache_free_ext(m_cache(MC_MBUF), list);
4599         if (top != NULL)
4600                 m_freem(top);
4601         MCFail++;
4602         return (NULL);
4603 }
4604
4605 /*
4606  * Copy data from an mbuf chain starting "off" bytes from the beginning,
4607  * continuing for "len" bytes, into the indicated buffer.
4608  */
4609 void
4610 m_copydata(struct mbuf *m, int off, int len, void *vp)
4611 {
4612         unsigned count;
4613         char *cp = vp;
4614
4615         if (off < 0 || len < 0)
4616                 panic("m_copydata: invalid offset %d or len %d", off, len);
4617
4618         while (off > 0) {
4619                 if (m == NULL)
4620                         panic("m_copydata: invalid mbuf chain");
4621                 if (off < m->m_len)
4622                         break;
4623                 off -= m->m_len;
4624                 m = m->m_next;
4625         }
4626         while (len > 0) {
4627                 if (m == NULL)
4628                         panic("m_copydata: invalid mbuf chain");
4629                 count = MIN(m->m_len - off, len);
4630                 bcopy(MTOD(m, caddr_t) + off, cp, count);
4631                 len -= count;
4632                 cp += count;
4633                 off = 0;
4634                 m = m->m_next;
4635         }
4636 }
4637
4638 /*
4639  * Concatenate mbuf chain n to m.  Both chains must be of the same type
4640  * (e.g. MT_DATA).  Any m_pkthdr is not updated.
4641  */
4642 void
4643 m_cat(struct mbuf *m, struct mbuf *n)
4644 {
4645         while (m->m_next)
4646                 m = m->m_next;
4647         while (n) {
4648                 if ((m->m_flags & M_EXT) ||
4649                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4650                         /* just join the two chains */
4651                         m->m_next = n;
4652                         return;
4653                 }
4654                 /* splat the data from one into the other */
4655                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4656                     (u_int)n->m_len);
4657                 m->m_len += n->m_len;
4658                 n = m_free(n);
4659         }
4660 }
4661
4662 void
4663 m_adj(struct mbuf *mp, int req_len)
4664 {
4665         int len = req_len;
4666         struct mbuf *m;
4667         int count;
4668
4669         if ((m = mp) == NULL)
4670                 return;
4671         if (len >= 0) {
4672                 /*
4673                  * Trim from head.
4674                  */
4675                 while (m != NULL && len > 0) {
4676                         if (m->m_len <= len) {
4677                                 len -= m->m_len;
4678                                 m->m_len = 0;
4679                                 m = m->m_next;
4680                         } else {
4681                                 m->m_len -= len;
4682                                 m->m_data += len;
4683                                 len = 0;
4684                         }
4685                 }
4686                 m = mp;
4687                 if (m->m_flags & M_PKTHDR)
4688                         m->m_pkthdr.len -= (req_len - len);
4689         } else {
4690                 /*
4691                  * Trim from tail.  Scan the mbuf chain,
4692                  * calculating its length and finding the last mbuf.
4693                  * If the adjustment only affects this mbuf, then just
4694                  * adjust and return.  Otherwise, rescan and truncate
4695                  * after the remaining size.
4696                  */
4697                 len = -len;
4698                 count = 0;
4699                 for (;;) {
4700                         count += m->m_len;
4701                         if (m->m_next == (struct mbuf *)0)
4702                                 break;
4703                         m = m->m_next;
4704                 }
4705                 if (m->m_len >= len) {
4706                         m->m_len -= len;
4707                         m = mp;
4708                         if (m->m_flags & M_PKTHDR)
4709                                 m->m_pkthdr.len -= len;
4710                         return;
4711                 }
4712                 count -= len;
4713                 if (count < 0)
4714                         count = 0;
4715                 /*
4716                  * Correct length for chain is "count".
4717                  * Find the mbuf with last data, adjust its length,
4718                  * and toss data from remaining mbufs on chain.
4719                  */
4720                 m = mp;
4721                 if (m->m_flags & M_PKTHDR)
4722                         m->m_pkthdr.len = count;
4723                 for (; m; m = m->m_next) {
4724                         if (m->m_len >= count) {
4725                                 m->m_len = count;
4726                                 break;
4727                         }
4728                         count -= m->m_len;
4729                 }
4730                 while ((m = m->m_next))
4731                         m->m_len = 0;
4732         }
4733 }
4734
4735 /*
4736  * Rearange an mbuf chain so that len bytes are contiguous
4737  * and in the data area of an mbuf (so that mtod and dtom
4738  * will work for a structure of size len).  Returns the resulting
4739  * mbuf chain on success, frees it and returns null on failure.
4740  * If there is room, it will add up to max_protohdr-len extra bytes to the
4741  * contiguous region in an attempt to avoid being called next time.
4742  */
4743 int MPFail;
4744
4745 struct mbuf *
4746 m_pullup(struct mbuf *n, int len)
4747 {
4748         struct mbuf *m;
4749         int count;
4750         int space;
4751
4752         /*
4753          * If first mbuf has no cluster, and has room for len bytes
4754          * without shifting current data, pullup into it,
4755          * otherwise allocate a new mbuf to prepend to the chain.
4756          */
4757         if ((n->m_flags & M_EXT) == 0 &&
4758             n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4759                 if (n->m_len >= len)
4760                         return (n);
4761                 m = n;
4762                 n = n->m_next;
4763                 len -= m->m_len;
4764         } else {
4765                 if (len > MHLEN)
4766                         goto bad;
4767                 _MGET(m, M_DONTWAIT, n->m_type);
4768                 if (m == 0)
4769                         goto bad;
4770                 m->m_len = 0;
4771                 if (n->m_flags & M_PKTHDR) {
4772                         M_COPY_PKTHDR(m, n);
4773                         n->m_flags &= ~M_PKTHDR;
4774                 }
4775         }
4776         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4777         do {
4778                 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4779                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4780                     (unsigned)count);
4781                 len -= count;
4782                 m->m_len += count;
4783                 n->m_len -= count;
4784                 space -= count;
4785                 if (n->m_len)
4786                         n->m_data += count;
4787                 else
4788                         n = m_free(n);
4789         } while (len > 0 && n);
4790         if (len > 0) {
4791                 (void) m_free(m);
4792                 goto bad;
4793         }
4794         m->m_next = n;
4795         return (m);
4796 bad:
4797         m_freem(n);
4798         MPFail++;
4799         return (0);
4800 }
4801
4802 /*
4803  * Like m_pullup(), except a new mbuf is always allocated, and we allow
4804  * the amount of empty space before the data in the new mbuf to be specified
4805  * (in the event that the caller expects to prepend later).
4806  */
4807 __private_extern__ int MSFail = 0;
4808
4809 __private_extern__ struct mbuf *
4810 m_copyup(struct mbuf *n, int len, int dstoff)
4811 {
4812         struct mbuf *m;
4813         int count, space;
4814
4815         if (len > (MHLEN - dstoff))
4816                 goto bad;
4817         MGET(m, M_DONTWAIT, n->m_type);
4818         if (m == NULL)
4819                 goto bad;
4820         m->m_len = 0;
4821         if (n->m_flags & M_PKTHDR) {
4822                 m_copy_pkthdr(m, n);
4823                 n->m_flags &= ~M_PKTHDR;
4824         }
4825         m->m_data += dstoff;
4826         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4827         do {
4828                 count = min(min(max(len, max_protohdr), space), n->m_len);
4829                 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4830                     (unsigned)count);
4831                 len -= count;
4832                 m->m_len += count;
4833                 n->m_len -= count;
4834                 space -= count;
4835                 if (n->m_len)
4836                         n->m_data += count;
4837                 else
4838                         n = m_free(n);
4839         } while (len > 0 && n);
4840         if (len > 0) {
4841                 (void) m_free(m);
4842                 goto bad;
4843         }
4844         m->m_next = n;
4845         return (m);
4846 bad:
4847         m_freem(n);
4848         MSFail++;
4849         return (NULL);
4850 }
4851
4852 /*
4853  * Partition an mbuf chain in two pieces, returning the tail --
4854  * all but the first len0 bytes.  In case of failure, it returns NULL and
4855  * attempts to restore the chain to its original state.
4856  */
4857 struct mbuf *
4858 m_split(struct mbuf *m0, int len0, int wait)
4859 {
4860         return (m_split0(m0, len0, wait, 1));
4861 }
4862
4863 static struct mbuf *
4864 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4865 {
4866         struct mbuf *m, *n;
4867         unsigned len = len0, remain;
4868
4869         for (m = m0; m && len > m->m_len; m = m->m_next)
4870                 len -= m->m_len;
4871         if (m == NULL)
4872                 return (NULL);
4873         remain = m->m_len - len;
4874         if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4875                 _MGETHDR(n, wait, m0->m_type);
4876                 if (n == NULL)
4877                         return (NULL);
4878                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4879                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4880                 m0->m_pkthdr.len = len0;
4881                 if (m->m_flags & M_EXT)
4882                         goto extpacket;
4883                 if (remain > MHLEN) {
4884                         /* m can't be the lead packet */
4885                         MH_ALIGN(n, 0);
4886                         n->m_next = m_split(m, len, wait);
4887                         if (n->m_next == NULL) {
4888                                 (void) m_free(n);
4889                                 return (NULL);
4890                         } else
4891                                 return (n);
4892                 } else
4893                         MH_ALIGN(n, remain);
4894         } else if (remain == 0) {
4895                 n = m->m_next;
4896                 m->m_next = NULL;
4897                 return (n);
4898         } else {
4899                 _MGET(n, wait, m->m_type);
4900                 if (n == NULL)
4901                         return (NULL);
4902                 M_ALIGN(n, remain);
4903         }
4904 extpacket:
4905         if (m->m_flags & M_EXT) {
4906                 n->m_flags |= M_EXT;
4907                 n->m_ext = m->m_ext;
4908                 m_incref(m);
4909                 n->m_data = m->m_data + len;
4910         } else {
4911                 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4912         }
4913         n->m_len = remain;
4914         m->m_len = len;
4915         n->m_next = m->m_next;
4916         m->m_next = NULL;
4917         return (n);
4918 }
4919
4920 /*
4921  * Routine to copy from device local memory into mbufs.
4922  */
4923 struct mbuf *
4924 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4925     void (*copy)(const void *, void *, size_t))
4926 {
4927         struct mbuf *m;
4928         struct mbuf *top = NULL, **mp = &top;
4929         int off = off0, len;
4930         char *cp;
4931         char *epkt;
4932
4933         cp = buf;
4934         epkt = cp + totlen;
4935         if (off) {
4936                 /*
4937                  * If 'off' is non-zero, packet is trailer-encapsulated,
4938                  * so we have to skip the type and length fields.
4939                  */
4940                 cp += off + 2 * sizeof (u_int16_t);
4941                 totlen -= 2 * sizeof (u_int16_t);
4942         }
4943         _MGETHDR(m, M_DONTWAIT, MT_DATA);
4944         if (m == NULL)
4945                 return (NULL);
4946         m->m_pkthdr.rcvif = ifp;
4947         m->m_pkthdr.len = totlen;
4948         m->m_len = MHLEN;
4949
4950         while (totlen > 0) {
4951                 if (top != NULL) {
4952                         _MGET(m, M_DONTWAIT, MT_DATA);
4953                         if (m == NULL) {
4954                                 m_freem(top);
4955                                 return (NULL);
4956                         }
4957                         m->m_len = MLEN;
4958                 }
4959                 len = MIN(totlen, epkt - cp);
4960                 if (len >= MINCLSIZE) {
4961                         MCLGET(m, M_DONTWAIT);
4962                         if (m->m_flags & M_EXT) {
4963                                 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4964                         } else {
4965                                 /* give up when it's out of cluster mbufs */
4966                                 if (top != NULL)
4967                                         m_freem(top);
4968                                 m_freem(m);
4969                                 return (NULL);
4970                         }
4971                 } else {
4972                         /*
4973                          * Place initial small packet/header at end of mbuf.
4974                          */
4975                         if (len < m->m_len) {
4976                                 if (top == NULL &&
4977                                     len + max_linkhdr <= m->m_len)
4978                                         m->m_data += max_linkhdr;
4979                                 m->m_len = len;
4980                         } else {
4981                                 len = m->m_len;
4982                         }
4983                 }
4984                 if (copy)
4985                         copy(cp, MTOD(m, caddr_t), (unsigned)len);
4986                 else
4987                         bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4988                 cp += len;
4989                 *mp = m;
4990                 mp = &m->m_next;
4991                 totlen -= len;
4992                 if (cp == epkt)
4993                         cp = buf;
4994         }
4995         return (top);
4996 }
4997
4998 #ifndef MBUF_GROWTH_NORMAL_THRESH
4999 #define MBUF_GROWTH_NORMAL_THRESH 25
5000 #endif
5001
5002 /*
5003  * Cluster freelist allocation check.
5004  */
5005 static int
5006 m_howmany(int num, size_t bufsize)
5007 {
5008         int i = 0, j = 0;
5009         u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5010         u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5011         u_int32_t sumclusters, freeclusters;
5012         u_int32_t percent_pool, percent_kmem;
5013         u_int32_t mb_growth, mb_growth_thresh;
5014
5015         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5016             bufsize == m_maxsize(MC_16KCL));
5017
5018         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5019
5020         /* Numbers in 2K cluster units */
5021         m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5022         m_clusters = m_total(MC_CL);
5023         m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5024         m_16kclusters = m_total(MC_16KCL);
5025         sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5026
5027         m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5028         m_clfree = m_infree(MC_CL);
5029         m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5030         m_16kclfree = m_infree(MC_16KCL);
5031         freeclusters = m_mbfree + m_clfree + m_bigclfree;
5032
5033         /* Bail if we've maxed out the mbuf memory map */
5034         if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5035             (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5036             (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5037                 return (0);
5038         }
5039
5040         if (bufsize == m_maxsize(MC_BIGCL)) {
5041                 /* Under minimum */
5042                 if (m_bigclusters < m_minlimit(MC_BIGCL))
5043                         return (m_minlimit(MC_BIGCL) - m_bigclusters);
5044
5045                 percent_pool =
5046                     ((sumclusters - freeclusters) * 100) / sumclusters;
5047                 percent_kmem = (sumclusters * 100) / nclusters;
5048
5049                 /*
5050                  * If a light/normal user, grow conservatively (75%)
5051                  * If a heavy user, grow aggressively (50%)
5052                  */
5053                 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5054                         mb_growth = MB_GROWTH_NORMAL;
5055                 else
5056                         mb_growth = MB_GROWTH_AGGRESSIVE;
5057
5058                 if (percent_kmem < 5) {
5059                         /* For initial allocations */
5060                         i = num;
5061                 } else {
5062                         /* Return if >= MBIGCL_LOWAT clusters available */
5063                         if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5064                             m_total(MC_BIGCL) >=
5065                             MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5066                                 return (0);
5067
5068                         /* Ensure at least num clusters are accessible */
5069                         if (num >= m_infree(MC_BIGCL))
5070                                 i = num - m_infree(MC_BIGCL);
5071                         if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5072                                 j = num - (m_total(MC_BIGCL) -
5073                                     m_minlimit(MC_BIGCL));
5074
5075                         i = MAX(i, j);
5076
5077                         /*
5078                          * Grow pool if percent_pool > 75 (normal growth)
5079                          * or percent_pool > 50 (aggressive growth).
5080                          */
5081                         mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5082                         if (percent_pool > mb_growth_thresh)
5083                                 j = ((sumclusters + num) >> mb_growth) -
5084                                     freeclusters;
5085                         i = MAX(i, j);
5086                 }
5087
5088                 /* Check to ensure we didn't go over limits */
5089                 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5090                         i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5091                 if ((i << 1) + sumclusters >= nclusters)
5092                         i = (nclusters - sumclusters) >> 1;
5093                 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5094                 VERIFY(sumclusters + (i << 1) <= nclusters);
5095
5096         } else { /* 16K CL */
5097                 VERIFY(njcl > 0);
5098                 /* Under minimum */
5099                 if (m_16kclusters < MIN16KCL)
5100                         return (MIN16KCL - m_16kclusters);
5101                 if (m_16kclfree >= M16KCL_LOWAT)
5102                         return (0);
5103
5104                 /* Ensure at least num clusters are available */
5105                 if (num >= m_16kclfree)
5106                         i = num - m_16kclfree;
5107
5108                 /* Always grow 16KCL pool aggressively */
5109                 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5110                         j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5111                 i = MAX(i, j);
5112
5113                 /* Check to ensure we don't go over limit */
5114                 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5115                         i = m_maxlimit(MC_16KCL) - m_16kclusters;
5116                 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5117         }
5118         return (i);
5119 }
5120 /*
5121  * Return the number of bytes in the mbuf chain, m.
5122  */
5123 unsigned int
5124 m_length(struct mbuf *m)
5125 {
5126         struct mbuf *m0;
5127         unsigned int pktlen;
5128
5129         if (m->m_flags & M_PKTHDR)
5130                 return (m->m_pkthdr.len);
5131
5132         pktlen = 0;
5133         for (m0 = m; m0 != NULL; m0 = m0->m_next)
5134                 pktlen += m0->m_len;
5135         return (pktlen);
5136 }
5137
5138 /*
5139  * Copy data from a buffer back into the indicated mbuf chain,
5140  * starting "off" bytes from the beginning, extending the mbuf
5141  * chain if necessary.
5142  */
5143 void
5144 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5145 {
5146 #if DEBUG
5147         struct mbuf *origm = m0;
5148         int error;
5149 #endif /* DEBUG */
5150
5151         if (m0 == NULL)
5152                 return;
5153
5154 #if DEBUG
5155         error =
5156 #endif /* DEBUG */
5157         m_copyback0(&m0, off, len, cp,
5158             M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5159
5160 #if DEBUG
5161         if (error != 0 || (m0 != NULL && origm != m0))
5162                 panic("m_copyback");
5163 #endif /* DEBUG */
5164 }
5165
5166 struct mbuf *
5167 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5168 {
5169         int error;
5170
5171         /* don't support chain expansion */
5172         VERIFY(off + len <= m_length(m0));
5173
5174         error = m_copyback0(&m0, off, len, cp,
5175             M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5176         if (error) {
5177                 /*
5178                  * no way to recover from partial success.
5179                  * just free the chain.
5180                  */
5181                 m_freem(m0);
5182                 return (NULL);
5183         }
5184         return (m0);
5185 }
5186
5187 /*
5188  * m_makewritable: ensure the specified range writable.
5189  */
5190 int
5191 m_makewritable(struct mbuf **mp, int off, int len, int how)
5192 {
5193         int error;
5194 #if DEBUG
5195         struct mbuf *n;
5196         int origlen, reslen;
5197
5198         origlen = m_length(*mp);
5199 #endif /* DEBUG */
5200
5201 #if 0 /* M_COPYALL is large enough */
5202         if (len == M_COPYALL)
5203                 len = m_length(*mp) - off; /* XXX */
5204 #endif
5205
5206         error = m_copyback0(mp, off, len, NULL,
5207             M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5208
5209 #if DEBUG
5210         reslen = 0;
5211         for (n = *mp; n; n = n->m_next)
5212                 reslen += n->m_len;
5213         if (origlen != reslen)
5214                 panic("m_makewritable: length changed");
5215         if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5216                 panic("m_makewritable: inconsist");
5217 #endif /* DEBUG */
5218
5219         return (error);
5220 }
5221
5222 static int
5223 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5224     int how)
5225 {
5226         int mlen;
5227         struct mbuf *m, *n;
5228         struct mbuf **mp;
5229         int totlen = 0;
5230         const char *cp = vp;
5231
5232         VERIFY(mp0 != NULL);
5233         VERIFY(*mp0 != NULL);
5234         VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5235         VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5236
5237         /*
5238          * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5239          * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5240          */
5241
5242         VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5243
5244         mp = mp0;
5245         m = *mp;
5246         while (off > (mlen = m->m_len)) {
5247                 off -= mlen;
5248                 totlen += mlen;
5249                 if (m->m_next == NULL) {
5250                         int tspace;
5251 extend:
5252                         if (!(flags & M_COPYBACK0_EXTEND))
5253                                 goto out;
5254
5255                         /*
5256                          * try to make some space at the end of "m".
5257                          */
5258
5259                         mlen = m->m_len;
5260                         if (off + len >= MINCLSIZE &&
5261                             !(m->m_flags & M_EXT) && m->m_len == 0) {
5262                                 MCLGET(m, how);
5263                         }
5264                         tspace = M_TRAILINGSPACE(m);
5265                         if (tspace > 0) {
5266                                 tspace = MIN(tspace, off + len);
5267                                 VERIFY(tspace > 0);
5268                                 bzero(mtod(m, char *) + m->m_len,
5269                                     MIN(off, tspace));
5270                                 m->m_len += tspace;
5271                                 off += mlen;
5272                                 totlen -= mlen;
5273                                 continue;
5274                         }
5275
5276                         /*
5277                          * need to allocate an mbuf.
5278                          */
5279
5280                         if (off + len >= MINCLSIZE) {
5281                                 n = m_getcl(how, m->m_type, 0);
5282                         } else {
5283                                 n = _M_GET(how, m->m_type);
5284                         }
5285                         if (n == NULL) {
5286                                 goto out;
5287                         }
5288                         n->m_len = 0;
5289                         n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5290                         bzero(mtod(n, char *), MIN(n->m_len, off));
5291                         m->m_next = n;
5292                 }
5293                 mp = &m->m_next;
5294                 m = m->m_next;
5295         }
5296         while (len > 0) {
5297                 mlen = m->m_len - off;
5298                 if (mlen != 0 && m_mclhasreference(m)) {
5299                         char *datap;
5300                         int eatlen;
5301
5302                         /*
5303                          * this mbuf is read-only.
5304                          * allocate a new writable mbuf and try again.
5305                          */
5306
5307 #if defined(DIAGNOSTIC)
5308                         if (!(flags & M_COPYBACK0_COW))
5309                                 panic("m_copyback0: read-only");
5310 #endif /* defined(DIAGNOSTIC) */
5311
5312                         /*
5313                          * if we're going to write into the middle of
5314                          * a mbuf, split it first.
5315                          */
5316                         if (off > 0 && len < mlen) {
5317                                 n = m_split0(m, off, how, 0);
5318                                 if (n == NULL)
5319                                         goto enobufs;
5320                                 m->m_next = n;
5321                                 mp = &m->m_next;
5322                                 m = n;
5323                                 off = 0;
5324                                 continue;
5325                         }
5326
5327                         /*
5328                          * XXX TODO coalesce into the trailingspace of
5329                          * the previous mbuf when possible.
5330                          */
5331
5332                         /*
5333                          * allocate a new mbuf.  copy packet header if needed.
5334                          */
5335                         n = _M_GET(how, m->m_type);
5336                         if (n == NULL)
5337                                 goto enobufs;
5338                         if (off == 0 && (m->m_flags & M_PKTHDR)) {
5339                                 M_COPY_PKTHDR(n, m);
5340                                 n->m_len = MHLEN;
5341                         } else {
5342                                 if (len >= MINCLSIZE)
5343                                         MCLGET(n, M_DONTWAIT);
5344                                 n->m_len =
5345                                     (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5346                         }
5347                         if (n->m_len > len)
5348                                 n->m_len = len;
5349
5350                         /*
5351                          * free the region which has been overwritten.
5352                          * copying data from old mbufs if requested.
5353                          */
5354                         if (flags & M_COPYBACK0_PRESERVE)
5355                                 datap = mtod(n, char *);
5356                         else
5357                                 datap = NULL;
5358                         eatlen = n->m_len;
5359                         VERIFY(off == 0 || eatlen >= mlen);
5360                         if (off > 0) {
5361                                 VERIFY(len >= mlen);
5362                                 m->m_len = off;
5363                                 m->m_next = n;
5364                                 if (datap) {
5365                                         m_copydata(m, off, mlen, datap);
5366                                         datap += mlen;
5367                                 }
5368                                 eatlen -= mlen;
5369                                 mp = &m->m_next;
5370                                 m = m->m_next;
5371                         }
5372                         while (m != NULL && m_mclhasreference(m) &&
5373                             n->m_type == m->m_type && eatlen > 0) {
5374                                 mlen = MIN(eatlen, m->m_len);
5375                                 if (datap) {
5376                                         m_copydata(m, 0, mlen, datap);
5377                                         datap += mlen;
5378                                 }
5379                                 m->m_data += mlen;
5380                                 m->m_len -= mlen;
5381                                 eatlen -= mlen;
5382                                 if (m->m_len == 0)
5383                                         *mp = m = m_free(m);
5384                         }
5385                         if (eatlen > 0)
5386                                 n->m_len -= eatlen;
5387                         n->m_next = m;
5388                         *mp = m = n;
5389                         continue;
5390                 }
5391                 mlen = MIN(mlen, len);
5392                 if (flags & M_COPYBACK0_COPYBACK) {
5393                         bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5394                         cp += mlen;
5395                 }
5396                 len -= mlen;
5397                 mlen += off;
5398                 off = 0;
5399                 totlen += mlen;
5400                 if (len == 0)
5401                         break;
5402                 if (m->m_next == NULL) {
5403                         goto extend;
5404                 }
5405                 mp = &m->m_next;
5406                 m = m->m_next;
5407         }
5408 out:
5409         if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5410                 VERIFY(flags & M_COPYBACK0_EXTEND);
5411                 m->m_pkthdr.len = totlen;
5412         }
5413
5414         return (0);
5415
5416 enobufs:
5417         return (ENOBUFS);
5418 }
5419
5420 char *
5421 mcl_to_paddr(char *addr)
5422 {
5423         vm_offset_t base_phys;
5424
5425         if (!MBUF_IN_MAP(addr))
5426                 return (NULL);
5427         base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
5428
5429         if (base_phys == 0)
5430                 return (NULL);
5431         return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
5432 }
5433
5434 /*
5435  * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
5436  * And really copy the thing.  That way, we don't "precompute" checksums
5437  * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
5438  * small packets, don't dup into a cluster.  That way received  packets
5439  * don't take up too much room in the sockbuf (cf. sbspace()).
5440  */
5441 int MDFail;
5442
5443 struct mbuf *
5444 m_dup(struct mbuf *m, int how)
5445 {
5446         struct mbuf *n, **np;
5447         struct mbuf *top;
5448         int copyhdr = 0;
5449
5450         np = &top;
5451         top = NULL;
5452         if (m->m_flags & M_PKTHDR)
5453                 copyhdr = 1;
5454
5455         /*
5456          * Quick check: if we have one mbuf and its data fits in an
5457          *  mbuf with packet header, just copy and go.
5458          */
5459         if (m->m_next == NULL) {
5460                 /* Then just move the data into an mbuf and be done... */
5461                 if (copyhdr) {
5462                         if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5463                                 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5464                                         return (NULL);
5465                                 n->m_len = m->m_len;
5466                                 m_dup_pkthdr(n, m, how);
5467                                 bcopy(m->m_data, n->m_data, m->m_len);
5468                                 return (n);
5469                         }
5470                 } else if (m->m_len <= MLEN) {
5471                         if ((n = _M_GET(how, m->m_type)) == NULL)
5472                                 return (NULL);
5473                         bcopy(m->m_data, n->m_data, m->m_len);
5474                         n->m_len = m->m_len;
5475                         return (n);
5476                 }
5477         }
5478         while (m != NULL) {
5479 #if BLUE_DEBUG
5480                 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5481                     m->m_data);
5482 #endif
5483                 if (copyhdr)
5484                         n = _M_GETHDR(how, m->m_type);
5485                 else
5486                         n = _M_GET(how, m->m_type);
5487                 if (n == NULL)
5488                         goto nospace;
5489                 if (m->m_flags & M_EXT) {
5490                         if (m->m_len <= m_maxsize(MC_CL))
5491                                 MCLGET(n, how);
5492                         else if (m->m_len <= m_maxsize(MC_BIGCL))
5493                                 n = m_mbigget(n, how);
5494                         else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5495                                 n = m_m16kget(n, how);
5496                         if (!(n->m_flags & M_EXT)) {
5497                                 (void) m_free(n);
5498                                 goto nospace;
5499                         }
5500                 }
5501                 *np = n;
5502                 if (copyhdr) {
5503                         /* Don't use M_COPY_PKTHDR: preserve m_data */
5504                         m_dup_pkthdr(n, m, how);
5505                         copyhdr = 0;
5506                         if (!(n->m_flags & M_EXT))
5507                                 n->m_data = n->m_pktdat;
5508                 }
5509                 n->m_len = m->m_len;
5510                 /*
5511                  * Get the dup on the same bdry as the original
5512                  * Assume that the two mbufs have the same offset to data area
5513                  * (up to word boundaries)
5514                  */
5515                 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5516                 m = m->m_next;
5517                 np = &n->m_next;
5518 #if BLUE_DEBUG
5519                 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5520                     n->m_data);
5521 #endif
5522         }
5523
5524         if (top == NULL)
5525                 MDFail++;
5526         return (top);
5527
5528 nospace:
5529         m_freem(top);
5530         MDFail++;
5531         return (NULL);
5532 }
5533
5534 #define MBUF_MULTIPAGES(m)                                              \
5535         (((m)->m_flags & M_EXT) &&                                      \
5536         ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||      \
5537         (!IS_P2ALIGNED((m)->m_data, NBPG) &&                            \
5538         P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5539
5540 static struct mbuf *
5541 m_expand(struct mbuf *m, struct mbuf **last)
5542 {
5543         struct mbuf *top = NULL;
5544         struct mbuf **nm = &top;
5545         uintptr_t data0, data;
5546         unsigned int len0, len;
5547
5548         VERIFY(MBUF_MULTIPAGES(m));
5549         VERIFY(m->m_next == NULL);
5550         data0 = (uintptr_t)m->m_data;
5551         len0 = m->m_len;
5552         *last = top;
5553
5554         for (;;) {
5555                 struct mbuf *n;
5556
5557                 data = data0;
5558                 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5559                         len = NBPG;
5560                 else if (!IS_P2ALIGNED(data, NBPG) &&
5561                     P2ROUNDUP(data, NBPG) < (data + len0))
5562                         len = P2ROUNDUP(data, NBPG) - data;
5563                 else
5564                         len = len0;
5565
5566                 VERIFY(len > 0);
5567                 VERIFY(m->m_flags & M_EXT);
5568                 m->m_data = (void *)data;
5569                 m->m_len = len;
5570
5571                 *nm = *last = m;
5572                 nm = &m->m_next;
5573                 m->m_next = NULL;
5574
5575                 data0 += len;
5576                 len0 -= len;
5577                 if (len0 == 0)
5578                         break;
5579
5580                 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5581                 if (n == NULL) {
5582                         m_freem(top);
5583                         top = *last = NULL;
5584                         break;
5585                 }
5586
5587                 n->m_ext = m->m_ext;
5588                 m_incref(m);
5589                 n->m_flags |= M_EXT;
5590                 m = n;
5591         }
5592         return (top);
5593 }
5594
5595 struct mbuf *
5596 m_normalize(struct mbuf *m)
5597 {
5598         struct mbuf *top = NULL;
5599         struct mbuf **nm = &top;
5600         boolean_t expanded = FALSE;
5601
5602         while (m != NULL) {
5603                 struct mbuf *n;
5604
5605                 n = m->m_next;
5606                 m->m_next = NULL;
5607
5608                 /* Does the data cross one or more page boundaries? */
5609                 if (MBUF_MULTIPAGES(m)) {
5610                         struct mbuf *last;
5611                         if ((m = m_expand(m, &last)) == NULL) {
5612                                 m_freem(n);
5613                                 m_freem(top);
5614                                 top = NULL;
5615                                 break;
5616                         }
5617                         *nm = m;
5618                         nm = &last->m_next;
5619                         expanded = TRUE;
5620                 } else {
5621                         *nm = m;
5622                         nm = &m->m_next;
5623                 }
5624                 m = n;
5625         }
5626         if (expanded)
5627                 atomic_add_32(&mb_normalized, 1);
5628         return (top);
5629 }
5630
5631 /*
5632  * Append the specified data to the indicated mbuf chain,
5633  * Extend the mbuf chain if the new data does not fit in
5634  * existing space.
5635  *
5636  * Return 1 if able to complete the job; otherwise 0.
5637  */
5638 int
5639 m_append(struct mbuf *m0, int len, caddr_t cp)
5640 {
5641         struct mbuf *m, *n;
5642         int remainder, space;
5643
5644         for (m = m0; m->m_next != NULL; m = m->m_next)
5645                 ;
5646         remainder = len;
5647         space = M_TRAILINGSPACE(m);
5648         if (space > 0) {
5649                 /*
5650                  * Copy into available space.
5651                  */
5652                 if (space > remainder)
5653                         space = remainder;
5654                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5655                 m->m_len += space;
5656                 cp += space, remainder -= space;
5657         }
5658         while (remainder > 0) {
5659                 /*
5660                  * Allocate a new mbuf; could check space
5661                  * and allocate a cluster instead.
5662                  */
5663                 n = m_get(M_WAITOK, m->m_type);
5664                 if (n == NULL)
5665                         break;
5666                 n->m_len = min(MLEN, remainder);
5667                 bcopy(cp, mtod(n, caddr_t), n->m_len);
5668                 cp += n->m_len;
5669                 remainder -= n->m_len;
5670                 m->m_next = n;
5671                 m = n;
5672         }
5673         if (m0->m_flags & M_PKTHDR)
5674                 m0->m_pkthdr.len += len - remainder;
5675         return (remainder == 0);
5676 }
5677
5678 struct mbuf *
5679 m_last(struct mbuf *m)
5680 {
5681         while (m->m_next != NULL)
5682                 m = m->m_next;
5683         return (m);
5684 }
5685
5686 unsigned int
5687 m_fixhdr(struct mbuf *m0)
5688 {
5689         u_int len;
5690
5691         len = m_length2(m0, NULL);
5692         m0->m_pkthdr.len = len;
5693         return (len);
5694 }
5695
5696 unsigned int
5697 m_length2(struct mbuf *m0, struct mbuf **last)
5698 {
5699         struct mbuf *m;
5700         u_int len;
5701
5702         len = 0;
5703         for (m = m0; m != NULL; m = m->m_next) {
5704                 len += m->m_len;
5705                 if (m->m_next == NULL)
5706                         break;
5707         }
5708         if (last != NULL)
5709                 *last = m;
5710         return (len);
5711 }
5712
5713 /*
5714  * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5715  * and clusters.  If allocation fails and this cannot be completed, NULL will
5716  * be returned, but the passed in chain will be unchanged.  Upon success,
5717  * the original chain will be freed, and the new chain will be returned.
5718  *
5719  * If a non-packet header is passed in, the original mbuf (chain?) will
5720  * be returned unharmed.
5721  *
5722  * If offset is specfied, the first mbuf in the chain will have a leading
5723  * space of the amount stated by the "off" parameter.
5724  *
5725  * This routine requires that the m_pkthdr.header field of the original
5726  * mbuf chain is cleared by the caller.
5727  */
5728 struct mbuf *
5729 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5730 {
5731         struct mbuf *m_new = NULL, *m_final = NULL;
5732         int progress = 0, length, pktlen;
5733
5734         if (!(m0->m_flags & M_PKTHDR))
5735                 return (m0);
5736
5737         VERIFY(off < MHLEN);
5738         m_fixhdr(m0); /* Needed sanity check */
5739
5740         pktlen = m0->m_pkthdr.len + off;
5741         if (pktlen > MHLEN)
5742                 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5743         else
5744                 m_final = m_gethdr(how, MT_DATA);
5745
5746         if (m_final == NULL)
5747                 goto nospace;
5748
5749         if (off > 0) {
5750                 pktlen -= off;
5751                 m_final->m_len -= off;
5752                 m_final->m_data += off;
5753         }
5754
5755         /*
5756          * Caller must have handled the contents pointed to by this
5757          * pointer before coming here, as otherwise it will point to
5758          * the original mbuf which will get freed upon success.
5759          */
5760         VERIFY(m0->m_pkthdr.header == NULL);
5761
5762         if (m_dup_pkthdr(m_final, m0, how) == 0)
5763                 goto nospace;
5764
5765         m_new = m_final;
5766
5767         while (progress < pktlen) {
5768                 length = pktlen - progress;
5769                 if (length > MCLBYTES)
5770                         length = MCLBYTES;
5771
5772                 if (m_new == NULL) {
5773                         if (length > MLEN)
5774                                 m_new = m_getcl(how, MT_DATA, 0);
5775                         else
5776                                 m_new = m_get(how, MT_DATA);
5777                         if (m_new == NULL)
5778                                 goto nospace;
5779                 }
5780
5781                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5782                 progress += length;
5783                 m_new->m_len = length;
5784                 if (m_new != m_final)
5785                         m_cat(m_final, m_new);
5786                 m_new = NULL;
5787         }
5788         m_freem(m0);
5789         m0 = m_final;
5790         return (m0);
5791 nospace:
5792         if (m_final)
5793                 m_freem(m_final);
5794         return (NULL);
5795 }
5796
5797 struct mbuf *
5798 m_defrag(struct mbuf *m0, int how)
5799 {
5800         return (m_defrag_offset(m0, 0, how));
5801 }
5802
5803 void
5804 m_mchtype(struct mbuf *m, int t)
5805 {
5806         mtype_stat_inc(t);
5807         mtype_stat_dec(m->m_type);
5808         (m)->m_type = t;
5809 }
5810
5811 void *
5812 m_mtod(struct mbuf *m)
5813 {
5814         return (MTOD(m, void *));
5815 }
5816
5817 struct mbuf *
5818 m_dtom(void *x)
5819 {
5820         return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5821 }
5822
5823 void
5824 m_mcheck(struct mbuf *m)
5825 {
5826         _MCHECK(m);
5827 }
5828
5829 /*
5830  * Return a pointer to mbuf/offset of location in mbuf chain.
5831  */
5832 struct mbuf *
5833 m_getptr(struct mbuf *m, int loc, int *off)
5834 {
5835
5836         while (loc >= 0) {
5837                 /* Normal end of search. */
5838                 if (m->m_len > loc) {
5839                         *off = loc;
5840                         return (m);
5841                 } else {
5842                         loc -= m->m_len;
5843                         if (m->m_next == NULL) {
5844                                 if (loc == 0) {
5845                                         /* Point at the end of valid data. */
5846                                         *off = m->m_len;
5847                                         return (m);
5848                                 }
5849                                 return (NULL);
5850                         }
5851                         m = m->m_next;
5852                 }
5853         }
5854         return (NULL);
5855 }
5856
5857 /*
5858  * Inform the corresponding mcache(s) that there's a waiter below.
5859  */
5860 static void
5861 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5862 {
5863         mcache_waiter_inc(m_cache(class));
5864         if (comp) {
5865                 if (class == MC_CL) {
5866                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
5867                 } else if (class == MC_BIGCL) {
5868                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5869                 } else if (class == MC_16KCL) {
5870                         mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5871                 } else {
5872                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
5873                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5874                 }
5875         }
5876 }
5877
5878 /*
5879  * Inform the corresponding mcache(s) that there's no more waiter below.
5880  */
5881 static void
5882 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5883 {
5884         mcache_waiter_dec(m_cache(class));
5885         if (comp) {
5886                 if (class == MC_CL) {
5887                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
5888                 } else if (class == MC_BIGCL) {
5889                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5890                 } else if (class == MC_16KCL) {
5891                         mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5892                 } else {
5893                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
5894                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5895                 }
5896         }
5897 }
5898
5899 /*
5900  * Called during slab (blocking and non-blocking) allocation.  If there
5901  * is at least one waiter, and the time since the first waiter is blocked
5902  * is greater than the watchdog timeout, panic the system.
5903  */
5904 static void
5905 mbuf_watchdog(void)
5906 {
5907         struct timeval now;
5908         unsigned int since;
5909
5910         if (mb_waiters == 0 || !mb_watchdog)
5911                 return;
5912
5913         microuptime(&now);
5914         since = now.tv_sec - mb_wdtstart.tv_sec;
5915         if (since >= MB_WDT_MAXTIME) {
5916                 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
5917                     mb_waiters, since, mbuf_dump());
5918                 /* NOTREACHED */
5919         }
5920 }
5921
5922 /*
5923  * Called during blocking allocation.  Returns TRUE if one or more objects
5924  * are available at the per-CPU caches layer and that allocation should be
5925  * retried at that level.
5926  */
5927 static boolean_t
5928 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5929 {
5930         boolean_t mcache_retry = FALSE;
5931
5932         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5933
5934         /* Check if there's anything at the cache layer */
5935         if (mbuf_cached_above(class, wait)) {
5936                 mcache_retry = TRUE;
5937                 goto done;
5938         }
5939
5940         /* Nothing?  Then try hard to get it from somewhere */
5941         m_reclaim(class, num, (wait & MCR_COMP));
5942
5943         /* We tried hard and got something? */
5944         if (m_infree(class) > 0) {
5945                 mbstat.m_wait++;
5946                 goto done;
5947         } else if (mbuf_cached_above(class, wait)) {
5948                 mbstat.m_wait++;
5949                 mcache_retry = TRUE;
5950                 goto done;
5951         } else if (wait & MCR_TRYHARD) {
5952                 mcache_retry = TRUE;
5953                 goto done;
5954         }
5955
5956         /*
5957          * There's really nothing for us right now; inform the
5958          * cache(s) that there is a waiter below and go to sleep.
5959          */
5960         mbuf_waiter_inc(class, (wait & MCR_COMP));
5961
5962         VERIFY(!(wait & MCR_NOSLEEP));
5963
5964         /*
5965          * If this is the first waiter, arm the watchdog timer.  Otherwise
5966          * check if we need to panic the system due to watchdog timeout.
5967          */
5968         if (mb_waiters == 0)
5969                 microuptime(&mb_wdtstart);
5970         else
5971                 mbuf_watchdog();
5972
5973         mb_waiters++;
5974         (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5975
5976         /* We are now up; stop getting notified until next round */
5977         mbuf_waiter_dec(class, (wait & MCR_COMP));
5978
5979         /* We waited and got something */
5980         if (m_infree(class) > 0) {
5981                 mbstat.m_wait++;
5982                 goto done;
5983         } else if (mbuf_cached_above(class, wait)) {
5984                 mbstat.m_wait++;
5985                 mcache_retry = TRUE;
5986         }
5987 done:
5988         return (mcache_retry);
5989 }
5990
5991 static void
5992 mbuf_worker_thread(void)
5993 {
5994         int mbuf_expand;
5995
5996         while (1) {
5997                 lck_mtx_lock(mbuf_mlock);
5998
5999                 mbuf_expand = 0;
6000                 if (mbuf_expand_mcl) {
6001                         int n;
6002
6003                         /* Adjust to current number of cluster in use */
6004                         n = mbuf_expand_mcl -
6005                             (m_total(MC_CL) - m_infree(MC_CL));
6006                         if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6007                                 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6008                         mbuf_expand_mcl = 0;
6009
6010                         if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6011                                 mbuf_expand++;
6012                 }
6013                 if (mbuf_expand_big) {
6014                         int n;
6015
6016                         /* Adjust to current number of 4 KB cluster in use */
6017                         n = mbuf_expand_big -
6018                             (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6019                         if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6020                                 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6021                         mbuf_expand_big = 0;
6022
6023                         if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6024                                 mbuf_expand++;
6025                 }
6026                 if (mbuf_expand_16k) {
6027                         int n;
6028
6029                         /* Adjust to current number of 16 KB cluster in use */
6030                         n = mbuf_expand_16k -
6031                             (m_total(MC_16KCL) - m_infree(MC_16KCL));
6032                         if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6033                                 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6034                         mbuf_expand_16k = 0;
6035
6036                         if (n > 0)
6037                                 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6038                 }
6039
6040                 /*
6041                  * Because we can run out of memory before filling the mbuf
6042                  * map, we should not allocate more clusters than they are
6043                  * mbufs -- otherwise we could have a large number of useless
6044                  * clusters allocated.
6045                  */
6046                 if (mbuf_expand) {
6047                         while (m_total(MC_MBUF) <
6048                             (m_total(MC_BIGCL) + m_total(MC_CL))) {
6049                                 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6050                                         break;
6051                         }
6052                 }
6053
6054                 lck_mtx_unlock(mbuf_mlock);
6055
6056                 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6057                 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6058         }
6059 }
6060
6061 static void
6062 mbuf_worker_thread_init(void)
6063 {
6064         mbuf_worker_ready++;
6065         mbuf_worker_thread();
6066 }
6067
6068 static mcl_slab_t *
6069 slab_get(void *buf)
6070 {
6071         mcl_slabg_t *slg;
6072         unsigned int ix, k;
6073
6074         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6075
6076         VERIFY(MBUF_IN_MAP(buf));
6077         ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6078         VERIFY(ix < maxslabgrp);
6079
6080         if ((slg = slabstbl[ix]) == NULL) {
6081                 /*
6082                  * In the current implementation, we never shrink the memory
6083                  * pool (hence the cluster map); if we attempt to reallocate
6084                  * a cluster group when it's already allocated, panic since
6085                  * this is a sign of a memory corruption (slabstbl[ix] got
6086                  * nullified).  This also means that there shouldn't be any
6087                  * hole in the kernel sub-map for the mbuf pool.
6088                  */
6089                 ++slabgrp;
6090                 VERIFY(ix < slabgrp);
6091                 /*
6092                  * Slabs expansion can only be done single threaded; when
6093                  * we get here, it must be as a result of m_clalloc() which
6094                  * is serialized and therefore mb_clalloc_busy must be set.
6095                  */
6096                 VERIFY(mb_clalloc_busy);
6097                 lck_mtx_unlock(mbuf_mlock);
6098
6099                 /* This is a new buffer; create the slabs group for it */
6100                 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6101                     M_WAITOK | M_ZERO);
6102                 VERIFY(slg != NULL);
6103
6104                 lck_mtx_lock(mbuf_mlock);
6105                 /*
6106                  * No other thread could have gone into m_clalloc() after
6107                  * we dropped the lock above, so verify that it's true.
6108                  */
6109                 VERIFY(mb_clalloc_busy);
6110
6111                 slabstbl[ix] = slg;
6112
6113                 /* Chain each slab in the group to its forward neighbor */
6114                 for (k = 1; k < NSLABSPMB; k++)
6115                         slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6116                 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6117
6118                 /* And chain the last slab in the previous group to this */
6119                 if (ix > 0) {
6120                         VERIFY(slabstbl[ix - 1]->
6121                             slg_slab[NSLABSPMB - 1].sl_next == NULL);
6122                         slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6123                             &slg->slg_slab[0];
6124                 }
6125         }
6126
6127         ix = MTOBG(buf) % NSLABSPMB;
6128         VERIFY(ix < NSLABSPMB);
6129
6130         return (&slg->slg_slab[ix]);
6131 }
6132
6133 static void
6134 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6135     void *base, void *head, unsigned int len, int refcnt, int chunks)
6136 {
6137         sp->sl_class = class;
6138         sp->sl_flags = flags;
6139         sp->sl_base = base;
6140         sp->sl_head = head;
6141         sp->sl_len = len;
6142         sp->sl_refcnt = refcnt;
6143         sp->sl_chunks = chunks;
6144         slab_detach(sp);
6145 }
6146
6147 static void
6148 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6149 {
6150         VERIFY(slab_is_detached(sp));
6151         m_slab_cnt(class)++;
6152         TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6153         sp->sl_flags &= ~SLF_DETACHED;
6154         if (class == MC_16KCL) {
6155                 int k;
6156                 for (k = 1; k < NSLABSP16KB; k++) {
6157                         sp = sp->sl_next;
6158                         /* Next slab must already be present */
6159                         VERIFY(sp != NULL);
6160                         VERIFY(slab_is_detached(sp));
6161                         sp->sl_flags &= ~SLF_DETACHED;
6162                 }
6163         }
6164 }
6165
6166 static void
6167 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6168 {
6169         VERIFY(!slab_is_detached(sp));
6170         VERIFY(m_slab_cnt(class) > 0);
6171         m_slab_cnt(class)--;
6172         TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6173         slab_detach(sp);
6174         if (class == MC_16KCL) {
6175                 int k;
6176                 for (k = 1; k < NSLABSP16KB; k++) {
6177                         sp = sp->sl_next;
6178                         /* Next slab must already be present */
6179                         VERIFY(sp != NULL);
6180                         VERIFY(!slab_is_detached(sp));
6181                         slab_detach(sp);
6182                 }
6183         }
6184 }
6185
6186 static boolean_t
6187 slab_inrange(mcl_slab_t *sp, void *buf)
6188 {
6189         return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6190             (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6191 }
6192
6193 #undef panic
6194
6195 static void
6196 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6197 {
6198         int i;
6199         unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6200         uintptr_t buf = (uintptr_t)sp->sl_base;
6201
6202         for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6203                 void *next = ((mcache_obj_t *)buf)->obj_next;
6204                 if (next != addr)
6205                         continue;
6206                 if (!mclverify) {
6207                         if (next != NULL && !MBUF_IN_MAP(next)) {
6208                                 mcache_t *cp = m_cache(sp->sl_class);
6209                                 panic("%s: %s buffer %p in slab %p modified "
6210                                     "after free at offset 0: %p out of range "
6211                                     "[%p-%p)\n", __func__, cp->mc_name,
6212                                     (void *)buf, sp, next, mbutl, embutl);
6213                                 /* NOTREACHED */
6214                         }
6215                 } else {
6216                         mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6217                             (mcache_obj_t *)buf);
6218                         mcl_audit_verify_nextptr(next, mca);
6219                 }
6220         }
6221 }
6222
6223 static void
6224 slab_detach(mcl_slab_t *sp)
6225 {
6226         sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6227         sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6228         sp->sl_flags |= SLF_DETACHED;
6229 }
6230
6231 static boolean_t
6232 slab_is_detached(mcl_slab_t *sp)
6233 {
6234         return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6235             (intptr_t)sp->sl_link.tqe_prev == -1 &&
6236             (sp->sl_flags & SLF_DETACHED));
6237 }
6238
6239 static void
6240 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6241     mcache_obj_t **con_list, size_t con_size, unsigned int num)
6242 {
6243         mcache_audit_t *mca, *mca_tail;
6244         mcache_obj_t *con = NULL;
6245         boolean_t save_contents = (con_list != NULL);
6246         unsigned int i, ix;
6247
6248         ASSERT(num <= NMBPBG);
6249         ASSERT(con_list == NULL || con_size != 0);
6250
6251         ix = MTOBG(buf);
6252         VERIFY(ix < maxclaudit);
6253
6254         /* Make sure we haven't been here before */
6255         for (i = 0; i < NMBPBG; i++)
6256                 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6257
6258         mca = mca_tail = *mca_list;
6259         if (save_contents)
6260                 con = *con_list;
6261
6262         for (i = 0; i < num; i++) {
6263                 mcache_audit_t *next;
6264
6265                 next = mca->mca_next;
6266                 bzero(mca, sizeof (*mca));
6267                 mca->mca_next = next;
6268                 mclaudit[ix].cl_audit[i] = mca;
6269
6270                 /* Attach the contents buffer if requested */
6271                 if (save_contents) {
6272                         VERIFY(con != NULL);
6273                         mca->mca_contents_size = con_size;
6274                         mca->mca_contents = con;
6275                         con = con->obj_next;
6276                         bzero(mca->mca_contents, mca->mca_contents_size);
6277                 }
6278
6279                 mca_tail = mca;
6280                 mca = mca->mca_next;
6281         }
6282
6283         if (save_contents)
6284                 *con_list = con;
6285
6286         *mca_list = mca_tail->mca_next;
6287         mca_tail->mca_next = NULL;
6288 }
6289
6290 /*
6291  * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6292  * the corresponding audit structure for that buffer.
6293  */
6294 static mcache_audit_t *
6295 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6296 {
6297         mcache_audit_t *mca = NULL;
6298         int ix = MTOBG(o);
6299
6300         VERIFY(ix < maxclaudit);
6301         VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6302
6303         switch (class) {
6304         case MC_MBUF:
6305                 /*
6306                  * For the mbuf case, find the index of the page
6307                  * used by the mbuf and use that index to locate the
6308                  * base address of the page.  Then find out the
6309                  * mbuf index relative to the page base and use
6310                  * it to locate the audit structure.
6311                  */
6312                 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6313                 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6314                 break;
6315
6316         case MC_CL:
6317                 /*
6318                  * Same thing as above, but for 2KB clusters in a page.
6319                  */
6320                 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6321                 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6322                 break;
6323
6324         case MC_BIGCL:
6325         case MC_16KCL:
6326                 /*
6327                  * Same as above, but only return the first element.
6328                  */
6329                 mca = mclaudit[ix].cl_audit[0];
6330                 break;
6331
6332         default:
6333                 VERIFY(0);
6334                 /* NOTREACHED */
6335         }
6336
6337         return (mca);
6338 }
6339
6340 static void
6341 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6342     boolean_t alloc)
6343 {
6344         struct mbuf *m = addr;
6345         mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6346
6347         VERIFY(mca->mca_contents != NULL &&
6348             mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6349
6350         if (mclverify)
6351                 mcl_audit_verify_nextptr(next, mca);
6352
6353         if (!alloc) {
6354                 /* Save constructed mbuf fields */
6355                 mcl_audit_save_mbuf(m, mca);
6356                 if (mclverify) {
6357                         mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6358                             m_maxsize(MC_MBUF));
6359                 }
6360                 ((mcache_obj_t *)m)->obj_next = next;
6361                 return;
6362         }
6363
6364         /* Check if the buffer has been corrupted while in freelist */
6365         if (mclverify) {
6366                 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6367         }
6368         /* Restore constructed mbuf fields */
6369         mcl_audit_restore_mbuf(m, mca, composite);
6370 }
6371
6372 static void
6373 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6374 {
6375         struct mbuf *ms = (struct mbuf *)mca->mca_contents;
6376
6377         if (composite) {
6378                 struct mbuf *next = m->m_next;
6379                 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6380                     MBUF_IS_COMPOSITE(ms));
6381                 /*
6382                  * We could have hand-picked the mbuf fields and restore
6383                  * them individually, but that will be a maintenance
6384                  * headache.  Instead, restore everything that was saved;
6385                  * the mbuf layer will recheck and reinitialize anyway.
6386                  */
6387                 bcopy(ms, m, mca->mca_contents_size);
6388                 m->m_next = next;
6389         } else {
6390                 /*
6391                  * For a regular mbuf (no cluster attached) there's nothing
6392                  * to restore other than the type field, which is expected
6393                  * to be MT_FREE.
6394                  */
6395                 m->m_type = ms->m_type;
6396         }
6397         _MCHECK(m);
6398 }
6399
6400 static void
6401 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6402 {
6403         _MCHECK(m);
6404         bcopy(m, mca->mca_contents, mca->mca_contents_size);
6405 }
6406
6407 static void
6408 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6409     boolean_t save_next)
6410 {
6411         mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6412
6413         if (!alloc) {
6414                 if (mclverify) {
6415                         mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6416                 }
6417                 if (save_next) {
6418                         mcl_audit_verify_nextptr(next, mca);
6419                         ((mcache_obj_t *)addr)->obj_next = next;
6420                 }
6421         } else if (mclverify) {
6422                 /* Check if the buffer has been corrupted while in freelist */
6423                 mcl_audit_verify_nextptr(next, mca);
6424                 mcache_audit_free_verify_set(mca, addr, 0, size);
6425         }
6426 }
6427
6428 static void
6429 mcl_audit_mcheck_panic(struct mbuf *m)
6430 {
6431         mcache_audit_t *mca;
6432
6433         MRANGE(m);
6434         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6435
6436         panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6437             m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6438         /* NOTREACHED */
6439 }
6440
6441 static void
6442 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6443 {
6444         if (next != NULL && !MBUF_IN_MAP(next) &&
6445             (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6446                 panic("mcl_audit: buffer %p modified after free at offset 0: "
6447                     "%p out of range [%p-%p)\n%s\n",
6448                     mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6449                 /* NOTREACHED */
6450         }
6451 }
6452
6453 /* This function turns on mbuf leak detection */
6454 static void
6455 mleak_activate(void)
6456 {
6457         mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6458         PE_parse_boot_argn("mleak_sample_factor",
6459             &mleak_table.mleak_sample_factor,
6460             sizeof (mleak_table.mleak_sample_factor));
6461
6462         if (mleak_table.mleak_sample_factor == 0)
6463                 mclfindleak = 0;
6464
6465         if (mclfindleak == 0)
6466                 return;
6467
6468         vm_size_t alloc_size =
6469             mleak_alloc_buckets * sizeof (struct mallocation);
6470         vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6471
6472         MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6473             M_TEMP, M_WAITOK | M_ZERO);
6474         VERIFY(mleak_allocations != NULL);
6475
6476         MALLOC(mleak_traces, struct mtrace *, trace_size,
6477             M_TEMP, M_WAITOK | M_ZERO);
6478         VERIFY(mleak_traces != NULL);
6479
6480         MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6481             M_TEMP, M_WAITOK | M_ZERO);
6482         VERIFY(mleak_stat != NULL);
6483         mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6484 #ifdef __LP64__
6485         mleak_stat->ml_isaddr64 = 1;
6486 #endif /* __LP64__ */
6487 }
6488
6489 static void
6490 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6491 {
6492         int temp;
6493
6494         if (mclfindleak == 0)
6495                 return;
6496
6497         if (!alloc)
6498                 return (mleak_free(addr));
6499
6500         temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6501
6502         if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6503                 uintptr_t bt[MLEAK_STACK_DEPTH];
6504                 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6505                 mleak_log(bt, addr, logged, num);
6506         }
6507 }
6508
6509 /*
6510  * This function records the allocation in the mleak_allocations table
6511  * and the backtrace in the mleak_traces table; if allocation slot is in use,
6512  * replace old allocation with new one if the trace slot is in use, return
6513  * (or increment refcount if same trace).
6514  */
6515 static boolean_t
6516 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6517 {
6518         struct mallocation *allocation;
6519         struct mtrace *trace;
6520         uint32_t trace_index;
6521
6522         /* Quit if someone else modifying the tables */
6523         if (!lck_mtx_try_lock_spin(mleak_lock)) {
6524                 mleak_table.total_conflicts++;
6525                 return (FALSE);
6526         }
6527
6528         allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6529             mleak_alloc_buckets)];
6530         trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6531         trace = &mleak_traces[trace_index];
6532
6533         VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6534         VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6535
6536         allocation->hitcount++;
6537         trace->hitcount++;
6538
6539         /*
6540          * If the allocation bucket we want is occupied
6541          * and the occupier has the same trace, just bail.
6542          */
6543         if (allocation->element != NULL &&
6544             trace_index == allocation->trace_index) {
6545                 mleak_table.alloc_collisions++;
6546                 lck_mtx_unlock(mleak_lock);
6547                 return (TRUE);
6548         }
6549
6550         /*
6551          * Store the backtrace in the traces array;
6552          * Size of zero = trace bucket is free.
6553          */
6554         if (trace->allocs > 0 &&
6555             bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6556                 /* Different, unique trace, but the same hash! Bail out. */
6557                 trace->collisions++;
6558                 mleak_table.trace_collisions++;
6559                 lck_mtx_unlock(mleak_lock);
6560                 return (TRUE);
6561         } else if (trace->allocs > 0) {
6562                 /* Same trace, already added, so increment refcount */
6563                 trace->allocs++;
6564         } else {
6565                 /* Found an unused trace bucket, so record the trace here */
6566                 if (trace->depth != 0) {
6567                         /* this slot previously used but not currently in use */
6568                         mleak_table.trace_overwrites++;
6569                 }
6570                 mleak_table.trace_recorded++;
6571                 trace->allocs = 1;
6572                 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6573                 trace->depth = depth;
6574                 trace->collisions = 0;
6575         }
6576
6577         /* Step 2: Store the allocation record in the allocations array */
6578         if (allocation->element != NULL) {
6579                 /*
6580                  * Replace an existing allocation.  No need to preserve
6581                  * because only a subset of the allocations are being
6582                  * recorded anyway.
6583                  */
6584                 mleak_table.alloc_collisions++;
6585         } else if (allocation->trace_index != 0) {
6586                 mleak_table.alloc_overwrites++;
6587         }
6588         allocation->element = addr;
6589         allocation->trace_index = trace_index;
6590         allocation->count = num;
6591         mleak_table.alloc_recorded++;
6592         mleak_table.outstanding_allocs++;
6593
6594         lck_mtx_unlock(mleak_lock);
6595         return (TRUE);
6596 }
6597
6598 static void
6599 mleak_free(mcache_obj_t *addr)
6600 {
6601         while (addr != NULL) {
6602                 struct mallocation *allocation = &mleak_allocations
6603                     [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6604
6605                 if (allocation->element == addr &&
6606                     allocation->trace_index < mleak_trace_buckets) {
6607                         lck_mtx_lock_spin(mleak_lock);
6608                         if (allocation->element == addr &&
6609                             allocation->trace_index < mleak_trace_buckets) {
6610                                 struct mtrace *trace;
6611                                 trace = &mleak_traces[allocation->trace_index];
6612                                 /* allocs = 0 means trace bucket is unused */
6613                                 if (trace->allocs > 0)
6614                                         trace->allocs--;
6615                                 if (trace->allocs == 0)
6616                                         trace->depth = 0;
6617                                 /* NULL element means alloc bucket is unused */
6618                                 allocation->element = NULL;
6619                                 mleak_table.outstanding_allocs--;
6620                         }
6621                         lck_mtx_unlock(mleak_lock);
6622                 }
6623                 addr = addr->obj_next;
6624         }
6625 }
6626
6627 static void
6628 mleak_sort_traces()
6629 {
6630         int i, j, k;
6631         struct mtrace *swap;
6632
6633         for(i = 0; i < MLEAK_NUM_TRACES; i++)
6634                 mleak_top_trace[i] = NULL;
6635
6636         for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6637         {
6638                 if (mleak_traces[i].allocs <= 0)
6639                         continue;
6640
6641                 mleak_top_trace[j] = &mleak_traces[i];
6642                 for (k = j; k > 0; k--) {
6643                         if (mleak_top_trace[k]->allocs <=
6644                             mleak_top_trace[k-1]->allocs)
6645                                 break;
6646
6647                         swap = mleak_top_trace[k-1];
6648                         mleak_top_trace[k-1] = mleak_top_trace[k];
6649                         mleak_top_trace[k] = swap;
6650                 }
6651                 j++;
6652         }
6653
6654         j--;
6655         for(; i < mleak_trace_buckets; i++) {
6656                 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6657                         continue;
6658
6659                 mleak_top_trace[j] = &mleak_traces[i];
6660
6661                 for (k = j; k > 0; k--) {
6662                         if (mleak_top_trace[k]->allocs <=
6663                             mleak_top_trace[k-1]->allocs)
6664                                 break;
6665
6666                         swap = mleak_top_trace[k-1];
6667                         mleak_top_trace[k-1] = mleak_top_trace[k];
6668                         mleak_top_trace[k] = swap;
6669                 }
6670         }
6671 }
6672
6673 static void
6674 mleak_update_stats()
6675 {
6676         mleak_trace_stat_t *mltr;
6677         int i;
6678
6679         VERIFY(mleak_stat != NULL);
6680 #ifdef __LP64__
6681         VERIFY(mleak_stat->ml_isaddr64);
6682 #else
6683         VERIFY(!mleak_stat->ml_isaddr64);
6684 #endif /* !__LP64__ */
6685         VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6686
6687         mleak_sort_traces();
6688
6689         mltr = &mleak_stat->ml_trace[0];
6690         bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6691         for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6692         int j;
6693
6694                 if (mleak_top_trace[i] == NULL ||
6695                     mleak_top_trace[i]->allocs == 0)
6696                         continue;
6697
6698                 mltr->mltr_collisions   = mleak_top_trace[i]->collisions;
6699                 mltr->mltr_hitcount     = mleak_top_trace[i]->hitcount;
6700                 mltr->mltr_allocs       = mleak_top_trace[i]->allocs;
6701                 mltr->mltr_depth        = mleak_top_trace[i]->depth;
6702
6703                 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6704                 for (j = 0; j < mltr->mltr_depth; j++)
6705                         mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6706
6707                 mltr++;
6708         }
6709 }
6710
6711 static struct mbtypes {
6712         int             mt_type;
6713         const char      *mt_name;
6714 } mbtypes[] = {
6715         { MT_DATA,      "data" },
6716         { MT_OOBDATA,   "oob data" },
6717         { MT_CONTROL,   "ancillary data" },
6718         { MT_HEADER,    "packet headers" },
6719         { MT_SOCKET,    "socket structures" },
6720         { MT_PCB,       "protocol control blocks" },
6721         { MT_RTABLE,    "routing table entries" },
6722         { MT_HTABLE,    "IMP host table entries" },
6723         { MT_ATABLE,    "address resolution tables" },
6724         { MT_FTABLE,    "fragment reassembly queue headers" },
6725         { MT_SONAME,    "socket names and addresses" },
6726         { MT_SOOPTS,    "socket options" },
6727         { MT_RIGHTS,    "access rights" },
6728         { MT_IFADDR,    "interface addresses" },
6729         { MT_TAG,       "packet tags" },
6730         { 0,            NULL }
6731 };
6732
6733 #define MBUF_DUMP_BUF_CHK() {   \
6734         clen -= k;              \
6735         if (clen < 1)           \
6736                 goto done;      \
6737         c += k;                 \
6738 }
6739
6740 static char *
6741 mbuf_dump(void)
6742 {
6743         unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6744         u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6745         u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6746         u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6747         int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6748         uint8_t seen[256];
6749         struct mbtypes *mp;
6750         mb_class_stat_t *sp;
6751         mleak_trace_stat_t *mltr;
6752         char *c = mbuf_dump_buf;
6753         int i, k, clen = MBUF_DUMP_BUF_SIZE;
6754
6755         mbuf_dump_buf[0] = '\0';
6756
6757         /* synchronize all statistics in the mbuf table */
6758         mbuf_stat_sync();
6759         mbuf_mtypes_sync(TRUE);
6760
6761         sp = &mb_stat->mbs_class[0];
6762         for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6763                 u_int32_t mem;
6764
6765                 if (m_class(i) == MC_MBUF) {
6766                         m_mbufs = sp->mbcl_active;
6767                 } else if (m_class(i) == MC_CL) {
6768                         m_clfree = sp->mbcl_total - sp->mbcl_active;
6769                 } else if (m_class(i) == MC_BIGCL) {
6770                         m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6771                 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6772                         m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6773                         m_16kclusters = sp->mbcl_total;
6774                 } else if (m_class(i) == MC_MBUF_CL) {
6775                         m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6776                 } else if (m_class(i) == MC_MBUF_BIGCL) {
6777                         m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6778                 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6779                         m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6780                 }
6781
6782                 mem = sp->mbcl_ctotal * sp->mbcl_size;
6783                 totmem += mem;
6784                 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6785                     sp->mbcl_size;
6786
6787         }
6788
6789         /* adjust free counts to include composite caches */
6790         m_clfree += m_mbufclfree;
6791         m_bigclfree += m_mbufbigclfree;
6792         m_16kclfree += m_mbuf16kclfree;
6793
6794         totmbufs = 0;
6795         for (mp = mbtypes; mp->mt_name != NULL; mp++)
6796                 totmbufs += mbstat.m_mtypes[mp->mt_type];
6797         if (totmbufs > m_mbufs)
6798                 totmbufs = m_mbufs;
6799         k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6800         MBUF_DUMP_BUF_CHK();
6801
6802         bzero(&seen, sizeof (seen));
6803         for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6804                 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6805                         seen[mp->mt_type] = 1;
6806                         k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6807                             mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6808                         MBUF_DUMP_BUF_CHK();
6809                 }
6810         }
6811         seen[MT_FREE] = 1;
6812         for (i = 0; i < nmbtypes; i++)
6813                 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6814                         k = snprintf(c, clen, "\t%u mbufs allocated to "
6815                             "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6816                         MBUF_DUMP_BUF_CHK();
6817                 }
6818         if ((m_mbufs - totmbufs) > 0) {
6819                 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6820                     m_mbufs - totmbufs);
6821                 MBUF_DUMP_BUF_CHK();
6822         }
6823         k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6824             "%u/%u mbuf 4KB clusters in use\n",
6825             (unsigned int)(mbstat.m_clusters - m_clfree),
6826             (unsigned int)mbstat.m_clusters,
6827             (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6828             (unsigned int)mbstat.m_bigclusters);
6829         MBUF_DUMP_BUF_CHK();
6830
6831         if (njcl > 0) {
6832                 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6833                     m_16kclusters - m_16kclfree, m_16kclusters,
6834                     njclbytes / 1024);
6835                 MBUF_DUMP_BUF_CHK();
6836         }
6837         totused = totmem - totfree;
6838         if (totmem == 0) {
6839                 totpct = 0;
6840         } else if (totused < (ULONG_MAX / 100)) {
6841                 totpct = (totused * 100) / totmem;
6842         } else {
6843                 u_long totmem1 = totmem / 100;
6844                 u_long totused1 = totused / 100;
6845                 totpct = (totused1 * 100) / totmem1;
6846         }
6847         k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
6848             "in use)\n", totmem / 1024, totpct);
6849         MBUF_DUMP_BUF_CHK();
6850
6851         /* mbuf leak detection statistics */
6852         mleak_update_stats();
6853
6854         k = snprintf(c, clen, "\nmbuf leak detection table:\n");
6855         MBUF_DUMP_BUF_CHK();
6856         k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
6857             mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
6858             mleak_table.mleak_sample_factor);
6859         MBUF_DUMP_BUF_CHK();
6860         k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
6861             mleak_table.outstanding_allocs);
6862         MBUF_DUMP_BUF_CHK();
6863         k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
6864             mleak_table.alloc_recorded, mleak_table.trace_recorded);
6865         MBUF_DUMP_BUF_CHK();
6866         k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
6867             mleak_table.alloc_collisions, mleak_table.trace_collisions);
6868         MBUF_DUMP_BUF_CHK();
6869         k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
6870             mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
6871         MBUF_DUMP_BUF_CHK();
6872         k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
6873             mleak_table.total_conflicts);
6874         MBUF_DUMP_BUF_CHK();
6875
6876         k = snprintf(c, clen, "top %d outstanding traces:\n",
6877             mleak_stat->ml_cnt);
6878         MBUF_DUMP_BUF_CHK();
6879         for (i = 0; i < mleak_stat->ml_cnt; i++) {
6880                 mltr = &mleak_stat->ml_trace[i];
6881                 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
6882                     "%llu hit(s), %llu collision(s)\n", (i + 1),
6883                     mltr->mltr_allocs, mltr->mltr_hitcount,
6884                     mltr->mltr_collisions);
6885                 MBUF_DUMP_BUF_CHK();
6886         }
6887
6888         if (mleak_stat->ml_isaddr64)
6889                 k = snprintf(c, clen, MB_LEAK_HDR_64);
6890         else
6891                 k = snprintf(c, clen, MB_LEAK_HDR_32);
6892         MBUF_DUMP_BUF_CHK();
6893
6894         for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
6895                 int j;
6896                 k = snprintf(c, clen, "%2d: ", (i + 1));
6897                 MBUF_DUMP_BUF_CHK();
6898                 for (j = 0; j < mleak_stat->ml_cnt; j++) {
6899                         mltr = &mleak_stat->ml_trace[j];
6900                         if (i < mltr->mltr_depth) {
6901                                 if (mleak_stat->ml_isaddr64) {
6902                                         k = snprintf(c, clen, "0x%0llx  ",
6903                                             mltr->mltr_addr[i]);
6904                                 } else {
6905                                         k = snprintf(c, clen,
6906                                             "0x%08x  ",
6907                                             (u_int32_t)mltr->mltr_addr[i]);
6908                                 }
6909                         } else {
6910                                 if (mleak_stat->ml_isaddr64)
6911                                         k = snprintf(c, clen,
6912                                             MB_LEAK_SPACING_64);
6913                                 else
6914                                         k = snprintf(c, clen,
6915                                             MB_LEAK_SPACING_32);
6916                         }
6917                         MBUF_DUMP_BUF_CHK();
6918                 }
6919                 k = snprintf(c, clen, "\n");
6920                 MBUF_DUMP_BUF_CHK();
6921         }
6922 done:
6923         return (mbuf_dump_buf);
6924 }
6925
6926 #undef MBUF_DUMP_BUF_CHK
6927
6928 SYSCTL_DECL(_kern_ipc);
6929 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
6930     CTLFLAG_RD | CTLFLAG_LOCKED,
6931     0, 0, mbstat_sysctl, "S,mbstat", "");
6932 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
6933     CTLFLAG_RD | CTLFLAG_LOCKED,
6934     0, 0, mb_stat_sysctl, "S,mb_stat", "");
6935 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
6936     CTLFLAG_RD | CTLFLAG_LOCKED,
6937     0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
6938 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
6939     CTLFLAG_RD | CTLFLAG_LOCKED,
6940     0, 0, mleak_table_sysctl, "S,mleak_table", "");
6941 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
6942     CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
6943 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
6944     CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
6945 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
6946     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");