1 /*
2 * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <kern/kern_types.h>
83 #include <kern/simple_lock.h>
84 #include <kern/queue.h>
85 #include <kern/sched_prim.h>
86 #include <kern/cpu_number.h>
87 #include <kern/zalloc.h>
88
89 #include <libkern/OSAtomic.h>
90 #include <libkern/libkern.h>
91
92 #include <IOKit/IOMapper.h>
93
94 #include <machine/limits.h>
95 #include <machine/machine_routines.h>
96
97 #if CONFIG_MACF_NET
98 #include <security/mac_framework.h>
99 #endif /* CONFIG_MACF_NET */
100
101 #include <sys/mcache.h>
102
103 /*
104 * MBUF IMPLEMENTATION NOTES.
105 *
106 * There is a total of 5 per-CPU caches:
107 *
108 * MC_MBUF:
109 * This is a cache of rudimentary objects of MSIZE in size; each
110 * object represents an mbuf structure. This cache preserves only
111 * the m_type field of the mbuf during its transactions.
112 *
113 * MC_CL:
114 * This is a cache of rudimentary objects of MCLBYTES in size; each
115 * object represents an mcluster structure. This cache does not
116 * preserve the contents of the objects during its transactions.
117 *
118 * MC_BIGCL:
119 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
120 * object represents an mbigcluster structure. This cache does not
121 * preserve the contents of the objects during its transactions.
122 *
123 * MC_MBUF_CL:
124 * This is a cache of mbufs each having a cluster attached to it.
125 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
126 * fields of the mbuf related to the external cluster are preserved
127 * during transactions.
128 *
129 * MC_MBUF_BIGCL:
130 * This is a cache of mbufs each having a big cluster attached to it.
131 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
132 * fields of the mbuf related to the external cluster are preserved
133 * during transactions.
134 *
135 * OBJECT ALLOCATION:
136 *
137 * Allocation requests are handled first at the per-CPU (mcache) layer
138 * before falling back to the slab layer. Performance is optimal when
139 * the request is satisfied at the CPU layer because global data/lock
140 * never gets accessed. When the slab layer is entered for allocation,
141 * the slab freelist will be checked first for available objects before
142 * the VM backing store is invoked. Slab layer operations are serialized
143 * for all of the caches as the mbuf global lock is held most of the time.
144 * Allocation paths are different depending on the class of objects:
145 *
146 * a. Rudimentary object:
147 *
148 * { m_get_common(), m_clattach(), m_mclget(),
149 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
150 * composite object allocation }
151 * | ^
152 * | |
153 * | +-----------------------+
154 * v |
155 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
156 * | ^
157 * v |
158 * [CPU cache] -------> (found?) -------+
159 * | |
160 * v |
161 * mbuf_slab_alloc() |
162 * | |
163 * v |
164 * +---------> [freelist] -------> (found?) -------+
165 * | |
166 * | v
167 * | m_clalloc()
168 * | |
169 * | v
170 * +---<<---- kmem_mb_alloc()
171 *
172 * b. Composite object:
173 *
174 * { m_getpackets_internal(), m_allocpacket_internal() }
175 * | ^
176 * | |
177 * | +------ (done) ---------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_cslab_alloc() |
186 * | |
187 * v |
188 * [freelist] -------> (found?) -------+
189 * | |
190 * v |
191 * (rudimentary object) |
192 * mcache_alloc/mcache_alloc_ext() ------>>-----+
193 *
194 * Auditing notes: If auditing is enabled, buffers will be subjected to
195 * integrity checks by the audit routine. This is done by verifying their
196 * contents against DEADBEEF (free) pattern before returning them to caller.
197 * As part of this step, the routine will also record the transaction and
198 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
199 * also restore any constructed data structure fields if necessary.
200 *
201 * OBJECT DEALLOCATION:
202 *
203 * Freeing an object simply involves placing it into the CPU cache; this
204 * pollutes the cache to benefit subsequent allocations. The slab layer
205 * will only be entered if the object is to be purged out of the cache.
206 * During normal operations, this happens only when the CPU layer resizes
207 * its bucket while it's adjusting to the allocation load. Deallocation
208 * paths are different depending on the class of objects:
209 *
210 * a. Rudimentary object:
211 *
212 * { m_free(), m_freem_list(), composite object deallocation }
213 * | ^
214 * | |
215 * | +------ (done) ---------+
216 * v |
217 * mcache_free/mcache_free_ext() |
218 * | |
219 * v |
220 * mbuf_slab_audit() |
221 * | |
222 * v |
223 * [CPU cache] ---> (not purging?) -----+
224 * | |
225 * v |
226 * mbuf_slab_free() |
227 * | |
228 * v |
229 * [freelist] ----------->>------------+
230 * (objects never get purged to VM)
231 *
232 * b. Composite object:
233 *
234 * { m_free(), m_freem_list() }
235 * | ^
236 * | |
237 * | +------ (done) ---------+
238 * v |
239 * mcache_free/mcache_free_ext() |
240 * | |
241 * v |
242 * mbuf_cslab_audit() |
243 * | |
244 * v |
245 * [CPU cache] ---> (not purging?) -----+
246 * | |
247 * v |
248 * mbuf_cslab_free() |
249 * | |
250 * v |
251 * [freelist] ---> (not purging?) -----+
252 * | |
253 * v |
254 * (rudimentary object) |
255 * mcache_free/mcache_free_ext() ------->>------+
256 *
257 * Auditing notes: If auditing is enabled, the audit routine will save
258 * any constructed data structure fields (if necessary) before filling the
259 * contents of the buffers with DEADBEEF (free) pattern and recording the
260 * transaction. Buffers that are freed (whether at CPU or slab layer) are
261 * expected to contain the free pattern.
262 *
263 * DEBUGGING:
264 *
265 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
266 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
267 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
268 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
269 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
270 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
271 *
272 * Each object is associated with exactly one mcache_audit_t structure that
273 * contains the information related to its last buffer transaction. Given
274 * an address of an object, the audit structure can be retrieved by finding
275 * the position of the object relative to the base address of the cluster:
276 *
277 * +------------+ +=============+
278 * | mbuf addr | | mclaudit[i] |
279 * +------------+ +=============+
280 * | | cl_audit[0] |
281 * i = MTOBG(addr) +-------------+
282 * | +-----> | cl_audit[1] | -----> mcache_audit_t
283 * b = BGTOM(i) | +-------------+
284 * | | | ... |
285 * x = MCLIDX(b, addr) | +-------------+
286 * | | | cl_audit[7] |
287 * +-----------------+ +-------------+
288 * (e.g. x == 1)
289 *
290 * The mclaudit[] array is allocated at initialization time, but its contents
291 * get populated when the corresponding cluster is created. Because a page
292 * can be turned into NMBPBG mbufs, we preserve enough space for the
293 * mbufs so that there is a 1-to-1 mapping between them. A page that never
294 * gets (or has not yet been) turned into mbufs will use only cl_audit[0] with the
295 * remaining entries unused. For a 16KB cluster, only one entry from the first
296 * page is allocated and used for the entire object.
297 */
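/*
 * Illustrative sketch (not part of the original source): putting the
 * diagram above into code, the audit structure for an mbuf at address
 * `addr' would be located roughly as follows, using the MTOBG(),
 * BGTOM() and MCLIDX() macros defined later in this file; the local
 * variable names here are hypothetical.
 *
 *	i = MTOBG(addr);		(index of the 4KB page)
 *	b = BGTOM(i);			(base address of that page)
 *	x = MCLIDX(b, addr);		(mbuf slot within the page)
 *	mca = mclaudit[i].cl_audit[x];
 *
 * For cluster objects (as opposed to mbufs) only cl_audit[0] is used;
 * see mcl_audit_buf2mca() for the actual implementation.
 */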
298
299 /* TODO: should be in header file */
300 /* kernel translator */
301 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
302 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
303 extern vm_map_t mb_map; /* special map */
304
305 /* Global lock */
306 decl_lck_mtx_data(static, mbuf_mlock_data);
307 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
308 static lck_attr_t *mbuf_mlock_attr;
309 static lck_grp_t *mbuf_mlock_grp;
310 static lck_grp_attr_t *mbuf_mlock_grp_attr;
311
312 /* Back-end (common) layer */
313 static void *mbuf_worker_run; /* wait channel for worker thread */
314 static int mbuf_worker_ready; /* worker thread is runnable */
315 static int mbuf_expand_mcl; /* number of cluster creation requests */
316 static int mbuf_expand_big; /* number of big cluster creation requests */
317 static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
318 static int ncpu; /* number of CPUs */
319 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
320 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
321 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
322 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
323 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
324 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
325 static unsigned int mb_normalized; /* number of packets "normalized" */
326
327 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
328 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
329
330 typedef enum {
331 MC_MBUF = 0, /* Regular mbuf */
332 MC_CL, /* Cluster */
333 MC_BIGCL, /* Large (4KB) cluster */
334 MC_16KCL, /* Jumbo (16KB) cluster */
335 MC_MBUF_CL, /* mbuf + cluster */
336 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
337 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
338 } mbuf_class_t;
339
340 #define MBUF_CLASS_MIN MC_MBUF
341 #define MBUF_CLASS_MAX MC_MBUF_16KCL
342 #define MBUF_CLASS_LAST MC_16KCL
343 #define MBUF_CLASS_VALID(c) \
344 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
345 #define MBUF_CLASS_COMPOSITE(c) \
346 ((int)(c) > MBUF_CLASS_LAST)
347
348
349 /*
350 * mbuf specific mcache allocation request flags.
351 */
352 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
353
354 /*
355 * Per-cluster slab structure.
356 *
357 * A slab is a cluster control structure that contains one or more object
358 * chunks; the available chunks are chained in the slab's freelist (sl_head).
359 * Each time a chunk is taken out of the slab, the slab's reference count
360 * gets incremented. When all chunks have been taken out, the empty slab
361 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
362 * returned to a slab causes the slab's reference count to be decremented;
363 * it also causes the slab to be reinserted back into the class's slab
364 * list, if it is not already there.
365 *
366 * Compartmentalizing the object chunks into slabs allows us to easily
367 * merge one or more slabs together when the adjacent slabs are idle, as
368 * well as to convert or move a slab from one class to another; e.g. the
369 * mbuf cluster slab can be converted to a regular cluster slab when all
370 * mbufs in the slab have been freed.
371 *
372 * A slab may also span across multiple clusters for chunks larger than
373 * a cluster's size. In this case, only the slab of the first cluster is
374 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
375 * that they are part of the larger slab.
376 *
377 * Each slab controls a page of memory.
378 */
379 typedef struct mcl_slab {
380 struct mcl_slab *sl_next; /* neighboring slab */
381 u_int8_t sl_class; /* controlling mbuf class */
382 int8_t sl_refcnt; /* outstanding allocations */
383 int8_t sl_chunks; /* chunks (bufs) in this slab */
384 u_int16_t sl_flags; /* slab flags (see below) */
385 u_int16_t sl_len; /* slab length */
386 void *sl_base; /* base of allocated memory */
387 void *sl_head; /* first free buffer */
388 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
389 } mcl_slab_t;
390
391 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
392 #define SLF_PARTIAL 0x0002 /* part of another slab */
393 #define SLF_DETACHED 0x0004 /* not in slab freelist */
394
395 /*
396 * The array of slabs is broken into groups of arrays per 1MB of kernel
397 * memory to reduce the footprint. Each group is allocated on demand
398 * whenever a new piece of memory mapped in from the VM crosses the 1MB
399 * boundary.
400 */
401 #define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
402
403 typedef struct mcl_slabg {
404 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
405 } mcl_slabg_t;
406
407 /*
408 * Number of slabs needed to control a 16KB cluster object.
409 */
410 #define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
411
412 /*
413 * Per-cluster audit structure.
414 */
415 typedef struct {
416 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
417 } mcl_audit_t;
418
419 /*
420 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
421 * and m_ext structures. If auditing is enabled, we allocate a shadow
422 * mbuf structure of this size inside each audit structure, and the
423 * contents of the real mbuf get copied into it when the mbuf is freed.
424 * This allows us to pattern-fill the mbuf for integrity check, and to
425 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
426 * Note that we don't save the contents of clusters when they are freed;
427 * we simply pattern-fill them.
428 */
429 #define AUDIT_CONTENTS_SIZE ((MSIZE - MHLEN) + sizeof (_m_ext_t))
430
431 /*
432 * mbuf specific mcache audit flags
433 */
434 #define MB_INUSE 0x01 /* object has not been returned to slab */
435 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
436 #define MB_SCVALID 0x04 /* object has valid saved contents */
437
438 /*
439 * Each of the following two arrays holds up to nmbclusters elements.
440 */
441 static mcl_audit_t *mclaudit; /* array of cluster audit information */
442 static unsigned int maxclaudit; /* max # of entries in audit table */
443 static mcl_slabg_t **slabstbl; /* cluster slabs table */
444 static unsigned int maxslabgrp; /* max # of entries in slabs table */
445 static unsigned int slabgrp; /* # of entries in slabs table */
446
447 /* Globals */
448 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
449 int njcl; /* # of clusters for jumbo sizes */
450 int njclbytes; /* size of a jumbo cluster */
451 union mbigcluster *mbutl; /* first mapped cluster address */
452 union mbigcluster *embutl; /* ending virtual address of mclusters */
453 int _max_linkhdr; /* largest link-level header */
454 int _max_protohdr; /* largest protocol header */
455 int max_hdr; /* largest link+protocol header */
456 int max_datalen; /* MHLEN - max_hdr */
457
458 static boolean_t mclverify; /* debug: pattern-checking */
459 static boolean_t mcltrace; /* debug: stack tracing */
460 static boolean_t mclfindleak; /* debug: leak detection */
461 static boolean_t mclexpleak; /* debug: expose leak info to user space */
462
463 /* mbuf leak detection variables */
464 static struct mleak_table mleak_table;
465 static mleak_stat_t *mleak_stat;
466
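/*
 * MLEAK_STAT_SIZE(n) evaluates to the byte offset of ml_trace[n] within
 * an mleak_stat_t, i.e. the size of a buffer holding leak statistics
 * for n traces (see mleak_top_trace_sysctl()).  The MB_STAT_SIZE(),
 * OMB_STAT_SIZE() and MBUF_MTYPES_SIZE() macros below use the same
 * offset-of idiom.
 */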
467 #define MLEAK_STAT_SIZE(n) \
468 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
469
470 struct mallocation {
471 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
472 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
473 u_int32_t count; /* How many objects were requested */
474 u_int64_t hitcount; /* for determining hash effectiveness */
475 };
476
477 struct mtrace {
478 u_int64_t collisions;
479 u_int64_t hitcount;
480 u_int64_t allocs;
481 u_int64_t depth;
482 uintptr_t addr[MLEAK_STACK_DEPTH];
483 };
484
485 /* Size must be a power of two for the zhash to be able to just mask off bits */
486 #define MLEAK_ALLOCATION_MAP_NUM 512
487 #define MLEAK_TRACE_MAP_NUM 256
488
489 /*
490 * Sample factor for how often to record a trace. This can be overridden
491 * by the boot-arg mleak_sample_factor.
492 */
493 #define MLEAK_SAMPLE_FACTOR 500
494
495 /*
496 * Number of top leakers recorded.
497 */
498 #define MLEAK_NUM_TRACES 5
499
500 #define MB_LEAK_SPACING_64 " "
501 #define MB_LEAK_SPACING_32 " "
502
503
504 #define MB_LEAK_HDR_32 "\n\
505 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
506 ---------- ---------- ---------- ---------- ---------- \n\
507 "
508
509 #define MB_LEAK_HDR_64 "\n\
510 trace [1] trace [2] trace [3] \
511 trace [4] trace [5] \n\
512 ------------------ ------------------ ------------------ \
513 ------------------ ------------------ \n\
514 "
515
516 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
517 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
518
519 /* Hashmaps of allocations and their corresponding traces */
520 static struct mallocation *mleak_allocations;
521 static struct mtrace *mleak_traces;
522 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
523
524 /* Lock to protect mleak tables from concurrent modification */
525 decl_lck_mtx_data(static, mleak_lock_data);
526 static lck_mtx_t *mleak_lock = &mleak_lock_data;
527 static lck_attr_t *mleak_lock_attr;
528 static lck_grp_t *mleak_lock_grp;
529 static lck_grp_attr_t *mleak_lock_grp_attr;
530
531 extern u_int32_t high_sb_max;
532
533 /* TODO: should be in header file */
534 int do_reclaim = 0;
535
536 /* The minimum number of objects that are allocated, to start. */
537 #define MINCL 32
538 #define MINBIGCL (MINCL >> 1)
539 #define MIN16KCL (MINCL >> 2)
540
541 /* Low watermarks (only map in pages once free counts go below) */
542 #define MBIGCL_LOWAT MINBIGCL
543 #define M16KCL_LOWAT MIN16KCL
544
545 typedef struct {
546 mbuf_class_t mtbl_class; /* class type */
547 mcache_t *mtbl_cache; /* mcache for this buffer class */
548 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
549 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
550 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
551 u_int32_t mtbl_maxsize; /* maximum buffer size */
552 int mtbl_minlimit; /* minimum allowed */
553 int mtbl_maxlimit; /* maximum allowed */
554 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
555 } mbuf_table_t;
556
557 #define m_class(c) mbuf_table[c].mtbl_class
558 #define m_cache(c) mbuf_table[c].mtbl_cache
559 #define m_slablist(c) mbuf_table[c].mtbl_slablist
560 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
561 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
562 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
563 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
564 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
565 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
566 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
567 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
568 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
569 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
570 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
571 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
572 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
573 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
574 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
575 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
576 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
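/*
 * For example, m_total(MC_CL) expands to
 * mbuf_table[MC_CL].mtbl_stats->mbcl_total, i.e. the total number of
 * 2KB cluster objects currently known to the allocator.
 */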
577
578 static mbuf_table_t mbuf_table[] = {
579 /*
580 * The caches for mbufs, regular clusters and big clusters.
581 */
582 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
583 NULL, NULL, 0, 0, 0, 0 },
584 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
585 NULL, NULL, 0, 0, 0, 0 },
586 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
587 NULL, NULL, 0, 0, 0, 0 },
588 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
589 NULL, NULL, 0, 0, 0, 0 },
590 /*
591 * The following are special caches; they serve as intermediate
592 * caches backed by the above rudimentary caches. Each object
593 * in the cache is an mbuf with a cluster attached to it. Unlike
594 * the above caches, these intermediate caches do not directly
595 * deal with the slab structures; instead, the constructed
596 * cached elements are simply stored in the freelists.
597 */
598 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
599 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
600 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
601 };
602
603 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
604
605 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
606 static int mb_waiters; /* number of waiters */
607
608 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
609 static struct timeval mb_wdtstart; /* watchdog start timestamp */
610 static char *mbuf_dump_buf;
611
612 #define MBUF_DUMP_BUF_SIZE 2048
613
614 /*
615 * mbuf watchdog is enabled by default on embedded platforms. It is
616 * also toggleable via the kern.ipc.mb_watchdog sysctl.
617 */
618 #if CONFIG_EMBEDDED
619 static unsigned int mb_watchdog = 1;
620 #else
621 static unsigned int mb_watchdog = 0;
622 #endif /* CONFIG_EMBEDDED */
623
624 /* The following are used to serialize m_clalloc() */
625 static boolean_t mb_clalloc_busy;
626 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
627 static int mb_clalloc_waiters;
628
629 static void mbuf_mtypes_sync(boolean_t);
630 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
631 static void mbuf_stat_sync(void);
632 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
633 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
634 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
635 static char *mbuf_dump(void);
636 static void mbuf_table_init(void);
637 static inline void m_incref(struct mbuf *);
638 static inline u_int32_t m_decref(struct mbuf *);
639 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
640 static void mbuf_worker_thread_init(void);
641 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
642 static void slab_free(mbuf_class_t, mcache_obj_t *);
643 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
644 unsigned int, int);
645 static void mbuf_slab_free(void *, mcache_obj_t *, int);
646 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
647 static void mbuf_slab_notify(void *, u_int32_t);
648 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
649 unsigned int);
650 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
651 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
652 unsigned int, int);
653 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
654 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
655 static int freelist_populate(mbuf_class_t, unsigned int, int);
656 static void freelist_init(mbuf_class_t);
657 static boolean_t mbuf_cached_above(mbuf_class_t, int);
658 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
659 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
660 static int m_howmany(int, size_t);
661 static void mbuf_worker_thread(void);
662 static void mbuf_watchdog(void);
663 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
664
665 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
666 size_t, unsigned int);
667 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
668 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
669 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
670 boolean_t);
671 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
672 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
673 static void mcl_audit_mcheck_panic(struct mbuf *);
674 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
675
676 static void mleak_activate(void);
677 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
678 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
679 static void mleak_free(mcache_obj_t *);
680 static void mleak_sort_traces(void);
681 static void mleak_update_stats(void);
682
683 static mcl_slab_t *slab_get(void *);
684 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
685 void *, void *, unsigned int, int, int);
686 static void slab_insert(mcl_slab_t *, mbuf_class_t);
687 static void slab_remove(mcl_slab_t *, mbuf_class_t);
688 static boolean_t slab_inrange(mcl_slab_t *, void *);
689 static void slab_nextptr_panic(mcl_slab_t *, void *);
690 static void slab_detach(mcl_slab_t *);
691 static boolean_t slab_is_detached(mcl_slab_t *);
692
693 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
694 static struct mbuf *m_split0(struct mbuf *, int, int, int);
695
696 /* flags for m_copyback0 */
697 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
698 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
699 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
700 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
701
702 /*
703 * This flag is set for all mbufs that come out of and into the composite
704 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
705 * are marked with such a flag have clusters attached to them, and will be
706 * treated differently when they are freed; instead of being placed back
707 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
708 * are placed back into the appropriate composite cache's freelist, and the
709 * actual freeing is deferred until the composite objects are purged. At
710 * such a time, this flag will be cleared from the mbufs and the objects
711 * will be freed into their own separate freelists.
712 */
713 #define EXTF_COMPOSITE 0x1
714
715 /*
716 * This flag indicates that the external cluster is read-only, i.e. it is
717 * or was referred to by more than one mbuf. Once set, this flag is never
718 * cleared.
719 */
720 #define EXTF_READONLY 0x2
721 #define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
722
723 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
724 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
725 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
726 #define MBUF_IS_COMPOSITE(m) \
727 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
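/*
 * Sketch (simplified; see the actual m_free()/m_freem_list() logic):
 * a freed composite mbuf is returned whole to its intermediate cache
 * rather than being torn apart, along the lines of
 *
 *	if (MBUF_IS_COMPOSITE(m))
 *		mcache_free(m_cache(MC_MBUF_CL), m);
 *
 * where MC_MBUF_CL stands in for whichever composite class the object
 * belongs to.
 */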
728
729 /*
730 * Macros used to verify the integrity of the mbuf.
731 */
732 #define _MCHECK(m) { \
733 if ((m)->m_type != MT_FREE) { \
734 if (mclaudit == NULL) \
735 panic("MCHECK: m_type=%d m=%p", \
736 (u_int16_t)(m)->m_type, m); \
737 else \
738 mcl_audit_mcheck_panic(m); \
739 } \
740 }
741
742 #define MBUF_IN_MAP(addr) \
743 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
744
745 #define MRANGE(addr) { \
746 if (!MBUF_IN_MAP(addr)) \
747 panic("MRANGE: address out of range 0x%p", addr); \
748 }
749
750 /*
751 * Macro version of mtod.
752 */
753 #define MTOD(m, t) ((t)((m)->m_data))
754
755 /*
756 * Macros to obtain (4KB) cluster index and base cluster address.
757 */
758
759 #define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
760 #define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
761
762 /*
763 * Macro to find the mbuf index relative to a base.
764 */
765 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
766
767 /*
768 * Same thing for 2KB cluster index.
769 */
770 #define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
771
772 /*
773 * Macros used during mbuf and cluster initialization.
774 */
775 #define MBUF_INIT(m, pkthdr, type) { \
776 _MCHECK(m); \
777 (m)->m_next = (m)->m_nextpkt = NULL; \
778 (m)->m_len = 0; \
779 (m)->m_type = type; \
780 if ((pkthdr) == 0) { \
781 (m)->m_data = (m)->m_dat; \
782 (m)->m_flags = 0; \
783 } else { \
784 (m)->m_data = (m)->m_pktdat; \
785 (m)->m_flags = M_PKTHDR; \
786 (m)->m_pkthdr.rcvif = NULL; \
787 (m)->m_pkthdr.len = 0; \
788 (m)->m_pkthdr.header = NULL; \
789 (m)->m_pkthdr.csum_flags = 0; \
790 (m)->m_pkthdr.csum_data = 0; \
791 (m)->m_pkthdr.tso_segsz = 0; \
792 (m)->m_pkthdr.vlan_tag = 0; \
793 (m)->m_pkthdr.socket_id = 0; \
794 (m)->m_pkthdr.vt_nrecs = 0; \
795 (m)->m_pkthdr.aux_flags = 0; \
796 m_tag_init(m); \
797 m_service_class_init(m); \
798 } \
799 }
800
801 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
802 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
803 (m)->m_flags |= M_EXT; \
804 (m)->m_ext.ext_size = (size); \
805 (m)->m_ext.ext_free = (free); \
806 (m)->m_ext.ext_arg = (arg); \
807 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
808 &(m)->m_ext.ext_refs; \
809 MEXT_RFA(m) = (rfa); \
810 MEXT_REF(m) = (ref); \
811 MEXT_FLAGS(m) = (flag); \
812 }
813
814 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
815 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
816
817 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
818 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
819
820 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
821 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
822
823 /*
824 * Macro to convert BSD malloc sleep flag to mcache's
825 */
826 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
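/*
 * Callers passing M_DONTWAIT get MCR_NOSLEEP; everything else maps to
 * MCR_SLEEP.  As an illustrative sketch (hypothetical locals; see
 * m_mclget() later in this file for the real code), attaching a 2KB
 * cluster using the macros above would look roughly like:
 *
 *	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) != NULL &&
 *	    (m->m_ext.ext_buf = m_mclalloc(wait)) != NULL)
 *		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
 */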
827
828 /*
829 * The structure that holds all mbuf class statistics exportable via sysctl.
830 * Similar to mbstat structure, the mb_stat structure is protected by the
831 * global mbuf lock. It contains additional information about the classes
832 * that allows for a more accurate view of the state of the allocator.
833 */
834 struct mb_stat *mb_stat;
835 struct omb_stat *omb_stat; /* For backwards compatibility */
836
837 #define MB_STAT_SIZE(n) \
838 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
839 #define OMB_STAT_SIZE(n) \
840 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
841
842 /*
843 * The legacy structure holding all of the mbuf allocation statistics.
844 * The actual statistics used by the kernel are stored in the mbuf_table
845 * instead, and are updated atomically while the global mbuf lock is held.
846 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
847 * Unlike before, the kernel no longer relies on the contents of mbstat for
848 * its operations (e.g. cluster expansion) because the structure is exposed
849 * to the outside and could possibly be modified, therefore making it unsafe.
850 * With the exception of the mbstat.m_mtypes array (see below), all of the
851 * statistics are updated as they change.
852 */
853 struct mbstat mbstat;
854
855 #define MBSTAT_MTYPES_MAX \
856 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
857
858 /*
859 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
860 * atomically and stored in a per-CPU structure which is lock-free; this is
861 * done in order to avoid writing to the global mbstat data structure which
862 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
863 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
864 * array and returned to the application. Any updates for types greater than
865 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
866 * performance but is okay since the kernel uses only up to MT_MAX-1 while
867 * anything beyond that (up to type 255) is considered a corner case.
868 */
869 typedef struct {
870 unsigned int cpu_mtypes[MT_MAX];
871 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
872
873 typedef struct {
874 mtypes_cpu_t mbs_cpu[1];
875 } mbuf_mtypes_t;
876
877 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
878
879 #define MBUF_MTYPES_SIZE(n) \
880 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
881
882 #define MTYPES_CPU(p) \
883 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
884
885 #define mtype_stat_add(type, n) { \
886 if ((unsigned)(type) < MT_MAX) { \
887 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
888 atomic_add_32(&mbs->cpu_mtypes[type], n); \
889 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
890 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
891 } \
892 }
893
894 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
895 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
896 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
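/*
 * Typical usage (sketch): when an mbuf changes type the counters are
 * adjusted pairwise, e.g. on allocation of a data mbuf:
 *
 *	mtype_stat_inc(MT_DATA);
 *	mtype_stat_dec(MT_FREE);
 */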
897
898 static void
899 mbuf_mtypes_sync(boolean_t locked)
900 {
901 int m, n;
902 mtypes_cpu_t mtc;
903
904 if (locked)
905 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
906
907 bzero(&mtc, sizeof (mtc));
908 for (m = 0; m < ncpu; m++) {
909 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
910 mtypes_cpu_t temp;
911
912 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
913 sizeof (temp.cpu_mtypes));
914
915 for (n = 0; n < MT_MAX; n++)
916 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
917 }
918 if (!locked)
919 lck_mtx_lock(mbuf_mlock);
920 for (n = 0; n < MT_MAX; n++)
921 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
922 if (!locked)
923 lck_mtx_unlock(mbuf_mlock);
924 }
925
926 static int
927 mbstat_sysctl SYSCTL_HANDLER_ARGS
928 {
929 #pragma unused(oidp, arg1, arg2)
930 mbuf_mtypes_sync(FALSE);
931
932 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
933 }
934
935 static void
936 mbuf_stat_sync(void)
937 {
938 mb_class_stat_t *sp;
939 mcache_cpu_t *ccp;
940 mcache_t *cp;
941 int k, m, bktsize;
942
943 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
944
945 for (k = 0; k < NELEM(mbuf_table); k++) {
946 cp = m_cache(k);
947 ccp = &cp->mc_cpu[0];
948 bktsize = ccp->cc_bktsize;
949 sp = mbuf_table[k].mtbl_stats;
950
951 if (cp->mc_flags & MCF_NOCPUCACHE)
952 sp->mbcl_mc_state = MCS_DISABLED;
953 else if (cp->mc_purge_cnt > 0)
954 sp->mbcl_mc_state = MCS_PURGING;
955 else if (bktsize == 0)
956 sp->mbcl_mc_state = MCS_OFFLINE;
957 else
958 sp->mbcl_mc_state = MCS_ONLINE;
959
960 sp->mbcl_mc_cached = 0;
961 for (m = 0; m < ncpu; m++) {
962 ccp = &cp->mc_cpu[m];
963 if (ccp->cc_objs > 0)
964 sp->mbcl_mc_cached += ccp->cc_objs;
965 if (ccp->cc_pobjs > 0)
966 sp->mbcl_mc_cached += ccp->cc_pobjs;
967 }
968 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
969 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
970 sp->mbcl_infree;
971
972 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
973 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
974 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
975
976 /* Calculate total count specific to each class */
977 sp->mbcl_ctotal = sp->mbcl_total;
978 switch (m_class(k)) {
979 case MC_MBUF:
980 /* Deduct mbufs used in composite caches */
981 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
982 m_total(MC_MBUF_BIGCL));
983 break;
984
985 case MC_CL:
986 /* Deduct clusters used in composite cache */
987 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
988 break;
989
990 case MC_BIGCL:
991 /* Deduct clusters used in composite cache */
992 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
993 break;
994
995 case MC_16KCL:
996 /* Deduct clusters used in composite cache */
997 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
998 break;
999
1000 default:
1001 break;
1002 }
1003 }
1004 }
1005
1006 static int
1007 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1008 {
1009 #pragma unused(oidp, arg1, arg2)
1010 void *statp;
1011 int k, statsz, proc64 = proc_is64bit(req->p);
1012
1013 lck_mtx_lock(mbuf_mlock);
1014 mbuf_stat_sync();
1015
1016 if (!proc64) {
1017 struct omb_class_stat *oc;
1018 struct mb_class_stat *c;
1019
1020 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1021 oc = &omb_stat->mbs_class[0];
1022 c = &mb_stat->mbs_class[0];
1023 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1024 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1025 "%s", c->mbcl_cname);
1026 oc->mbcl_size = c->mbcl_size;
1027 oc->mbcl_total = c->mbcl_total;
1028 oc->mbcl_active = c->mbcl_active;
1029 oc->mbcl_infree = c->mbcl_infree;
1030 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1031 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1032 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1033 oc->mbcl_notified = c->mbcl_notified;
1034 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1035 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1036 oc->mbcl_ctotal = c->mbcl_ctotal;
1037 oc->mbcl_mc_state = c->mbcl_mc_state;
1038 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1039 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1040 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1041 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1042 }
1043 statp = omb_stat;
1044 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1045 } else {
1046 statp = mb_stat;
1047 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1048 }
1049
1050 lck_mtx_unlock(mbuf_mlock);
1051
1052 return (SYSCTL_OUT(req, statp, statsz));
1053 }
1054
1055 static int
1056 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1057 {
1058 #pragma unused(oidp, arg1, arg2)
1059 int i;
1060
1061 /* Ensure leak tracing turned on */
1062 if (!mclfindleak || !mclexpleak)
1063 return (ENXIO);
1064
1065 lck_mtx_lock(mleak_lock);
1066 mleak_update_stats();
1067 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1068 lck_mtx_unlock(mleak_lock);
1069
1070 return (i);
1071 }
1072
1073 static int
1074 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1075 {
1076 #pragma unused(oidp, arg1, arg2)
1077 int i = 0;
1078
1079 /* Ensure leak tracing turned on */
1080 if (!mclfindleak || !mclexpleak)
1081 return (ENXIO);
1082
1083 lck_mtx_lock(mleak_lock);
1084 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1085 lck_mtx_unlock(mleak_lock);
1086
1087 return (i);
1088 }
1089
1090 static inline void
1091 m_incref(struct mbuf *m)
1092 {
1093 UInt32 old, new;
1094 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1095
1096 do {
1097 old = *addr;
1098 new = old + 1;
1099 ASSERT(new != 0);
1100 } while (!OSCompareAndSwap(old, new, addr));
1101
1102 /*
1103 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1104 * we don't clear the flag when the refcount goes back to 1
1105 * to simplify code calling m_mclhasreference().
1106 */
1107 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1108 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1109 }
1110
1111 static inline u_int32_t
1112 m_decref(struct mbuf *m)
1113 {
1114 UInt32 old, new;
1115 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1116
1117 do {
1118 old = *addr;
1119 new = old - 1;
1120 ASSERT(old != 0);
1121 } while (!OSCompareAndSwap(old, new, addr));
1122
1123 return (new);
1124 }
1125
1126 static void
1127 mbuf_table_init(void)
1128 {
1129 unsigned int b, c, s;
1130 int m;
1131
1132 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1133 M_TEMP, M_WAITOK | M_ZERO);
1134 VERIFY(omb_stat != NULL);
1135
1136 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1137 M_TEMP, M_WAITOK | M_ZERO);
1138 VERIFY(mb_stat != NULL);
1139
1140 mb_stat->mbs_cnt = NELEM(mbuf_table);
1141 for (m = 0; m < NELEM(mbuf_table); m++)
1142 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1143
1144 #if CONFIG_MBUF_JUMBO
1145 /*
1146 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1147 * this only on platforms where jumbo cluster pool is enabled.
1148 */
1149 njcl = nmbclusters / 3;
1150 njclbytes = M16KCLBYTES;
1151 #endif /* CONFIG_MBUF_JUMBO */
1152
1153 /*
1154 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1155 * a multiple of 4KB clusters.
1156 */
1157 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1158 if (njcl > 0) {
1159 /*
1160 * Each jumbo cluster takes 8 2KB clusters, so make
1161 * sure that the pool size is evenly divisible by 8;
1162 * njcl is in 2KB unit, hence treated as such.
1163 */
1164 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1165
1166 /* Update nclusters with rounded down value of njcl */
1167 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1168 }
1169
1170 /*
1171 * njcl is valid only on platforms with 16KB jumbo clusters, where
1172 * it is configured to 1/3 of the pool size. On these platforms,
1173 * the remainder is used for 2KB and 4KB clusters. On platforms
1174 * without 16KB jumbo clusters, the entire pool is used for both
1175 * 2KB and 4KB clusters. A 4KB cluster can either be split into
1176 * 16 mbufs, or into 2 2KB clusters.
1177 *
1178 * +---+---+------------ ... -----------+------- ... -------+
1179 * | c | b | s | njcl |
1180 * +---+---+------------ ... -----------+------- ... -------+
1181 *
1182 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1183 * clusters (1/64th each.)
1184 */
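	/*
	 * Worked example (illustrative figures only): with a 64 MB pool,
	 * i.e. nmbclusters == 32768 and no jumbo pool, nclusters == 32768,
	 * giving c == 512 (1 MB of pure 2KB clusters), b == 256 (1 MB of
	 * pure 4KB clusters) and s == 31744 2KB-units for the general pool.
	 */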
1185 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1186 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1187 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
1188
1189 /*
1190 * 1/64th (c) is reserved for 2KB clusters.
1191 */
1192 m_minlimit(MC_CL) = c;
1193 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1194 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1195 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1196
1197 /*
1198 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1199 * It cannot be turned into 2KB clusters or mbufs.
1200 */
1201 m_minlimit(MC_BIGCL) = b;
1202 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1203 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1204 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1205
1206 /*
1207 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1208 */
1209 m_minlimit(MC_MBUF) = 0;
1210 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1211 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1212 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1213
1214 /*
1215 * Set limits for the composite classes.
1216 */
1217 m_minlimit(MC_MBUF_CL) = 0;
1218 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1219 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1220 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1221 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1222
1223 m_minlimit(MC_MBUF_BIGCL) = 0;
1224 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1225 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1226 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1227 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1228
1229 /*
1230 * And for jumbo classes.
1231 */
1232 m_minlimit(MC_16KCL) = 0;
1233 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1234 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1235 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1236
1237 m_minlimit(MC_MBUF_16KCL) = 0;
1238 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1239 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1240 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1241 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1242
1243 /*
1244 * Initialize the legacy mbstat structure.
1245 */
1246 bzero(&mbstat, sizeof (mbstat));
1247 mbstat.m_msize = m_maxsize(MC_MBUF);
1248 mbstat.m_mclbytes = m_maxsize(MC_CL);
1249 mbstat.m_minclsize = MINCLSIZE;
1250 mbstat.m_mlen = MLEN;
1251 mbstat.m_mhlen = MHLEN;
1252 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1253 }
1254
1255 #if defined(__LP64__)
1256 typedef struct ncl_tbl {
1257 uint64_t nt_maxmem; /* memory (sane) size */
1258 uint32_t nt_mbpool; /* mbuf pool size */
1259 } ncl_tbl_t;
1260
1261 /* Non-server */
1262 static ncl_tbl_t ncl_table[] = {
1263 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1264 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1265 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1266 { 0, 0 }
1267 };
1268
1269 /* Server */
1270 static ncl_tbl_t ncl_table_srv[] = {
1271 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1272 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1273 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1274 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1275 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1276 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1277 { 0, 0 }
1278 };
1279 #endif /* __LP64__ */
1280
1281 __private_extern__ unsigned int
1282 mbuf_default_ncl(int server, uint64_t mem)
1283 {
1284 #if !defined(__LP64__)
1285 #pragma unused(server)
1286 unsigned int n;
1287 /*
1288 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1289 */
1290 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1291 n = 32768;
1292 #else
1293 unsigned int n, i;
1294 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1295 /*
1296 * 64-bit kernel (mbuf pool size based on table).
1297 */
1298 n = tbl[0].nt_mbpool;
1299 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1300 if (mem < tbl[i].nt_maxmem)
1301 break;
1302 n = tbl[i].nt_mbpool;
1303 }
1304 n >>= MCLSHIFT;
1305 #endif /* !__LP64__ */
1306 return (n);
1307 }
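/*
 * For example (illustrative, assuming the usual 2KB MCLBYTES): on a
 * 64-bit non-server system with 8 GB of memory, the table walk above
 * settles on a 96 MB mbuf pool, so mbuf_default_ncl() returns
 * 96 MB / 2 KB == 49152 clusters.
 */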
1308
1309 __private_extern__ void
1310 mbinit(void)
1311 {
1312 unsigned int m;
1313 unsigned int initmcl = 0;
1314 void *buf;
1315 thread_t thread = THREAD_NULL;
1316
1317 /*
1318 * These MBUF_ values must be equal to their private counterparts.
1319 */
1320 _CASSERT(MBUF_EXT == M_EXT);
1321 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1322 _CASSERT(MBUF_EOR == M_EOR);
1323 _CASSERT(MBUF_LOOP == M_LOOP);
1324 _CASSERT(MBUF_BCAST == M_BCAST);
1325 _CASSERT(MBUF_MCAST == M_MCAST);
1326 _CASSERT(MBUF_FRAG == M_FRAG);
1327 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1328 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1329 _CASSERT(MBUF_PROMISC == M_PROMISC);
1330 _CASSERT(MBUF_HASFCS == M_HASFCS);
1331
1332 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1333 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1334 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1335 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1336 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1337 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1338 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1339 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1340 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1341 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1342 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1343 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1344 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1345 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1346 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1347
1348 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1349 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1350 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_TCP_SUM16);
1351 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1352 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1353 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1354 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1355 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1356 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1357 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1358 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1359 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1360 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1361
1362 _CASSERT(MBUF_WAITOK == M_WAIT);
1363 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1364 _CASSERT(MBUF_COPYALL == M_COPYALL);
1365
1366 _CASSERT(MBUF_PKTAUXF_INET_RESOLVE_RTR == MAUXF_INET_RESOLVE_RTR);
1367 _CASSERT(MBUF_PKTAUXF_INET6_RESOLVE_RTR == MAUXF_INET6_RESOLVE_RTR);
1368
1369 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1370 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1371 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1372 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1373 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1374 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1375 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1376 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1377 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1378 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1379
1380 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1381 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1382 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1383 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1384
1385 if (nmbclusters == 0)
1386 nmbclusters = NMBCLUSTERS;
1387
1388 /* This should be a sane (at least even) value by now */
1389 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1390
1391 /* Setup the mbuf table */
1392 mbuf_table_init();
1393
1394 /* Global lock for common layer */
1395 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1396 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1397 mbuf_mlock_attr = lck_attr_alloc_init();
1398 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1399
1400 /*
1401 * Allocate cluster slabs table:
1402 *
1403 * maxslabgrp = (N * 2048) / (1024 * 1024)
1404 *
1405 * Where N is nmbclusters rounded up to the nearest 512. This yields
1406 * mcl_slabg_t units, each one representing 1 MB of memory.
1407 */
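	/*
	 * For illustration: with nmbclusters == 32768 (a 64 MB pool), the
	 * expression below yields maxslabgrp == 64, i.e. one slab group
	 * per MB of cluster memory.
	 */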
1408 maxslabgrp =
1409 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1410 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1411 M_TEMP, M_WAITOK | M_ZERO);
1412 VERIFY(slabstbl != NULL);
1413
1414 /*
1415 * Allocate audit structures, if needed:
1416 *
1417 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1418 *
1419 * This yields mcl_audit_t units, each one representing a page.
1420 */
1421 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1422 mbuf_debug |= mcache_getflags();
1423 if (mbuf_debug & MCF_DEBUG) {
1424 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1425 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1426 M_TEMP, M_WAITOK | M_ZERO);
1427 VERIFY(mclaudit != NULL);
1428
1429 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1430 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1431 VERIFY(mcl_audit_con_cache != NULL);
1432 }
1433 mclverify = (mbuf_debug & MCF_VERIFY);
1434 mcltrace = (mbuf_debug & MCF_TRACE);
1435 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1436 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1437
1438 /* Enable mbuf leak logging, with a lock to protect the tables */
1439
1440 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1441 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1442 mleak_lock_attr = lck_attr_alloc_init();
1443 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1444
1445 mleak_activate();
1446
1447 /* Calculate the number of pages assigned to the cluster pool */
1448 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1449 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1450 M_TEMP, M_WAITOK);
1451 VERIFY(mcl_paddr != NULL);
1452
1453 /* Register with the I/O Bus mapper */
1454 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1455 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1456
1457 embutl = (union mbigcluster *)
1458 ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1459 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1460
1461 /* Prime up the freelist */
1462 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1463 if (initmcl != 0) {
1464 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1465 if (initmcl > m_maxlimit(MC_BIGCL))
1466 initmcl = m_maxlimit(MC_BIGCL);
1467 }
1468 if (initmcl < m_minlimit(MC_BIGCL))
1469 initmcl = m_minlimit(MC_BIGCL);
1470
1471 lck_mtx_lock(mbuf_mlock);
1472
1473 /*
1474 * For classes with non-zero minimum limits, populate their freelists
1475 * so that m_total(class) is at least m_minlimit(class).
1476 */
1477 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1478 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1479 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1480 freelist_init(m_class(MC_CL));
1481
1482 for (m = 0; m < NELEM(mbuf_table); m++) {
1483 /* Make sure we didn't miss any */
1484 VERIFY(m_minlimit(m_class(m)) == 0 ||
1485 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1486 }
1487
1488 lck_mtx_unlock(mbuf_mlock);
1489
1490 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1491 NULL, &thread);
1492 thread_deallocate(thread);
1493
1494 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1495 0, 0, MCR_SLEEP);
1496
1497 /* Create the cache for each class */
1498 for (m = 0; m < NELEM(mbuf_table); m++) {
1499 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1500 u_int32_t flags;
1501
1502 flags = mbuf_debug;
1503 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1504 m_class(m) == MC_MBUF_16KCL) {
1505 allocfunc = mbuf_cslab_alloc;
1506 freefunc = mbuf_cslab_free;
1507 auditfunc = mbuf_cslab_audit;
1508 logfunc = mleak_logger;
1509 } else {
1510 allocfunc = mbuf_slab_alloc;
1511 freefunc = mbuf_slab_free;
1512 auditfunc = mbuf_slab_audit;
1513 logfunc = mleak_logger;
1514 }
1515
1516 /*
1517 * Disable per-CPU caches for jumbo classes if there
1518 * is no jumbo cluster pool available in the system.
1519 * The cache itself is still created (but will never
1520 * be populated) since it simplifies the code.
1521 */
1522 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1523 njcl == 0)
1524 flags |= MCF_NOCPUCACHE;
1525
1526 if (!mclfindleak)
1527 flags |= MCF_NOLEAKLOG;
1528
1529 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1530 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1531 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1532 }
1533
1534 /*
1535 * Allocate the structure for per-CPU statistics, aligned on a
1536 * CPU cache line boundary; this code assumes that we never
1537 * uninitialize this framework, since the original address
1538 * before alignment is not saved.
1539 */
1540 ncpu = ml_get_max_cpus();
1541 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1542 M_TEMP, M_WAITOK);
1543 VERIFY(buf != NULL);
1544
1545 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1546 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
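
/*
 * Editorial sketch (not from the original source): P2ROUNDUP() rounds
 * the buffer up to the next CPU_CACHE_SIZE boundary, which is why
 * CPU_CACHE_SIZE extra bytes were requested in the MALLOC above.
 * Assuming a hypothetical 64-byte cache line:
 *
 *	buf     = 0x1010                        (from MALLOC)
 *	aligned = P2ROUNDUP(0x1010, 64) = 0x1040
 *	slack   = 0x1040 - 0x1010       = 0x30  (always < 64)
 *
 * The pre-alignment address is discarded, hence the caveat above about
 * never uninitializing this framework.
 */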
1547
1548 /*
1549 * Set the max limit on sb_max to be 1/16th of the size of the
1550 * memory allocated for mbuf clusters.
1551 */
1552 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1553 if (high_sb_max < sb_max) {
1554 /* sb_max is too large for this configuration, scale it down */
1555 if (high_sb_max > (1 << MBSHIFT)) {
1556 /* We have at least 16 MB of mbuf pool */
1557 sb_max = high_sb_max;
1558 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1559 /*
1560 * If we have more than 1 MB of mbuf pool, cap the size of
1561 * the max socket buffer at 1 MB.
1562 */
1563 sb_max = high_sb_max = (1 << MBSHIFT);
1564 } else {
1565 sb_max = high_sb_max;
1566 }
1567 }
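
/*
 * Editorial sketch (illustrative values only, not from the original
 * source): with nmbclusters = 32768 and MCLBYTES = 2048 the cluster
 * pool is 64 MB, so
 *
 *	high_sb_max = nmbclusters << (MCLSHIFT - 4)
 *	            = 32768 << 7
 *	            = 4 MB                 (i.e. 64 MB / 16)
 *
 * Since 4 MB exceeds 1 MB (1 << MBSHIFT), an sb_max configured larger
 * than 4 MB would be scaled down to 4 MB by the block above.
 */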
1568
1569 /* allocate space for mbuf_dump_buf */
1570 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1571 VERIFY(mbuf_dump_buf != NULL);
1572
1573 printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
1574 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1575 (nclusters << MCLSHIFT) >> MBSHIFT,
1576 (njcl << MCLSHIFT) >> MBSHIFT);
1577 }
1578
1579 /*
1580 * Obtain a slab of object(s) from the class's freelist.
1581 */
1582 static mcache_obj_t *
1583 slab_alloc(mbuf_class_t class, int wait)
1584 {
1585 mcl_slab_t *sp;
1586 mcache_obj_t *buf;
1587
1588 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1589
1590 VERIFY(class != MC_16KCL || njcl > 0);
1591
1592 /* This should always be NULL for us */
1593 VERIFY(m_cobjlist(class) == NULL);
1594
1595 /*
1596 * Treat composite objects as having a longer lifespan by using
1597 * a slab from the reverse direction, in the hope that this
1598 * reduces the probability of fragmentation for slabs that hold
1599 * more than one buffer chunk (e.g. mbuf slabs). For other
1600 * slabs, this probably doesn't make much of a difference.
1601 */
1602 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1603 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1604 else
1605 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1606
1607 if (sp == NULL) {
1608 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1609 /* The slab list for this class is empty */
1610 return (NULL);
1611 }
1612
1613 VERIFY(m_infree(class) > 0);
1614 VERIFY(!slab_is_detached(sp));
1615 VERIFY(sp->sl_class == class &&
1616 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1617 buf = sp->sl_head;
1618 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1619
1620 if (class == MC_MBUF) {
1621 sp->sl_head = buf->obj_next;
1622 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1623 } else if (class == MC_CL) {
1624 sp->sl_head = buf->obj_next;
1625 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1626 } else {
1627 sp->sl_head = NULL;
1628 }
1629 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1630 slab_nextptr_panic(sp, sp->sl_head);
1631 /* In case sl_head is in the map but not in the slab */
1632 VERIFY(slab_inrange(sp, sp->sl_head));
1633 /* NOTREACHED */
1634 }
1635
1636 /* Increment slab reference */
1637 sp->sl_refcnt++;
1638
1639 if (mclaudit != NULL) {
1640 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1641 mca->mca_uflags = 0;
1642 /* Save contents on mbuf objects only */
1643 if (class == MC_MBUF)
1644 mca->mca_uflags |= MB_SCVALID;
1645 }
1646
1647 if (class == MC_CL) {
1648 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1649 /*
1650 * A 2K cluster slab can have at most NCLPBG references.
1651 */
1652 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1653 sp->sl_chunks == NCLPBG &&
1654 sp->sl_len == m_maxsize(MC_BIGCL));
1655 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1656 } else if (class == MC_BIGCL) {
1657 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1658 m_infree(MC_MBUF_BIGCL);
1659 /*
1660 * A 4K cluster slab can have at most 1 reference.
1661 */
1662 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1663 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1664 } else if (class == MC_16KCL) {
1665 mcl_slab_t *nsp;
1666 int k;
1667
1668 --m_infree(MC_16KCL);
1669 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1670 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1671 /*
1672 * Increment the 2nd through Nth slab references, where N is
1673 * NSLABSP16KB.  A 16KB big cluster spans NSLABSP16KB slabs,
1674 * each having at most 1 reference.
1675 */
1676 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1677 nsp = nsp->sl_next;
1678 /* Next slab must already be present */
1679 VERIFY(nsp != NULL);
1680 nsp->sl_refcnt++;
1681 VERIFY(!slab_is_detached(nsp));
1682 VERIFY(nsp->sl_class == MC_16KCL &&
1683 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1684 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1685 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1686 nsp->sl_head == NULL);
1687 }
1688 } else {
1689 VERIFY(class == MC_MBUF);
1690 --m_infree(MC_MBUF);
1691 /*
1692 * If auditing is turned on, this check is
1693 * deferred until later in mbuf_slab_audit().
1694 */
1695 if (mclaudit == NULL)
1696 _MCHECK((struct mbuf *)buf);
1697 /*
1698 * Since we have incremented the reference count above,
1699 * an mbuf slab (formerly a 4KB cluster slab that was cut
1700 * up into mbufs) must have a reference count between 1
1701 * and NMBPBG at this point.
1702 */
1703 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1704 sp->sl_chunks == NMBPBG &&
1705 sp->sl_len == m_maxsize(MC_BIGCL));
1706 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1707 }
1708
1709 /* If empty, remove this slab from the class's freelist */
1710 if (sp->sl_head == NULL) {
1711 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1712 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1713 slab_remove(sp, class);
1714 }
1715
1716 return (buf);
1717 }
1718
1719 /*
1720 * Place a slab of object(s) back into a class's slab list.
1721 */
1722 static void
1723 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1724 {
1725 mcl_slab_t *sp;
1726
1727 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1728
1729 VERIFY(class != MC_16KCL || njcl > 0);
1730 VERIFY(buf->obj_next == NULL);
1731 sp = slab_get(buf);
1732 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1733 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1734
1735 /* Decrement slab reference */
1736 sp->sl_refcnt--;
1737
1738 if (class == MC_CL) {
1739 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1740 /*
1741 * A slab that has been split into 2KB clusters can have
1742 * at most 1 outstanding reference at this point.
1743 */
1744 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1745 sp->sl_chunks == NCLPBG &&
1746 sp->sl_len == m_maxsize(MC_BIGCL));
1747 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1748 (slab_is_detached(sp) && sp->sl_head == NULL));
1749 } else if (class == MC_BIGCL) {
1750 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1751 /*
1752 * A 4KB cluster slab can have at most 1 reference
1753 * which must be 0 at this point.
1754 */
1755 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1756 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1757 VERIFY(slab_is_detached(sp));
1758 } else if (class == MC_16KCL) {
1759 mcl_slab_t *nsp;
1760 int k;
1761 /*
1762 * A 16KB cluster spans NSLABSP16KB slabs, all of which
1763 * must now have a reference count of 0.
1764 */
1765 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1766 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1767 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1768 VERIFY(slab_is_detached(sp));
1769 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1770 nsp = nsp->sl_next;
1771 /* Next slab must already be present */
1772 VERIFY(nsp != NULL);
1773 nsp->sl_refcnt--;
1774 VERIFY(slab_is_detached(nsp));
1775 VERIFY(nsp->sl_class == MC_16KCL &&
1776 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1777 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1778 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1779 nsp->sl_head == NULL);
1780 }
1781 } else {
1782 /*
1783 * A slab that has been split into mbufs can have at most NMBPBG
1784 * references. Since we have decremented one reference
1785 * above, it must now be between 0 and NMBPBG-1.
1786 */
1787 VERIFY(class == MC_MBUF);
1788 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1789 sp->sl_chunks == NMBPBG &&
1790 sp->sl_len == m_maxsize(MC_BIGCL));
1791 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1792 (slab_is_detached(sp) && sp->sl_head == NULL));
1793 }
1794
1795 /*
1796 * When auditing is enabled, ensure that the buffer still
1797 * contains the free pattern; otherwise it was corrupted
1798 * while in the CPU cache layer.
1799 */
1800 if (mclaudit != NULL) {
1801 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1802 if (mclverify) {
1803 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1804 }
1805 mca->mca_uflags &= ~MB_SCVALID;
1806 }
1807
1808 if (class == MC_CL) {
1809 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1810 buf->obj_next = sp->sl_head;
1811 } else if (class == MC_BIGCL) {
1812 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1813 m_infree(MC_MBUF_BIGCL);
1814 } else if (class == MC_16KCL) {
1815 ++m_infree(MC_16KCL);
1816 } else {
1817 ++m_infree(MC_MBUF);
1818 buf->obj_next = sp->sl_head;
1819 }
1820 sp->sl_head = buf;
1821
1822 /*
1823 * If a slab has been split into either 2KB clusters or mbufs and
1824 * is now fully free, turn it back into one that holds a single
1825 * 4KB cluster.
1826 */
1827 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1828 m_total(class) > m_minlimit(class) &&
1829 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1830 int i = NMBPBG;
1831
1832 m_total(MC_BIGCL)++;
1833 mbstat.m_bigclusters = m_total(MC_BIGCL);
1834 m_total(MC_MBUF) -= NMBPBG;
1835 mbstat.m_mbufs = m_total(MC_MBUF);
1836 m_infree(MC_MBUF) -= NMBPBG;
1837 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1838
1839 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1840 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1841
1842 while (i--) {
1843 struct mbuf *m = sp->sl_head;
1844 VERIFY(m != NULL);
1845 sp->sl_head = m->m_next;
1846 m->m_next = NULL;
1847 }
1848 VERIFY(sp->sl_head == NULL);
1849
1850 /* Remove the slab from the mbuf class's slab list */
1851 slab_remove(sp, class);
1852
1853 /* Reinitialize it as a 4KB cluster slab */
1854 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1855 sp->sl_len, 0, 1);
1856
1857 if (mclverify) {
1858 mcache_set_pattern(MCACHE_FREE_PATTERN,
1859 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1860 }
1861 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1862 m_infree(MC_MBUF_BIGCL);
1863
1864 VERIFY(slab_is_detached(sp));
1865 /* And finally switch class */
1866 class = MC_BIGCL;
1867 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1868 m_total(class) > m_minlimit(class) &&
1869 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1870 int i = NCLPBG;
1871
1872 m_total(MC_BIGCL)++;
1873 mbstat.m_bigclusters = m_total(MC_BIGCL);
1874 m_total(MC_CL) -= NCLPBG;
1875 mbstat.m_clusters = m_total(MC_CL);
1876 m_infree(MC_CL) -= NCLPBG;
1877 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1878 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1879
1880 while (i--) {
1881 union mcluster *c = sp->sl_head;
1882 VERIFY(c != NULL);
1883 sp->sl_head = c->mcl_next;
1884 c->mcl_next = NULL;
1885 }
1886 VERIFY(sp->sl_head == NULL);
1887
1888 /* Remove the slab from the 2KB cluster class's slab list */
1889 slab_remove(sp, class);
1890
1891 /* Reinitialize it as a 4KB cluster slab */
1892 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1893 sp->sl_len, 0, 1);
1894
1895 if (mclverify) {
1896 mcache_set_pattern(MCACHE_FREE_PATTERN,
1897 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1898 }
1899 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1900 m_infree(MC_MBUF_BIGCL);
1901
1902 VERIFY(slab_is_detached(sp));
1903 /* And finally switch class */
1904 class = MC_BIGCL;
1905 }
1906
1907 /* Reinsert the slab to the class's slab list */
1908 if (slab_is_detached(sp))
1909 slab_insert(sp, class);
1910 }
1911
1912 /*
1913 * Common allocator for rudimentary objects called by the CPU cache layer
1914 * during an allocation request whenever there is no available element in the
1915 * bucket layer. It returns one or more elements from the appropriate global
1916 * freelist. If the freelist is empty, it will attempt to populate it and
1917 * retry the allocation.
1918 */
1919 static unsigned int
1920 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1921 {
1922 mbuf_class_t class = (mbuf_class_t)arg;
1923 unsigned int need = num;
1924 mcache_obj_t **list = *plist;
1925
1926 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1927 ASSERT(need > 0);
1928
1929 lck_mtx_lock(mbuf_mlock);
1930
1931 for (;;) {
1932 if ((*list = slab_alloc(class, wait)) != NULL) {
1933 (*list)->obj_next = NULL;
1934 list = *plist = &(*list)->obj_next;
1935
1936 if (--need == 0) {
1937 /*
1938 * If the number of elements in the freelist has
1939 * dropped below the low watermark (1/32 of the class
1940 * total), asynchronously populate the freelist now
1941 * rather than doing it later when we run out.
1942 */
1943 if (!mbuf_cached_above(class, wait) &&
1944 m_infree(class) < m_total(class) >> 5) {
1945 (void) freelist_populate(class, 1,
1946 M_DONTWAIT);
1947 }
1948 break;
1949 }
1950 } else {
1951 VERIFY(m_infree(class) == 0 || class == MC_CL);
1952
1953 (void) freelist_populate(class, 1,
1954 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1955
1956 if (m_infree(class) > 0)
1957 continue;
1958
1959 /* Check if there's anything at the cache layer */
1960 if (mbuf_cached_above(class, wait))
1961 break;
1962
1963 /* watchdog checkpoint */
1964 mbuf_watchdog();
1965
1966 /* We have nothing and cannot block; give up */
1967 if (wait & MCR_NOSLEEP) {
1968 if (!(wait & MCR_TRYHARD)) {
1969 m_fail_cnt(class)++;
1970 mbstat.m_drops++;
1971 break;
1972 }
1973 }
1974
1975 /*
1976 * If the freelist is still empty and the caller is
1977 * willing to be blocked, sleep on the wait channel
1978 * until an element is available. Otherwise, if
1979 * MCR_TRYHARD is set, do our best to satisfy the
1980 * request without having to go to sleep.
1981 */
1982 if (mbuf_worker_ready &&
1983 mbuf_sleep(class, need, wait))
1984 break;
1985
1986 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1987 }
1988 }
1989
1990 m_alloc_cnt(class) += num - need;
1991 lck_mtx_unlock(mbuf_mlock);
1992
1993 return (num - need);
1994 }
1995
1996 /*
1997 * Common de-allocator for rudimentary objects called by the CPU cache
1998 * layer when one or more elements need to be returned to the appropriate
1999 * global freelist.
2000 */
2001 static void
2002 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2003 {
2004 mbuf_class_t class = (mbuf_class_t)arg;
2005 mcache_obj_t *nlist;
2006 unsigned int num = 0;
2007 int w;
2008
2009 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2010
2011 lck_mtx_lock(mbuf_mlock);
2012
2013 for (;;) {
2014 nlist = list->obj_next;
2015 list->obj_next = NULL;
2016 slab_free(class, list);
2017 ++num;
2018 if ((list = nlist) == NULL)
2019 break;
2020 }
2021 m_free_cnt(class) += num;
2022
2023 if ((w = mb_waiters) > 0)
2024 mb_waiters = 0;
2025
2026 lck_mtx_unlock(mbuf_mlock);
2027
2028 if (w != 0)
2029 wakeup(mb_waitchan);
2030 }
2031
2032 /*
2033 * Common auditor for rudimentary objects called by the CPU cache layer
2034 * during an allocation or free request. For the former, this is called
2035 * after the objects are obtained from either the bucket or slab layer
2036 * and before they are returned to the caller. For the latter, this is
2037 * called immediately during free and before placing the objects into
2038 * the bucket or slab layer.
2039 */
2040 static void
2041 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2042 {
2043 mbuf_class_t class = (mbuf_class_t)arg;
2044 mcache_audit_t *mca;
2045
2046 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2047
2048 while (list != NULL) {
2049 lck_mtx_lock(mbuf_mlock);
2050 mca = mcl_audit_buf2mca(class, list);
2051
2052 /* Do the sanity checks */
2053 if (class == MC_MBUF) {
2054 mcl_audit_mbuf(mca, list, FALSE, alloc);
2055 ASSERT(mca->mca_uflags & MB_SCVALID);
2056 } else {
2057 mcl_audit_cluster(mca, list, m_maxsize(class),
2058 alloc, TRUE);
2059 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2060 }
2061 /* Record this transaction */
2062 if (mcltrace)
2063 mcache_buffer_log(mca, list, m_cache(class));
2064
2065 if (alloc)
2066 mca->mca_uflags |= MB_INUSE;
2067 else
2068 mca->mca_uflags &= ~MB_INUSE;
2069 /* Unpair the object (unconditionally) */
2070 mca->mca_uptr = NULL;
2071 lck_mtx_unlock(mbuf_mlock);
2072
2073 list = list->obj_next;
2074 }
2075 }
2076
2077 /*
2078 * Common notify routine for all caches. It is called by mcache when
2079 * one or more objects get freed. We use this indication to trigger
2080 * the wakeup of any sleeping threads so that they can retry their
2081 * allocation requests.
2082 */
2083 static void
2084 mbuf_slab_notify(void *arg, u_int32_t reason)
2085 {
2086 mbuf_class_t class = (mbuf_class_t)arg;
2087 int w;
2088
2089 ASSERT(MBUF_CLASS_VALID(class));
2090
2091 if (reason != MCN_RETRYALLOC)
2092 return;
2093
2094 lck_mtx_lock(mbuf_mlock);
2095 if ((w = mb_waiters) > 0) {
2096 m_notified(class)++;
2097 mb_waiters = 0;
2098 }
2099 lck_mtx_unlock(mbuf_mlock);
2100
2101 if (w != 0)
2102 wakeup(mb_waitchan);
2103 }
2104
2105 /*
2106 * Obtain object(s) from the composite class's freelist.
2107 */
2108 static unsigned int
2109 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2110 {
2111 unsigned int need = num;
2112 mcl_slab_t *sp, *clsp, *nsp;
2113 struct mbuf *m;
2114 mcache_obj_t **list = *plist;
2115 void *cl;
2116
2117 VERIFY(need > 0);
2118 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2119 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2120
2121 /* Get what we can from the freelist */
2122 while ((*list = m_cobjlist(class)) != NULL) {
2123 MRANGE(*list);
2124
2125 m = (struct mbuf *)*list;
2126 sp = slab_get(m);
2127 cl = m->m_ext.ext_buf;
2128 clsp = slab_get(cl);
2129 VERIFY(m->m_flags == M_EXT && cl != NULL);
2130 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2131
2132 if (class == MC_MBUF_CL) {
2133 VERIFY(clsp->sl_refcnt >= 1 &&
2134 clsp->sl_refcnt <= NCLPBG);
2135 } else {
2136 VERIFY(clsp->sl_refcnt == 1);
2137 }
2138
2139 if (class == MC_MBUF_16KCL) {
2140 int k;
2141 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2142 nsp = nsp->sl_next;
2143 /* Next slab must already be present */
2144 VERIFY(nsp != NULL);
2145 VERIFY(nsp->sl_refcnt == 1);
2146 }
2147 }
2148
2149 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2150 !MBUF_IN_MAP(m_cobjlist(class))) {
2151 slab_nextptr_panic(sp, m_cobjlist(class));
2152 /* NOTREACHED */
2153 }
2154 (*list)->obj_next = NULL;
2155 list = *plist = &(*list)->obj_next;
2156
2157 if (--need == 0)
2158 break;
2159 }
2160 m_infree(class) -= (num - need);
2161
2162 return (num - need);
2163 }
2164
2165 /*
2166 * Place object(s) back into a composite class's freelist.
2167 */
2168 static unsigned int
2169 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2170 {
2171 mcache_obj_t *o, *tail;
2172 unsigned int num = 0;
2173 struct mbuf *m, *ms;
2174 mcache_audit_t *mca = NULL;
2175 mcache_obj_t *ref_list = NULL;
2176 mcl_slab_t *clsp, *nsp;
2177 void *cl;
2178 mbuf_class_t cl_class;
2179
2180 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2181 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2182 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2183
2184 if (class == MC_MBUF_CL) {
2185 cl_class = MC_CL;
2186 } else if (class == MC_MBUF_BIGCL) {
2187 cl_class = MC_BIGCL;
2188 } else {
2189 VERIFY(class == MC_MBUF_16KCL);
2190 cl_class = MC_16KCL;
2191 }
2192
2193 o = tail = list;
2194
2195 while ((m = ms = (struct mbuf *)o) != NULL) {
2196 mcache_obj_t *rfa, *nexto = o->obj_next;
2197
2198 /* Do the mbuf sanity checks */
2199 if (mclaudit != NULL) {
2200 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2201 if (mclverify) {
2202 mcache_audit_free_verify(mca, m, 0,
2203 m_maxsize(MC_MBUF));
2204 }
2205 ms = (struct mbuf *)mca->mca_contents;
2206 }
2207
2208 /* Do the cluster sanity checks */
2209 cl = ms->m_ext.ext_buf;
2210 clsp = slab_get(cl);
2211 if (mclverify) {
2212 size_t size = m_maxsize(cl_class);
2213 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2214 (mcache_obj_t *)cl), cl, 0, size);
2215 }
2216 VERIFY(ms->m_type == MT_FREE);
2217 VERIFY(ms->m_flags == M_EXT);
2218 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2219 if (cl_class == MC_CL) {
2220 VERIFY(clsp->sl_refcnt >= 1 &&
2221 clsp->sl_refcnt <= NCLPBG);
2222 } else {
2223 VERIFY(clsp->sl_refcnt == 1);
2224 }
2225 if (cl_class == MC_16KCL) {
2226 int k;
2227 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2228 nsp = nsp->sl_next;
2229 /* Next slab must already be present */
2230 VERIFY(nsp != NULL);
2231 VERIFY(nsp->sl_refcnt == 1);
2232 }
2233 }
2234
2235 /*
2236 * If we're asked to purge, restore the actual mbuf using
2237 * contents of the shadow structure (if auditing is enabled)
2238 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2239 * about to free it and the attached cluster into their caches.
2240 */
2241 if (purged) {
2242 /* Restore constructed mbuf fields */
2243 if (mclaudit != NULL)
2244 mcl_audit_restore_mbuf(m, mca, TRUE);
2245
2246 MEXT_REF(m) = 0;
2247 MEXT_FLAGS(m) = 0;
2248
2249 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2250 rfa->obj_next = ref_list;
2251 ref_list = rfa;
2252 MEXT_RFA(m) = NULL;
2253
2254 m->m_type = MT_FREE;
2255 m->m_flags = m->m_len = 0;
2256 m->m_next = m->m_nextpkt = NULL;
2257
2258 /* Save mbuf fields and make auditing happy */
2259 if (mclaudit != NULL)
2260 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2261
2262 VERIFY(m_total(class) > 0);
2263 m_total(class)--;
2264
2265 /* Free the mbuf */
2266 o->obj_next = NULL;
2267 slab_free(MC_MBUF, o);
2268
2269 /* And free the cluster */
2270 ((mcache_obj_t *)cl)->obj_next = NULL;
2271 if (class == MC_MBUF_CL)
2272 slab_free(MC_CL, cl);
2273 else if (class == MC_MBUF_BIGCL)
2274 slab_free(MC_BIGCL, cl);
2275 else
2276 slab_free(MC_16KCL, cl);
2277 }
2278
2279 ++num;
2280 tail = o;
2281 o = nexto;
2282 }
2283
2284 if (!purged) {
2285 tail->obj_next = m_cobjlist(class);
2286 m_cobjlist(class) = list;
2287 m_infree(class) += num;
2288 } else if (ref_list != NULL) {
2289 mcache_free_ext(ref_cache, ref_list);
2290 }
2291
2292 return (num);
2293 }
2294
2295 /*
2296 * Common allocator for composite objects called by the CPU cache layer
2297 * during an allocation request whenever there is no available element in
2298 * the bucket layer. It returns one or more composite elements from the
2299 * appropriate global freelist. If the freelist is empty, it will attempt
2300 * to obtain the rudimentary objects from their caches and construct them
2301 * into composite mbuf + cluster objects.
2302 */
2303 static unsigned int
2304 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2305 int wait)
2306 {
2307 mbuf_class_t class = (mbuf_class_t)arg;
2308 mbuf_class_t cl_class = 0;
2309 unsigned int num = 0, cnum = 0, want = needed;
2310 mcache_obj_t *ref_list = NULL;
2311 mcache_obj_t *mp_list = NULL;
2312 mcache_obj_t *clp_list = NULL;
2313 mcache_obj_t **list;
2314 struct ext_ref *rfa;
2315 struct mbuf *m;
2316 void *cl;
2317
2318 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2319 ASSERT(needed > 0);
2320
2321 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2322
2323 /* There should not be any slab for this class */
2324 VERIFY(m_slab_cnt(class) == 0 &&
2325 m_slablist(class).tqh_first == NULL &&
2326 m_slablist(class).tqh_last == NULL);
2327
2328 lck_mtx_lock(mbuf_mlock);
2329
2330 /* Try using the freelist first */
2331 num = cslab_alloc(class, plist, needed);
2332 list = *plist;
2333 if (num == needed) {
2334 m_alloc_cnt(class) += num;
2335 lck_mtx_unlock(mbuf_mlock);
2336 return (needed);
2337 }
2338
2339 lck_mtx_unlock(mbuf_mlock);
2340
2341 /*
2342 * We could not satisfy the request using the freelist alone;
2343 * allocate from the appropriate rudimentary caches and use
2344 * whatever we can get to construct the composite objects.
2345 */
2346 needed -= num;
2347
2348 /*
2349 * Mark these allocation requests as coming from a composite cache.
2350 * Also, if the caller is willing to be blocked, mark the request
2351 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2352 * slab layer waiting for the individual object when one or more
2353 * of the already-constructed composite objects are available.
2354 */
2355 wait |= MCR_COMP;
2356 if (!(wait & MCR_NOSLEEP))
2357 wait |= MCR_FAILOK;
2358
2359 /* allocate mbufs */
2360 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2361 if (needed == 0) {
2362 ASSERT(mp_list == NULL);
2363 goto fail;
2364 }
2365
2366 /* allocate clusters */
2367 if (class == MC_MBUF_CL) {
2368 cl_class = MC_CL;
2369 } else if (class == MC_MBUF_BIGCL) {
2370 cl_class = MC_BIGCL;
2371 } else {
2372 VERIFY(class == MC_MBUF_16KCL);
2373 cl_class = MC_16KCL;
2374 }
2375 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2376 if (needed == 0) {
2377 ASSERT(clp_list == NULL);
2378 goto fail;
2379 }
2380
2381 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2382 if (needed == 0) {
2383 ASSERT(ref_list == NULL);
2384 goto fail;
2385 }
2386
2387 /*
2388 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2389 * leftovers will get freed before we return to the caller.
2390 */
2391 for (cnum = 0; cnum < needed; cnum++) {
2392 struct mbuf *ms;
2393
2394 m = ms = (struct mbuf *)mp_list;
2395 mp_list = mp_list->obj_next;
2396
2397 cl = clp_list;
2398 clp_list = clp_list->obj_next;
2399 ((mcache_obj_t *)cl)->obj_next = NULL;
2400
2401 rfa = (struct ext_ref *)ref_list;
2402 ref_list = ref_list->obj_next;
2403 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2404
2405 /*
2406 * If auditing is enabled, construct the shadow mbuf
2407 * in the audit structure instead of in the actual one.
2408 * mbuf_cslab_audit() will take care of restoring the
2409 * contents after the integrity check.
2410 */
2411 if (mclaudit != NULL) {
2412 mcache_audit_t *mca, *cl_mca;
2413
2414 lck_mtx_lock(mbuf_mlock);
2415 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2416 ms = ((struct mbuf *)mca->mca_contents);
2417 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2418
2419 /*
2420 * Pair them up. Note that this is done at the time
2421 * the mbuf+cluster objects are constructed. This
2422 * information should be treated as a "best effort"
2423 * debugging hint, since more than one mbuf can refer
2424 * to a cluster. In that case, the cluster might not
2425 * be freed along with the mbuf it was paired with.
2426 */
2427 mca->mca_uptr = cl_mca;
2428 cl_mca->mca_uptr = mca;
2429
2430 ASSERT(mca->mca_uflags & MB_SCVALID);
2431 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2432 lck_mtx_unlock(mbuf_mlock);
2433
2434 /* Technically, they are in the freelist */
2435 if (mclverify) {
2436 size_t size;
2437
2438 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2439 m_maxsize(MC_MBUF));
2440
2441 if (class == MC_MBUF_CL)
2442 size = m_maxsize(MC_CL);
2443 else if (class == MC_MBUF_BIGCL)
2444 size = m_maxsize(MC_BIGCL);
2445 else
2446 size = m_maxsize(MC_16KCL);
2447
2448 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2449 size);
2450 }
2451 }
2452
2453 MBUF_INIT(ms, 0, MT_FREE);
2454 if (class == MC_MBUF_16KCL) {
2455 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2456 } else if (class == MC_MBUF_BIGCL) {
2457 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2458 } else {
2459 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2460 }
2461 VERIFY(ms->m_flags == M_EXT);
2462 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2463
2464 *list = (mcache_obj_t *)m;
2465 (*list)->obj_next = NULL;
2466 list = *plist = &(*list)->obj_next;
2467 }
2468
2469 fail:
2470 /*
2471 * Free up what's left of the above.
2472 */
2473 if (mp_list != NULL)
2474 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2475 if (clp_list != NULL)
2476 mcache_free_ext(m_cache(cl_class), clp_list);
2477 if (ref_list != NULL)
2478 mcache_free_ext(ref_cache, ref_list);
2479
2480 lck_mtx_lock(mbuf_mlock);
2481 if (num > 0 || cnum > 0) {
2482 m_total(class) += cnum;
2483 VERIFY(m_total(class) <= m_maxlimit(class));
2484 m_alloc_cnt(class) += num + cnum;
2485 }
2486 if ((num + cnum) < want)
2487 m_fail_cnt(class) += (want - (num + cnum));
2488 lck_mtx_unlock(mbuf_mlock);
2489
2490 return (num + cnum);
2491 }
2492
2493 /*
2494 * Common de-allocator for composite objects called by the CPU cache
2495 * layer when one or more elements need to be returned to the appropriate
2496 * global freelist.
2497 */
2498 static void
2499 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2500 {
2501 mbuf_class_t class = (mbuf_class_t)arg;
2502 unsigned int num;
2503 int w;
2504
2505 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2506
2507 lck_mtx_lock(mbuf_mlock);
2508
2509 num = cslab_free(class, list, purged);
2510 m_free_cnt(class) += num;
2511
2512 if ((w = mb_waiters) > 0)
2513 mb_waiters = 0;
2514
2515 lck_mtx_unlock(mbuf_mlock);
2516
2517 if (w != 0)
2518 wakeup(mb_waitchan);
2519 }
2520
2521 /*
2522 * Common auditor for composite objects called by the CPU cache layer
2523 * during an allocation or free request. For the former, this is called
2524 * after the objects are obtained from either the bucket or slab layer
2525 * and before they are returned to the caller. For the latter, this is
2526 * called immediately during free and before placing the objects into
2527 * the bucket or slab layer.
2528 */
2529 static void
2530 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2531 {
2532 mbuf_class_t class = (mbuf_class_t)arg;
2533 mcache_audit_t *mca;
2534 struct mbuf *m, *ms;
2535 mcl_slab_t *clsp, *nsp;
2536 size_t size;
2537 void *cl;
2538
2539 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2540
2541 while ((m = ms = (struct mbuf *)list) != NULL) {
2542 lck_mtx_lock(mbuf_mlock);
2543 /* Do the mbuf sanity checks and record its transaction */
2544 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2545 mcl_audit_mbuf(mca, m, TRUE, alloc);
2546 if (mcltrace)
2547 mcache_buffer_log(mca, m, m_cache(class));
2548
2549 if (alloc)
2550 mca->mca_uflags |= MB_COMP_INUSE;
2551 else
2552 mca->mca_uflags &= ~MB_COMP_INUSE;
2553
2554 /*
2555 * Use the shadow mbuf in the audit structure if we are
2556 * freeing, since the contents of the actual mbuf have been
2557 * pattern-filled by the above call to mcl_audit_mbuf().
2558 */
2559 if (!alloc && mclverify)
2560 ms = (struct mbuf *)mca->mca_contents;
2561
2562 /* Do the cluster sanity checks and record its transaction */
2563 cl = ms->m_ext.ext_buf;
2564 clsp = slab_get(cl);
2565 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2566 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2567 if (class == MC_MBUF_CL)
2568 VERIFY(clsp->sl_refcnt >= 1 &&
2569 clsp->sl_refcnt <= NCLPBG);
2570 else
2571 VERIFY(clsp->sl_refcnt == 1);
2572
2573 if (class == MC_MBUF_16KCL) {
2574 int k;
2575 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2576 nsp = nsp->sl_next;
2577 /* Next slab must already be present */
2578 VERIFY(nsp != NULL);
2579 VERIFY(nsp->sl_refcnt == 1);
2580 }
2581 }
2582
2583 mca = mcl_audit_buf2mca(MC_CL, cl);
2584 if (class == MC_MBUF_CL)
2585 size = m_maxsize(MC_CL);
2586 else if (class == MC_MBUF_BIGCL)
2587 size = m_maxsize(MC_BIGCL);
2588 else
2589 size = m_maxsize(MC_16KCL);
2590 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2591 if (mcltrace)
2592 mcache_buffer_log(mca, cl, m_cache(class));
2593
2594 if (alloc)
2595 mca->mca_uflags |= MB_COMP_INUSE;
2596 else
2597 mca->mca_uflags &= ~MB_COMP_INUSE;
2598 lck_mtx_unlock(mbuf_mlock);
2599
2600 list = list->obj_next;
2601 }
2602 }
2603
2604 /*
2605 * Allocate some number of mbuf clusters and place on cluster freelist.
2606 */
2607 static int
2608 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2609 {
2610 int i;
2611 vm_size_t size = 0;
2612 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2613 vm_offset_t page = 0;
2614 mcache_audit_t *mca_list = NULL;
2615 mcache_obj_t *con_list = NULL;
2616 mcl_slab_t *sp;
2617
2618 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2619 bufsize == m_maxsize(MC_16KCL));
2620
2621 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2622
2623 /*
2624 * Multiple threads may attempt to populate the cluster map one
2625 * after another. Since we drop the lock below prior to acquiring
2626 * the physical page(s), our view of the cluster map may no longer
2627 * be accurate, and we could end up over-committing the pages beyond
2628 * the maximum allowed for each class. To prevent that, this entire
2629 * operation (including the page mapping) is serialized.
2630 */
2631 while (mb_clalloc_busy) {
2632 mb_clalloc_waiters++;
2633 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2634 (PZERO-1), "m_clalloc", NULL);
2635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2636 }
2637
2638 /* We are busy now; tell everyone else to go away */
2639 mb_clalloc_busy = TRUE;
2640
2641 /*
2642 * Honor the caller's wish to block or not block. We have a way
2643 * to grow the pool asynchronously using the mbuf worker thread.
2644 */
2645 i = m_howmany(num, bufsize);
2646 if (i == 0 || (wait & M_DONTWAIT))
2647 goto out;
2648
2649 lck_mtx_unlock(mbuf_mlock);
2650
2651 size = round_page(i * bufsize);
2652 page = kmem_mb_alloc(mb_map, size, large_buffer);
2653
2654 /*
2655 * If we did ask for "n" 16KB physically contiguous chunks
2656 * and didn't get them, then try again without this
2657 * restriction.
2658 */
2659 if (large_buffer && page == 0)
2660 page = kmem_mb_alloc(mb_map, size, 0);
2661
2662 if (page == 0) {
2663 if (bufsize == m_maxsize(MC_BIGCL)) {
2664 /* The allocation failed; fall back to a single 4KB page */
2665 size = NBPG;
2666 page = kmem_mb_alloc(mb_map, size, 0);
2667 }
2668
2669 if (page == 0) {
2670 lck_mtx_lock(mbuf_mlock);
2671 goto out;
2672 }
2673 }
2674
2675 VERIFY(IS_P2ALIGNED(page, NBPG));
2676 numpages = size / NBPG;
2677
2678 /* If auditing is enabled, allocate the audit structures now */
2679 if (mclaudit != NULL) {
2680 int needed;
2681
2682 /*
2683 * Yes, I realize this is a waste of memory for clusters
2684 * that never get transformed into mbufs, as we may end
2685 * up with NMBPBG-1 unused audit structures per cluster.
2686 * But doing so tremendously simplifies the allocation
2687 * strategy, since at this point we are not holding the
2688 * mbuf lock and the caller is okay to be blocked.
2689 */
2690 if (bufsize == m_maxsize(MC_BIGCL)) {
2691 needed = numpages * NMBPBG;
2692
2693 i = mcache_alloc_ext(mcl_audit_con_cache,
2694 &con_list, needed, MCR_SLEEP);
2695
2696 VERIFY(con_list != NULL && i == needed);
2697 } else {
2698 needed = numpages / NSLABSP16KB;
2699 }
2700
2701 i = mcache_alloc_ext(mcache_audit_cache,
2702 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2703
2704 VERIFY(mca_list != NULL && i == needed);
2705 }
2706
2707 lck_mtx_lock(mbuf_mlock);
2708
2709 for (i = 0; i < numpages; i++, page += NBPG) {
2710 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2711 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2712 (vm_offset_t)page);
2713
2714 /*
2715 * If no mapper is available, the following call is a no-op
2716 * and returns the input page; if there is a mapper, the
2717 * appropriate I/O page is returned.
2718 */
2719 VERIFY(offset < mcl_pages);
2720 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2721 mcl_paddr[offset] = new_page << PGSHIFT;
2722
2723 /* Pattern-fill this fresh page */
2724 if (mclverify) {
2725 mcache_set_pattern(MCACHE_FREE_PATTERN,
2726 (caddr_t)page, NBPG);
2727 }
2728 if (bufsize == m_maxsize(MC_BIGCL)) {
2729 union mbigcluster *mbc = (union mbigcluster *)page;
2730
2731 /* One for the entire page */
2732 sp = slab_get(mbc);
2733 if (mclaudit != NULL) {
2734 mcl_audit_init(mbc, &mca_list, &con_list,
2735 AUDIT_CONTENTS_SIZE, NMBPBG);
2736 }
2737 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2738 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2739 mbc, mbc, bufsize, 0, 1);
2740
2741 /* Insert this slab */
2742 slab_insert(sp, MC_BIGCL);
2743
2744 /* Update stats now since slab_get() drops the lock */
2745 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2746 m_infree(MC_MBUF_BIGCL);
2747 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2748 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2749 } else if ((i % NSLABSP16KB) == 0) {
2750 union m16kcluster *m16kcl = (union m16kcluster *)page;
2751 mcl_slab_t *nsp;
2752 int k;
2753
2754 VERIFY(njcl > 0);
2755 /* One for the entire 16KB */
2756 sp = slab_get(m16kcl);
2757 if (mclaudit != NULL)
2758 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2759
2760 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2761 slab_init(sp, MC_16KCL, SLF_MAPPED,
2762 m16kcl, m16kcl, bufsize, 0, 1);
2763
2764 /*
2765 * 2nd-Nth page's slab is part of the first one,
2766 * where N is NSLABSP16KB.
2767 */
2768 for (k = 1; k < NSLABSP16KB; k++) {
2769 nsp = slab_get(((union mbigcluster *)page) + k);
2770 VERIFY(nsp->sl_refcnt == 0 &&
2771 nsp->sl_flags == 0);
2772 slab_init(nsp, MC_16KCL,
2773 SLF_MAPPED | SLF_PARTIAL,
2774 m16kcl, NULL, 0, 0, 0);
2775 }
2776
2777 /* Insert this slab */
2778 slab_insert(sp, MC_16KCL);
2779
2780 /* Update stats now since slab_get() drops the lock */
2781 m_infree(MC_16KCL)++;
2782 m_total(MC_16KCL)++;
2783 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2784 }
2785 }
2786 VERIFY(mca_list == NULL && con_list == NULL);
2787
2788 /* We're done; let others enter */
2789 mb_clalloc_busy = FALSE;
2790 if (mb_clalloc_waiters > 0) {
2791 mb_clalloc_waiters = 0;
2792 wakeup(mb_clalloc_waitchan);
2793 }
2794
2795 if (bufsize == m_maxsize(MC_BIGCL))
2796 return (numpages);
2797
2798 VERIFY(bufsize == m_maxsize(MC_16KCL));
2799 return (numpages / NSLABSP16KB);
2800
2801 out:
2802 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2803
2804 /* We're done; let others enter */
2805 mb_clalloc_busy = FALSE;
2806 if (mb_clalloc_waiters > 0) {
2807 mb_clalloc_waiters = 0;
2808 wakeup(mb_clalloc_waitchan);
2809 }
2810
2811 /*
2812 * When non-blocking, we kick the worker thread if we have to grow the
2813 * pool or if the number of free clusters is less than requested.
2814 */
2815 if (bufsize == m_maxsize(MC_BIGCL)) {
2816 if (i > 0) {
2817 /*
2818 * Remember total number of 4KB clusters needed
2819 * at this time.
2820 */
2821 i += m_total(MC_BIGCL);
2822 if (i > mbuf_expand_big) {
2823 mbuf_expand_big = i;
2824 if (mbuf_worker_ready)
2825 wakeup((caddr_t)&mbuf_worker_run);
2826 }
2827 }
2828
2829 if (m_infree(MC_BIGCL) >= num)
2830 return (1);
2831 } else {
2832 if (i > 0) {
2833 /*
2834 * Remember total number of 16KB clusters needed
2835 * at this time.
2836 */
2837 i += m_total(MC_16KCL);
2838 if (i > mbuf_expand_16k) {
2839 mbuf_expand_16k = i;
2840 if (mbuf_worker_ready)
2841 wakeup((caddr_t)&mbuf_worker_run);
2842 }
2843 }
2844
2845 if (m_infree(MC_16KCL) >= num)
2846 return (1);
2847 }
2848 return (0);
2849 }
2850
2851 /*
2852 * Populate the global freelist of the corresponding buffer class.
2853 */
2854 static int
2855 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2856 {
2857 mcache_obj_t *o = NULL;
2858 int i, numpages = 0, count;
2859
2860 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2861 class == MC_16KCL);
2862
2863 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2864
2865 switch (class) {
2866 case MC_MBUF:
2867 case MC_CL:
2868 case MC_BIGCL:
2869 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2870 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2871
2872 /* Respect the 4KB cluster minimum limit */
2873 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2874 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2875 if (class != MC_BIGCL || (wait & MCR_COMP))
2876 return (0);
2877 }
2878 if (class == MC_BIGCL)
2879 return (i != 0);
2880 break;
2881
2882 case MC_16KCL:
2883 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2884 /* NOTREACHED */
2885
2886 default:
2887 VERIFY(0);
2888 /* NOTREACHED */
2889 }
2890
2891 VERIFY(class == MC_MBUF || class == MC_CL);
2892
2893 /* how many objects will we cut the page into? */
2894 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
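
/*
 * Editorial note (the figures assume the usual 4KB big cluster with a
 * 256-byte MSIZE and 2KB MCLBYTES; check the local definitions):
 *
 *	NMBPBG = 4096 / 256  = 16 mbufs per 4KB slab
 *	NCLPBG = 4096 / 2048 = 2  clusters per 4KB slab
 *
 * so each 4KB cluster taken off the MC_BIGCL freelist below is carved
 * into either 16 mbufs or 2 clusters.
 */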
2895
2896 for (count = 0; count < numpages; count++) {
2897
2898 /* respect totals, minlimit, maxlimit */
2899 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2900 m_total(class) >= m_maxlimit(class))
2901 break;
2902
2903 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2904 break;
2905
2906 struct mbuf *m = (struct mbuf *)o;
2907 union mcluster *c = (union mcluster *)o;
2908 mcl_slab_t *sp = slab_get(o);
2909 mcache_audit_t *mca = NULL;
2910
2911 VERIFY(slab_is_detached(sp) &&
2912 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2913
2914 /*
2915 * Make sure that the cluster is unmolested
2916 * while in freelist
2917 */
2918 if (mclverify) {
2919 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2920 mcache_audit_free_verify(mca, o, 0,
2921 m_maxsize(MC_BIGCL));
2922 }
2923
2924 /* Reinitialize it as an mbuf or 2K slab */
2925 slab_init(sp, class, sp->sl_flags,
2926 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2927
2928 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2929 VERIFY(sp->sl_head == NULL);
2930
2931 VERIFY(m_total(MC_BIGCL) > 0);
2932 m_total(MC_BIGCL)--;
2933 mbstat.m_bigclusters = m_total(MC_BIGCL);
2934
2935 m_total(class) += numobj;
2936 m_infree(class) += numobj;
2937
2938 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2939 VERIFY(m_total(class) <= m_maxlimit(class));
2940
2941 i = numobj;
2942 if (class == MC_MBUF) {
2943 mbstat.m_mbufs = m_total(MC_MBUF);
2944 mtype_stat_add(MT_FREE, NMBPBG);
2945 while (i--) {
2946 /*
2947 * If auditing is enabled, construct the
2948 * shadow mbuf in the audit structure
2949 * instead of the actual one.
2950 * mbuf_slab_audit() will take care of
2951 * restoring the contents after the
2952 * integrity check.
2953 */
2954 if (mclaudit != NULL) {
2955 struct mbuf *ms;
2956 mca = mcl_audit_buf2mca(MC_MBUF,
2957 (mcache_obj_t *)m);
2958 ms = ((struct mbuf *)
2959 mca->mca_contents);
2960 ms->m_type = MT_FREE;
2961 } else {
2962 m->m_type = MT_FREE;
2963 }
2964 m->m_next = sp->sl_head;
2965 sp->sl_head = (void *)m++;
2966 }
2967 } else { /* MC_CL */
2968 mbstat.m_clfree =
2969 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2970 mbstat.m_clusters = m_total(MC_CL);
2971 while (i--) {
2972 c->mcl_next = sp->sl_head;
2973 sp->sl_head = (void *)c++;
2974 }
2975 }
2976
2977 /* Insert into the mbuf or 2k slab list */
2978 slab_insert(sp, class);
2979
2980 if ((i = mb_waiters) > 0)
2981 mb_waiters = 0;
2982 if (i != 0)
2983 wakeup(mb_waitchan);
2984 }
2985 return (count != 0);
2986 }
2987
2988 /*
2989 * Initialize the given class's freelist to hold m_minlimit() objects.
2990 */
2991 static void
2992 freelist_init(mbuf_class_t class)
2993 {
2994 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2995
2996 VERIFY(class == MC_CL || class == MC_BIGCL);
2997 VERIFY(m_total(class) == 0);
2998 VERIFY(m_minlimit(class) > 0);
2999
3000 while (m_total(class) < m_minlimit(class))
3001 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3002
3003 VERIFY(m_total(class) >= m_minlimit(class));
3004 }
3005
3006 /*
3007 * (Inaccurately) check if it might be worth a trip back to the
3008 * mcache layer due to the availability of objects there. We'll
3009 * end up back here if there's nothing up there.
3010 */
3011 static boolean_t
3012 mbuf_cached_above(mbuf_class_t class, int wait)
3013 {
3014 switch (class) {
3015 case MC_MBUF:
3016 if (wait & MCR_COMP)
3017 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3018 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3019 break;
3020
3021 case MC_CL:
3022 if (wait & MCR_COMP)
3023 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3024 break;
3025
3026 case MC_BIGCL:
3027 if (wait & MCR_COMP)
3028 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3029 break;
3030
3031 case MC_16KCL:
3032 if (wait & MCR_COMP)
3033 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3034 break;
3035
3036 case MC_MBUF_CL:
3037 case MC_MBUF_BIGCL:
3038 case MC_MBUF_16KCL:
3039 break;
3040
3041 default:
3042 VERIFY(0);
3043 /* NOTREACHED */
3044 }
3045
3046 return (!mcache_bkt_isempty(m_cache(class)));
3047 }
3048
3049 /*
3050 * If possible, convert constructed objects to raw ones.
3051 */
3052 static boolean_t
3053 mbuf_steal(mbuf_class_t class, unsigned int num)
3054 {
3055 mcache_obj_t *top = NULL;
3056 mcache_obj_t **list = &top;
3057 unsigned int tot = 0;
3058
3059 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3060
3061 switch (class) {
3062 case MC_MBUF:
3063 case MC_CL:
3064 case MC_BIGCL:
3065 case MC_16KCL:
3066 return (FALSE);
3067
3068 case MC_MBUF_CL:
3069 case MC_MBUF_BIGCL:
3070 case MC_MBUF_16KCL:
3071 /* Get the required number of constructed objects if possible */
3072 if (m_infree(class) > m_minlimit(class)) {
3073 tot = cslab_alloc(class, &list,
3074 MIN(num, m_infree(class)));
3075 }
3076
3077 /* And destroy them to get back the raw objects */
3078 if (top != NULL)
3079 (void) cslab_free(class, top, 1);
3080 break;
3081
3082 default:
3083 VERIFY(0);
3084 /* NOTREACHED */
3085 }
3086
3087 return (tot == num);
3088 }
3089
3090 static void
3091 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3092 {
3093 int m, bmap = 0;
3094
3095 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3096
3097 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3098 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3099 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3100
3101 /*
3102 * This logic can be made smarter; for now, simply mark
3103 * all other related classes as potential victims.
3104 */
3105 switch (class) {
3106 case MC_MBUF:
3107 m_wantpurge(MC_CL)++;
3108 m_wantpurge(MC_BIGCL)++;
3109 m_wantpurge(MC_MBUF_CL)++;
3110 m_wantpurge(MC_MBUF_BIGCL)++;
3111 break;
3112
3113 case MC_CL:
3114 m_wantpurge(MC_MBUF)++;
3115 m_wantpurge(MC_BIGCL)++;
3116 m_wantpurge(MC_MBUF_BIGCL)++;
3117 if (!comp)
3118 m_wantpurge(MC_MBUF_CL)++;
3119 break;
3120
3121 case MC_BIGCL:
3122 m_wantpurge(MC_MBUF)++;
3123 m_wantpurge(MC_CL)++;
3124 m_wantpurge(MC_MBUF_CL)++;
3125 if (!comp)
3126 m_wantpurge(MC_MBUF_BIGCL)++;
3127 break;
3128
3129 case MC_16KCL:
3130 if (!comp)
3131 m_wantpurge(MC_MBUF_16KCL)++;
3132 break;
3133
3134 default:
3135 VERIFY(0);
3136 /* NOTREACHED */
3137 }
3138
3139 /*
3140 * Run through each marked class and check if we really need to
3141 * purge (and therefore temporarily disable) the per-CPU cache
3142 * layer used by the class. If so, remember the classes, since
3143 * we are going to drop the lock below prior to purging.
3144 */
3145 for (m = 0; m < NELEM(mbuf_table); m++) {
3146 if (m_wantpurge(m) > 0) {
3147 m_wantpurge(m) = 0;
3148 /*
3149 * Try hard to steal the required number of objects
3150 * from the freelists of other mbuf classes. Only
3151 * purge and disable the per-CPU cache layer when
3152 * we don't have enough; it's the last resort.
3153 */
3154 if (!mbuf_steal(m, num))
3155 bmap |= (1 << m);
3156 }
3157 }
3158
3159 lck_mtx_unlock(mbuf_mlock);
3160
3161 if (bmap != 0) {
3162 /* drain is performed in pfslowtimo(), to avoid deadlocks */
3163 do_reclaim = 1;
3164
3165 /* Sigh; we have no other choices but to ask mcache to purge */
3166 for (m = 0; m < NELEM(mbuf_table); m++) {
3167 if ((bmap & (1 << m)) &&
3168 mcache_purge_cache(m_cache(m))) {
3169 lck_mtx_lock(mbuf_mlock);
3170 m_purge_cnt(m)++;
3171 mbstat.m_drain++;
3172 lck_mtx_unlock(mbuf_mlock);
3173 }
3174 }
3175 } else {
3176 /*
3177 * Request mcache to reap extra elements from all of its caches;
3178 * note that all reaps are serialized and happen only at a fixed
3179 * interval.
3180 */
3181 mcache_reap();
3182 }
3183 lck_mtx_lock(mbuf_mlock);
3184 }
3185
3186 static inline struct mbuf *
3187 m_get_common(int wait, short type, int hdr)
3188 {
3189 struct mbuf *m;
3190 int mcflags = MSLEEPF(wait);
3191
3192 /* Is this due to a non-blocking retry? If so, then try harder */
3193 if (mcflags & MCR_NOSLEEP)
3194 mcflags |= MCR_TRYHARD;
3195
3196 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3197 if (m != NULL) {
3198 MBUF_INIT(m, hdr, type);
3199 mtype_stat_inc(type);
3200 mtype_stat_dec(MT_FREE);
3201 #if CONFIG_MACF_NET
3202 if (hdr && mac_init_mbuf(m, wait) != 0) {
3203 m_free(m);
3204 return (NULL);
3205 }
3206 #endif /* MAC_NET */
3207 }
3208 return (m);
3209 }
3210
3211 /*
3212 * Space allocation routines; these are also available as macros
3213 * for critical paths.
3214 */
3215 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3216 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3217 #define _M_RETRY(wait, type) _M_GET(wait, type)
3218 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3219 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3220 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
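
/*
 * Editorial usage sketch (illustrative only, not part of this file):
 *
 *	struct mbuf *m = m_gethdr(M_WAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...fill in the packet, then hand it off or m_freem(m)...
 *
 * M_WAIT callers may block in mbuf_sleep() until the worker thread
 * replenishes the pool; M_DONTWAIT callers simply get NULL on shortage.
 */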
3221
3222 struct mbuf *
3223 m_get(int wait, int type)
3224 {
3225 return (_M_GET(wait, type));
3226 }
3227
3228 struct mbuf *
3229 m_gethdr(int wait, int type)
3230 {
3231 return (_M_GETHDR(wait, type));
3232 }
3233
3234 struct mbuf *
3235 m_retry(int wait, int type)
3236 {
3237 return (_M_RETRY(wait, type));
3238 }
3239
3240 struct mbuf *
3241 m_retryhdr(int wait, int type)
3242 {
3243 return (_M_RETRYHDR(wait, type));
3244 }
3245
3246 struct mbuf *
3247 m_getclr(int wait, int type)
3248 {
3249 struct mbuf *m;
3250
3251 _MGET(m, wait, type);
3252 if (m != NULL)
3253 bzero(MTOD(m, caddr_t), MLEN);
3254 return (m);
3255 }
3256
3257 struct mbuf *
3258 m_free(struct mbuf *m)
3259 {
3260 struct mbuf *n = m->m_next;
3261
3262 if (m->m_type == MT_FREE)
3263 panic("m_free: freeing an already freed mbuf");
3264
3265 /* Free the aux data and tags if there are any */
3266 if (m->m_flags & M_PKTHDR) {
3267 m_tag_delete_chain(m, NULL);
3268 }
3269
3270 if (m->m_flags & M_EXT) {
3271 u_int32_t refcnt;
3272 u_int32_t composite;
3273
3274 refcnt = m_decref(m);
3275 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3276 if (refcnt == 0 && !composite) {
3277 if (m->m_ext.ext_free == NULL) {
3278 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3279 } else if (m->m_ext.ext_free == m_bigfree) {
3280 mcache_free(m_cache(MC_BIGCL),
3281 m->m_ext.ext_buf);
3282 } else if (m->m_ext.ext_free == m_16kfree) {
3283 mcache_free(m_cache(MC_16KCL),
3284 m->m_ext.ext_buf);
3285 } else {
3286 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3287 m->m_ext.ext_size, m->m_ext.ext_arg);
3288 }
3289 mcache_free(ref_cache, MEXT_RFA(m));
3290 MEXT_RFA(m) = NULL;
3291 } else if (refcnt == 0 && composite) {
3292 VERIFY(m->m_type != MT_FREE);
3293
3294 mtype_stat_dec(m->m_type);
3295 mtype_stat_inc(MT_FREE);
3296
3297 m->m_type = MT_FREE;
3298 m->m_flags = M_EXT;
3299 m->m_len = 0;
3300 m->m_next = m->m_nextpkt = NULL;
3301
3302 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3303
3304 /* "Free" into the intermediate cache */
3305 if (m->m_ext.ext_free == NULL) {
3306 mcache_free(m_cache(MC_MBUF_CL), m);
3307 } else if (m->m_ext.ext_free == m_bigfree) {
3308 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3309 } else {
3310 VERIFY(m->m_ext.ext_free == m_16kfree);
3311 mcache_free(m_cache(MC_MBUF_16KCL), m);
3312 }
3313 return (n);
3314 }
3315 }
3316
3317 if (m->m_type != MT_FREE) {
3318 mtype_stat_dec(m->m_type);
3319 mtype_stat_inc(MT_FREE);
3320 }
3321
3322 m->m_type = MT_FREE;
3323 m->m_flags = m->m_len = 0;
3324 m->m_next = m->m_nextpkt = NULL;
3325
3326 mcache_free(m_cache(MC_MBUF), m);
3327
3328 return (n);
3329 }
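
/*
 * Editorial usage sketch: m_free() releases a single mbuf and returns
 * its successor, so an entire chain can be drained with
 *
 *	while (m != NULL)
 *		m = m_free(m);
 *
 * which is essentially what m_freem() does.
 */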
3330
3331 __private_extern__ struct mbuf *
3332 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3333 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3334 int wait)
3335 {
3336 struct ext_ref *rfa = NULL;
3337
3338 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3339 return (NULL);
3340
3341 if (m->m_flags & M_EXT) {
3342 u_int32_t refcnt;
3343 u_int32_t composite;
3344
3345 refcnt = m_decref(m);
3346 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3347 if (refcnt == 0 && !composite) {
3348 if (m->m_ext.ext_free == NULL) {
3349 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3350 } else if (m->m_ext.ext_free == m_bigfree) {
3351 mcache_free(m_cache(MC_BIGCL),
3352 m->m_ext.ext_buf);
3353 } else if (m->m_ext.ext_free == m_16kfree) {
3354 mcache_free(m_cache(MC_16KCL),
3355 m->m_ext.ext_buf);
3356 } else {
3357 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3358 m->m_ext.ext_size, m->m_ext.ext_arg);
3359 }
3360 /* Re-use the reference structure */
3361 rfa = MEXT_RFA(m);
3362 } else if (refcnt == 0 && composite) {
3363 VERIFY(m->m_type != MT_FREE);
3364
3365 mtype_stat_dec(m->m_type);
3366 mtype_stat_inc(MT_FREE);
3367
3368 m->m_type = MT_FREE;
3369 m->m_flags = M_EXT;
3370 m->m_len = 0;
3371 m->m_next = m->m_nextpkt = NULL;
3372
3373 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3374
3375 /* "Free" into the intermediate cache */
3376 if (m->m_ext.ext_free == NULL) {
3377 mcache_free(m_cache(MC_MBUF_CL), m);
3378 } else if (m->m_ext.ext_free == m_bigfree) {
3379 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3380 } else {
3381 VERIFY(m->m_ext.ext_free == m_16kfree);
3382 mcache_free(m_cache(MC_MBUF_16KCL), m);
3383 }
3384 /*
3385 * Allocate a new mbuf, since we didn't divorce
3386 * the composite mbuf + cluster pair above.
3387 */
3388 if ((m = _M_GETHDR(wait, type)) == NULL)
3389 return (NULL);
3390 }
3391 }
3392
3393 if (rfa == NULL &&
3394 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3395 m_free(m);
3396 return (NULL);
3397 }
3398
3399 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3400
3401 return (m);
3402 }
3403
3404 /*
3405 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3406 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3407 */
3408 struct mbuf *
3409 m_getcl(int wait, int type, int flags)
3410 {
3411 struct mbuf *m;
3412 int mcflags = MSLEEPF(wait);
3413 int hdr = (flags & M_PKTHDR);
3414
3415 /* Is this due to a non-blocking retry? If so, then try harder */
3416 if (mcflags & MCR_NOSLEEP)
3417 mcflags |= MCR_TRYHARD;
3418
3419 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3420 if (m != NULL) {
3421 u_int32_t flag;
3422 struct ext_ref *rfa;
3423 void *cl;
3424
3425 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3426 cl = m->m_ext.ext_buf;
3427 rfa = MEXT_RFA(m);
3428
3429 ASSERT(cl != NULL && rfa != NULL);
3430 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3431
3432 flag = MEXT_FLAGS(m);
3433
3434 MBUF_INIT(m, hdr, type);
3435 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3436
3437 mtype_stat_inc(type);
3438 mtype_stat_dec(MT_FREE);
3439 #if CONFIG_MACF_NET
3440 if (hdr && mac_init_mbuf(m, wait) != 0) {
3441 m_freem(m);
3442 return (NULL);
3443 }
3444 #endif /* MAC_NET */
3445 }
3446 return (m);
3447 }
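
/*
 * Editorial usage sketch (illustrative only):
 *
 *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * Because the mbuf and its 2KB cluster come out of the MC_MBUF_CL
 * composite cache as a single object, this is cheaper on hot paths
 * than allocating the mbuf and the cluster separately.
 */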
3448
3449 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3450 struct mbuf *
3451 m_mclget(struct mbuf *m, int wait)
3452 {
3453 struct ext_ref *rfa;
3454
3455 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3456 return (m);
3457
3458 m->m_ext.ext_buf = m_mclalloc(wait);
3459 if (m->m_ext.ext_buf != NULL) {
3460 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3461 } else {
3462 mcache_free(ref_cache, rfa);
3463 }
3464 return (m);
3465 }
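
/*
 * Editorial note: m_mclget() returns the mbuf even when no cluster
 * could be attached, so callers check M_EXT to tell success from
 * failure, e.g.
 *
 *	m = m_mclget(m, M_DONTWAIT);
 *	if (!(m->m_flags & M_EXT)) {
 *		m_free(m);
 *		return (ENOBUFS);
 *	}
 */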
3466
3467 /* Allocate an mbuf cluster */
3468 caddr_t
3469 m_mclalloc(int wait)
3470 {
3471 int mcflags = MSLEEPF(wait);
3472
3473 /* Is this due to a non-blocking retry? If so, then try harder */
3474 if (mcflags & MCR_NOSLEEP)
3475 mcflags |= MCR_TRYHARD;
3476
3477 return (mcache_alloc(m_cache(MC_CL), mcflags));
3478 }
3479
3480 /* Free an mbuf cluster */
3481 void
3482 m_mclfree(caddr_t p)
3483 {
3484 mcache_free(m_cache(MC_CL), p);
3485 }
3486
3487 /*
3488 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
3489 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3490 */
3491 int
3492 m_mclhasreference(struct mbuf *m)
3493 {
3494 if (!(m->m_flags & M_EXT))
3495 return (0);
3496
3497 ASSERT(MEXT_RFA(m) != NULL);
3498
3499 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3500 }
3501
3502 __private_extern__ caddr_t
3503 m_bigalloc(int wait)
3504 {
3505 int mcflags = MSLEEPF(wait);
3506
3507 /* Is this due to a non-blocking retry? If so, then try harder */
3508 if (mcflags & MCR_NOSLEEP)
3509 mcflags |= MCR_TRYHARD;
3510
3511 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3512 }
3513
3514 __private_extern__ void
3515 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3516 {
3517 mcache_free(m_cache(MC_BIGCL), p);
3518 }
3519
3520 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3521 __private_extern__ struct mbuf *
3522 m_mbigget(struct mbuf *m, int wait)
3523 {
3524 struct ext_ref *rfa;
3525
3526 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3527 return (m);
3528
3529 m->m_ext.ext_buf = m_bigalloc(wait);
3530 if (m->m_ext.ext_buf != NULL) {
3531 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3532 } else {
3533 mcache_free(ref_cache, rfa);
3534 }
3535 return (m);
3536 }
3537
3538 __private_extern__ caddr_t
3539 m_16kalloc(int wait)
3540 {
3541 int mcflags = MSLEEPF(wait);
3542
3543 /* Is this due to a non-blocking retry? If so, then try harder */
3544 if (mcflags & MCR_NOSLEEP)
3545 mcflags |= MCR_TRYHARD;
3546
3547 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3548 }
3549
3550 __private_extern__ void
3551 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3552 {
3553 mcache_free(m_cache(MC_16KCL), p);
3554 }
3555
3556 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3557 __private_extern__ struct mbuf *
3558 m_m16kget(struct mbuf *m, int wait)
3559 {
3560 struct ext_ref *rfa;
3561
3562 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3563 return (m);
3564
3565 m->m_ext.ext_buf = m_16kalloc(wait);
3566 if (m->m_ext.ext_buf != NULL) {
3567 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3568 } else {
3569 mcache_free(ref_cache, rfa);
3570 }
3571 return (m);
3572 }
3573
3574 /*
3575 * "Move" mbuf pkthdr from "from" to "to".
3576 * "from" must have M_PKTHDR set, and "to" must be empty.
3577 */
3578 void
3579 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3580 {
3581 /* We will be taking over the tags of 'to' */
3582 if (to->m_flags & M_PKTHDR)
3583 m_tag_delete_chain(to, NULL);
3584 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3585 m_tag_init(from); /* purge tags from src */
3586 m_service_class_init(from); /* reset svc class from src */
3587 from->m_pkthdr.aux_flags = 0; /* clear aux flags from src */
3588 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3589 if ((to->m_flags & M_EXT) == 0)
3590 to->m_data = to->m_pktdat;
3591 }
3592
3593 /*
3594 * Duplicate "from"'s mbuf pkthdr in "to".
3595 * "from" must have M_PKTHDR set, and "to" must be empty.
3596 * In particular, this does a deep copy of the packet tags.
3597 */
3598 static int
3599 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3600 {
3601 if (to->m_flags & M_PKTHDR)
3602 m_tag_delete_chain(to, NULL);
3603 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3604 if ((to->m_flags & M_EXT) == 0)
3605 to->m_data = to->m_pktdat;
3606 to->m_pkthdr = from->m_pkthdr;
3607 m_tag_init(to);
3608 return (m_tag_copy_chain(to, from, how));
3609 }
3610
3611 void
3612 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3613 {
3614 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3615 to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3616 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3617 }
3618
3619 /*
3620 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3621  * if wantall is not set, return whatever number was available. Set up the
3622 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3623 * are chained on the m_nextpkt field. Any packets requested beyond this
3624 * are chained onto the last packet header's m_next field. The size of
3625 * the cluster is controlled by the parameter bufsize.
3626 */
3627 __private_extern__ struct mbuf *
3628 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3629 int wait, int wantall, size_t bufsize)
3630 {
3631 struct mbuf *m;
3632 struct mbuf **np, *top;
3633 unsigned int pnum, needed = *num_needed;
3634 mcache_obj_t *mp_list = NULL;
3635 int mcflags = MSLEEPF(wait);
3636 u_int32_t flag;
3637 struct ext_ref *rfa;
3638 mcache_t *cp;
3639 void *cl;
3640
3641 ASSERT(bufsize == m_maxsize(MC_CL) ||
3642 bufsize == m_maxsize(MC_BIGCL) ||
3643 bufsize == m_maxsize(MC_16KCL));
3644
3645 /*
3646 * Caller must first check for njcl because this
3647 * routine is internal and not exposed/used via KPI.
3648 */
3649 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3650
3651 top = NULL;
3652 np = &top;
3653 pnum = 0;
3654
3655 /*
3656 * The caller doesn't want all the requested buffers; only some.
3657 * Try hard to get what we can, but don't block. This effectively
3658 * overrides MCR_SLEEP, since this thread will not go to sleep
3659 * if we can't get all the buffers.
3660 */
3661 if (!wantall || (mcflags & MCR_NOSLEEP))
3662 mcflags |= MCR_TRYHARD;
3663
3664 /* Allocate the composite mbuf + cluster elements from the cache */
3665 if (bufsize == m_maxsize(MC_CL))
3666 cp = m_cache(MC_MBUF_CL);
3667 else if (bufsize == m_maxsize(MC_BIGCL))
3668 cp = m_cache(MC_MBUF_BIGCL);
3669 else
3670 cp = m_cache(MC_MBUF_16KCL);
3671 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3672
3673 for (pnum = 0; pnum < needed; pnum++) {
3674 m = (struct mbuf *)mp_list;
3675 mp_list = mp_list->obj_next;
3676
3677 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3678 cl = m->m_ext.ext_buf;
3679 rfa = MEXT_RFA(m);
3680
3681 ASSERT(cl != NULL && rfa != NULL);
3682 VERIFY(MBUF_IS_COMPOSITE(m));
3683
3684 flag = MEXT_FLAGS(m);
3685
3686 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3687 if (bufsize == m_maxsize(MC_16KCL)) {
3688 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3689 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3690 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3691 } else {
3692 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3693 }
3694
3695 if (num_with_pkthdrs > 0) {
3696 --num_with_pkthdrs;
3697 #if CONFIG_MACF_NET
3698 if (mac_mbuf_label_init(m, wait) != 0) {
3699 m_freem(m);
3700 break;
3701 }
3702 #endif /* MAC_NET */
3703 }
3704
3705 *np = m;
3706 if (num_with_pkthdrs > 0)
3707 np = &m->m_nextpkt;
3708 else
3709 np = &m->m_next;
3710 }
3711 ASSERT(pnum != *num_needed || mp_list == NULL);
3712 if (mp_list != NULL)
3713 mcache_free_ext(cp, mp_list);
3714
3715 if (pnum > 0) {
3716 mtype_stat_add(MT_DATA, pnum);
3717 mtype_stat_sub(MT_FREE, pnum);
3718 }
3719
3720 if (wantall && (pnum != *num_needed)) {
3721 if (top != NULL)
3722 m_freem_list(top);
3723 return (NULL);
3724 }
3725
3726 if (pnum > *num_needed) {
3727 printf("%s: File a radar related to <rdar://10146739>. \
3728 needed = %u, pnum = %u, num_needed = %u \n",
3729 __func__, needed, pnum, *num_needed);
3730 }
3731
3732 *num_needed = pnum;
3733 return (top);
3734 }
3735
3736 /*
3737  * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3738  * wantall is not set, return whatever number was available. The size of
3739  * each mbuf in the list is controlled by the parameter packetlen. Each
3740  * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3741  * in the chain is called a segment. If maxsegments is not null and the
3742  * value pointed to is not zero, this specifies the maximum number of
3743  * segments for a chain of mbufs. If maxsegments is null or the value
3744  * pointed to is zero, the caller does not have any restriction on the
3745  * number of segments. The actual number of segments of an mbuf chain
3746  * is returned in the value pointed to by maxsegments.
3747 */
3748 __private_extern__ struct mbuf *
3749 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3750 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3751 {
3752 struct mbuf **np, *top, *first = NULL;
3753 size_t bufsize, r_bufsize;
3754 unsigned int num = 0;
3755 unsigned int nsegs = 0;
3756 unsigned int needed, resid;
3757 int mcflags = MSLEEPF(wait);
3758 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3759 mcache_t *cp = NULL, *rcp = NULL;
3760
3761 if (*numlist == 0)
3762 return (NULL);
3763
3764 top = NULL;
3765 np = &top;
3766
3767 if (wantsize == 0) {
3768 if (packetlen <= MINCLSIZE) {
3769 bufsize = packetlen;
3770 } else if (packetlen > m_maxsize(MC_CL)) {
3771 /* Use 4KB if jumbo cluster pool isn't available */
3772 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3773 bufsize = m_maxsize(MC_BIGCL);
3774 else
3775 bufsize = m_maxsize(MC_16KCL);
3776 } else {
3777 bufsize = m_maxsize(MC_CL);
3778 }
3779 } else if (wantsize == m_maxsize(MC_CL) ||
3780 wantsize == m_maxsize(MC_BIGCL) ||
3781 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3782 bufsize = wantsize;
3783 } else {
3784 return (NULL);
3785 }
3786
3787 if (bufsize <= MHLEN) {
3788 nsegs = 1;
3789 } else if (bufsize <= MINCLSIZE) {
3790 if (maxsegments != NULL && *maxsegments == 1) {
3791 bufsize = m_maxsize(MC_CL);
3792 nsegs = 1;
3793 } else {
3794 nsegs = 2;
3795 }
3796 } else if (bufsize == m_maxsize(MC_16KCL)) {
3797 VERIFY(njcl > 0);
3798 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3799 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3800 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3801 } else {
3802 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3803 }
3804 if (maxsegments != NULL) {
3805 if (*maxsegments && nsegs > *maxsegments) {
3806 *maxsegments = nsegs;
3807 return (NULL);
3808 }
3809 *maxsegments = nsegs;
3810 }
3811
3812 /*
3813 * The caller doesn't want all the requested buffers; only some.
3814 * Try hard to get what we can, but don't block. This effectively
3815 * overrides MCR_SLEEP, since this thread will not go to sleep
3816 * if we can't get all the buffers.
3817 */
3818 if (!wantall || (mcflags & MCR_NOSLEEP))
3819 mcflags |= MCR_TRYHARD;
3820
3821 /*
3822 * Simple case where all elements in the lists/chains are mbufs.
3823 * Unless bufsize is greater than MHLEN, each segment chain is made
3824 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3825 * of 2 mbufs; the second one is used for the residual data, i.e.
3826 * the remaining data that cannot fit into the first mbuf.
3827 */
3828 if (bufsize <= MINCLSIZE) {
3829 /* Allocate the elements in one shot from the mbuf cache */
3830 ASSERT(bufsize <= MHLEN || nsegs == 2);
3831 cp = m_cache(MC_MBUF);
3832 needed = mcache_alloc_ext(cp, &mp_list,
3833 (*numlist) * nsegs, mcflags);
3834
3835 /*
3836 * The number of elements must be even if we are to use an
3837 * mbuf (instead of a cluster) to store the residual data.
3838 * If we couldn't allocate the requested number of mbufs,
3839 * trim the number down (if it's odd) in order to avoid
3840 * creating a partial segment chain.
3841 */
3842 if (bufsize > MHLEN && (needed & 0x1))
3843 needed--;
3844
3845 while (num < needed) {
3846 struct mbuf *m;
3847
3848 m = (struct mbuf *)mp_list;
3849 mp_list = mp_list->obj_next;
3850 ASSERT(m != NULL);
3851
3852 MBUF_INIT(m, 1, MT_DATA);
3853 #if CONFIG_MACF_NET
3854 if (mac_init_mbuf(m, wait) != 0) {
3855 m_free(m);
3856 break;
3857 }
3858 #endif /* MAC_NET */
3859 num++;
3860 if (bufsize > MHLEN) {
3861 /* A second mbuf for this segment chain */
3862 m->m_next = (struct mbuf *)mp_list;
3863 mp_list = mp_list->obj_next;
3864 ASSERT(m->m_next != NULL);
3865
3866 MBUF_INIT(m->m_next, 0, MT_DATA);
3867 num++;
3868 }
3869 *np = m;
3870 np = &m->m_nextpkt;
3871 }
3872 ASSERT(num != *numlist || mp_list == NULL);
3873
3874 if (num > 0) {
3875 mtype_stat_add(MT_DATA, num);
3876 mtype_stat_sub(MT_FREE, num);
3877 }
3878 num /= nsegs;
3879
3880 /* We've got them all; return to caller */
3881 if (num == *numlist)
3882 return (top);
3883
3884 goto fail;
3885 }
3886
3887 /*
3888 * Complex cases where elements are made up of one or more composite
3889 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3890 * be illustrated as follows:
3891 *
3892 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3893 *
3894 * Every composite mbuf + cluster element comes from the intermediate
3895 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3896 * the last composite element will come from the MC_MBUF_CL cache,
3897 * unless the residual data is larger than 2KB where we use the
3898 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3899 * data is defined as extra data beyond the first element that cannot
3900 * fit into the previous element, i.e. there is no residual data if
3901 * the chain only has 1 segment.
3902 */
3903 r_bufsize = bufsize;
3904 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3905 if (resid > 0) {
3906 /* There is residual data; figure out the cluster size */
3907 if (wantsize == 0 && packetlen > MINCLSIZE) {
3908 /*
3909 * Caller didn't request that all of the segments
3910 * in the chain use the same cluster size; use the
3911 * smaller of the cluster sizes.
3912 */
3913 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3914 r_bufsize = m_maxsize(MC_16KCL);
3915 else if (resid > m_maxsize(MC_CL))
3916 r_bufsize = m_maxsize(MC_BIGCL);
3917 else
3918 r_bufsize = m_maxsize(MC_CL);
3919 } else {
3920 /* Use the same cluster size as the other segments */
3921 resid = 0;
3922 }
3923 }
3924
3925 needed = *numlist;
3926 if (resid > 0) {
3927 /*
3928 * Attempt to allocate composite mbuf + cluster elements for
3929 * the residual data in each chain; record the number of such
3930 * elements that can be allocated so that we know how many
3931 * segment chains we can afford to create.
3932 */
3933 if (r_bufsize <= m_maxsize(MC_CL))
3934 rcp = m_cache(MC_MBUF_CL);
3935 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3936 rcp = m_cache(MC_MBUF_BIGCL);
3937 else
3938 rcp = m_cache(MC_MBUF_16KCL);
3939 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3940
3941 if (needed == 0)
3942 goto fail;
3943
3944 /* This is temporarily reduced for calculation */
3945 ASSERT(nsegs > 1);
3946 nsegs--;
3947 }
3948
3949 /*
3950 * Attempt to allocate the rest of the composite mbuf + cluster
3951 * elements for the number of segment chains that we need.
3952 */
3953 if (bufsize <= m_maxsize(MC_CL))
3954 cp = m_cache(MC_MBUF_CL);
3955 else if (bufsize <= m_maxsize(MC_BIGCL))
3956 cp = m_cache(MC_MBUF_BIGCL);
3957 else
3958 cp = m_cache(MC_MBUF_16KCL);
3959 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3960
3961 /* Round it down to avoid creating a partial segment chain */
3962 needed = (needed / nsegs) * nsegs;
3963 if (needed == 0)
3964 goto fail;
3965
3966 if (resid > 0) {
3967 /*
3968 * We're about to construct the chain(s); take into account
3969 * the number of segments we have created above to hold the
3970 * residual data for each chain, as well as restore the
3971 * original count of segments per chain.
3972 */
3973 ASSERT(nsegs > 0);
3974 needed += needed / nsegs;
3975 nsegs++;
3976 }
3977
3978 for (;;) {
3979 struct mbuf *m;
3980 u_int32_t flag;
3981 struct ext_ref *rfa;
3982 void *cl;
3983 int pkthdr;
3984
3985 ++num;
3986 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3987 m = (struct mbuf *)mp_list;
3988 mp_list = mp_list->obj_next;
3989 } else {
3990 m = (struct mbuf *)rmp_list;
3991 rmp_list = rmp_list->obj_next;
3992 }
3993 ASSERT(m != NULL);
3994 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3995 VERIFY(m->m_ext.ext_free == NULL ||
3996 m->m_ext.ext_free == m_bigfree ||
3997 m->m_ext.ext_free == m_16kfree);
3998
3999 cl = m->m_ext.ext_buf;
4000 rfa = MEXT_RFA(m);
4001
4002 ASSERT(cl != NULL && rfa != NULL);
4003 VERIFY(MBUF_IS_COMPOSITE(m));
4004
4005 flag = MEXT_FLAGS(m);
4006
4007 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4008 if (pkthdr)
4009 first = m;
4010 MBUF_INIT(m, pkthdr, MT_DATA);
4011 if (m->m_ext.ext_free == m_16kfree) {
4012 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4013 } else if (m->m_ext.ext_free == m_bigfree) {
4014 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4015 } else {
4016 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4017 }
4018 #if CONFIG_MACF_NET
4019 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4020 --num;
4021 m_freem(m);
4022 break;
4023 }
4024 #endif /* MAC_NET */
4025
4026 *np = m;
4027 if ((num % nsegs) == 0)
4028 np = &first->m_nextpkt;
4029 else
4030 np = &m->m_next;
4031
4032 if (num == needed)
4033 break;
4034 }
4035
4036 if (num > 0) {
4037 mtype_stat_add(MT_DATA, num);
4038 mtype_stat_sub(MT_FREE, num);
4039 }
4040
4041 num /= nsegs;
4042
4043 /* We've got them all; return to caller */
4044 if (num == *numlist) {
4045 ASSERT(mp_list == NULL && rmp_list == NULL);
4046 return (top);
4047 }
4048
4049 fail:
4050 /* Free up what's left of the above */
4051 if (mp_list != NULL)
4052 mcache_free_ext(cp, mp_list);
4053 if (rmp_list != NULL)
4054 mcache_free_ext(rcp, rmp_list);
4055 if (wantall && top != NULL) {
4056 m_freem(top);
4057 return (NULL);
4058 }
4059 *numlist = num;
4060 return (top);
4061 }
4062
4063 /*
4064  * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4065  * packets on the receive ring.
4066 */
4067 __private_extern__ struct mbuf *
4068 m_getpacket_how(int wait)
4069 {
4070 unsigned int num_needed = 1;
4071
4072 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4073 m_maxsize(MC_CL)));
4074 }
4075
4076 /*
4077  * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4078  * packets on the receive ring.
4079 */
4080 struct mbuf *
4081 m_getpacket(void)
4082 {
4083 unsigned int num_needed = 1;
4084
4085 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4086 m_maxsize(MC_CL)));
4087 }
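
/*
 * Editor's illustrative sketch (not part of the original source): a driver
 * replenishing a hypothetical receive ring with cluster-backed packets via
 * m_getpacket_how() above.  EXAMPLE_RX_RING_SIZE and example_rx_slots are
 * invented for the example.
 */
#define EXAMPLE_RX_RING_SIZE	64
static struct mbuf *example_rx_slots[EXAMPLE_RX_RING_SIZE];

static void
example_rx_ring_refill(void)
{
	int i;

	for (i = 0; i < EXAMPLE_RX_RING_SIZE; i++) {
		if (example_rx_slots[i] != NULL)
			continue;
		/* Non-blocking; empty slots can be retried on a later pass */
		example_rx_slots[i] = m_getpacket_how(M_DONTWAIT);
	}
}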
4088
4089 /*
4090 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4091 * if this can't be met, return whatever number were available. Set up the
4092 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4093 * are chained on the m_nextpkt field. Any packets requested beyond this are
4094 * chained onto the last packet header's m_next field.
4095 */
4096 struct mbuf *
4097 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4098 {
4099 unsigned int n = num_needed;
4100
4101 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4102 m_maxsize(MC_CL)));
4103 }
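
/*
 * Editor's illustrative sketch (not part of the original source): walking the
 * list returned by m_getpackets() above.  Packet headers are linked through
 * m_nextpkt, and the buffers within (or beyond) a packet through m_next, as
 * described in the comment preceding the function.
 */
static unsigned int
example_list_bytes(struct mbuf *list)
{
	struct mbuf *pkt, *seg;
	unsigned int total = 0;

	for (pkt = list; pkt != NULL; pkt = pkt->m_nextpkt) {
		for (seg = pkt; seg != NULL; seg = seg->m_next)
			total += seg->m_len;
	}
	return (total);
}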
4104
4105 /*
4106 * Return a list of mbuf hdrs set up as packet hdrs chained together
4107  * on the m_nextpkt field.
4108 */
4109 struct mbuf *
4110 m_getpackethdrs(int num_needed, int how)
4111 {
4112 struct mbuf *m;
4113 struct mbuf **np, *top;
4114
4115 top = NULL;
4116 np = &top;
4117
4118 while (num_needed--) {
4119 m = _M_RETRYHDR(how, MT_DATA);
4120 if (m == NULL)
4121 break;
4122
4123 *np = m;
4124 np = &m->m_nextpkt;
4125 }
4126
4127 return (top);
4128 }
4129
4130 /*
4131  * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4132  * of packets freed. Used by the drivers.
4133 */
4134 int
4135 m_freem_list(struct mbuf *m)
4136 {
4137 struct mbuf *nextpkt;
4138 mcache_obj_t *mp_list = NULL;
4139 mcache_obj_t *mcl_list = NULL;
4140 mcache_obj_t *mbc_list = NULL;
4141 mcache_obj_t *m16k_list = NULL;
4142 mcache_obj_t *m_mcl_list = NULL;
4143 mcache_obj_t *m_mbc_list = NULL;
4144 mcache_obj_t *m_m16k_list = NULL;
4145 mcache_obj_t *ref_list = NULL;
4146 int pktcount = 0;
4147 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4148
4149 while (m != NULL) {
4150 pktcount++;
4151
4152 nextpkt = m->m_nextpkt;
4153 m->m_nextpkt = NULL;
4154
4155 while (m != NULL) {
4156 struct mbuf *next = m->m_next;
4157 mcache_obj_t *o, *rfa;
4158 u_int32_t refcnt, composite;
4159
4160 if (m->m_type == MT_FREE)
4161 panic("m_free: freeing an already freed mbuf");
4162
4163 if (m->m_type != MT_FREE)
4164 mt_free++;
4165
4166 if (m->m_flags & M_PKTHDR) {
4167 m_tag_delete_chain(m, NULL);
4168 }
4169
4170 if (!(m->m_flags & M_EXT))
4171 goto simple_free;
4172
4173 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4174 refcnt = m_decref(m);
4175 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4176 if (refcnt == 0 && !composite) {
4177 if (m->m_ext.ext_free == NULL) {
4178 o->obj_next = mcl_list;
4179 mcl_list = o;
4180 } else if (m->m_ext.ext_free == m_bigfree) {
4181 o->obj_next = mbc_list;
4182 mbc_list = o;
4183 } else if (m->m_ext.ext_free == m_16kfree) {
4184 o->obj_next = m16k_list;
4185 m16k_list = o;
4186 } else {
4187 (*(m->m_ext.ext_free))((caddr_t)o,
4188 m->m_ext.ext_size,
4189 m->m_ext.ext_arg);
4190 }
4191 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4192 rfa->obj_next = ref_list;
4193 ref_list = rfa;
4194 MEXT_RFA(m) = NULL;
4195 } else if (refcnt == 0 && composite) {
4196 VERIFY(m->m_type != MT_FREE);
4197 /*
4198 * Amortize the costs of atomic operations
4199 * by doing them at the end, if possible.
4200 */
4201 if (m->m_type == MT_DATA)
4202 mt_data++;
4203 else if (m->m_type == MT_HEADER)
4204 mt_header++;
4205 else if (m->m_type == MT_SONAME)
4206 mt_soname++;
4207 else if (m->m_type == MT_TAG)
4208 mt_tag++;
4209 else
4210 mtype_stat_dec(m->m_type);
4211
4212 m->m_type = MT_FREE;
4213 m->m_flags = M_EXT;
4214 m->m_len = 0;
4215 m->m_next = m->m_nextpkt = NULL;
4216
4217 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4218
4219 /* "Free" into the intermediate cache */
4220 o = (mcache_obj_t *)m;
4221 if (m->m_ext.ext_free == NULL) {
4222 o->obj_next = m_mcl_list;
4223 m_mcl_list = o;
4224 } else if (m->m_ext.ext_free == m_bigfree) {
4225 o->obj_next = m_mbc_list;
4226 m_mbc_list = o;
4227 } else {
4228 VERIFY(m->m_ext.ext_free == m_16kfree);
4229 o->obj_next = m_m16k_list;
4230 m_m16k_list = o;
4231 }
4232 m = next;
4233 continue;
4234 }
4235 simple_free:
4236 /*
4237 * Amortize the costs of atomic operations
4238 * by doing them at the end, if possible.
4239 */
4240 if (m->m_type == MT_DATA)
4241 mt_data++;
4242 else if (m->m_type == MT_HEADER)
4243 mt_header++;
4244 else if (m->m_type == MT_SONAME)
4245 mt_soname++;
4246 else if (m->m_type == MT_TAG)
4247 mt_tag++;
4248 else if (m->m_type != MT_FREE)
4249 mtype_stat_dec(m->m_type);
4250
4251 m->m_type = MT_FREE;
4252 m->m_flags = m->m_len = 0;
4253 m->m_next = m->m_nextpkt = NULL;
4254
4255 ((mcache_obj_t *)m)->obj_next = mp_list;
4256 mp_list = (mcache_obj_t *)m;
4257
4258 m = next;
4259 }
4260
4261 m = nextpkt;
4262 }
4263
4264 if (mt_free > 0)
4265 mtype_stat_add(MT_FREE, mt_free);
4266 if (mt_data > 0)
4267 mtype_stat_sub(MT_DATA, mt_data);
4268 if (mt_header > 0)
4269 mtype_stat_sub(MT_HEADER, mt_header);
4270 if (mt_soname > 0)
4271 mtype_stat_sub(MT_SONAME, mt_soname);
4272 if (mt_tag > 0)
4273 mtype_stat_sub(MT_TAG, mt_tag);
4274
4275 if (mp_list != NULL)
4276 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4277 if (mcl_list != NULL)
4278 mcache_free_ext(m_cache(MC_CL), mcl_list);
4279 if (mbc_list != NULL)
4280 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4281 if (m16k_list != NULL)
4282 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4283 if (m_mcl_list != NULL)
4284 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4285 if (m_mbc_list != NULL)
4286 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4287 if (m_m16k_list != NULL)
4288 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4289 if (ref_list != NULL)
4290 mcache_free_ext(ref_cache, ref_list);
4291
4292 return (pktcount);
4293 }
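
/*
 * Editor's illustrative sketch (not part of the original source): a driver
 * completing a batch of transmitted packets can link them through m_nextpkt
 * and release them with a single m_freem_list() call, so the type/cache
 * updates above are amortized over the whole batch.  Names are hypothetical.
 */
static void
example_tx_complete(struct mbuf **done, int n)
{
	struct mbuf *head = NULL;
	int i, freed;

	/* Link the completed packets through m_nextpkt ... */
	for (i = 0; i < n; i++) {
		done[i]->m_nextpkt = head;
		head = done[i];
	}
	/* ... and free them all in one batched call */
	freed = m_freem_list(head);
	VERIFY(freed == n);
}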
4294
4295 void
4296 m_freem(struct mbuf *m)
4297 {
4298 while (m != NULL)
4299 m = m_free(m);
4300 }
4301
4302 /*
4303 * Mbuffer utility routines.
4304 */
4305
4306 /*
4307 * Compute the amount of space available before the current start
4308 * of data in an mbuf.
4309 */
4310 int
4311 m_leadingspace(struct mbuf *m)
4312 {
4313 if (m->m_flags & M_EXT) {
4314 if (MCLHASREFERENCE(m))
4315 return (0);
4316 return (m->m_data - m->m_ext.ext_buf);
4317 }
4318 if (m->m_flags & M_PKTHDR)
4319 return (m->m_data - m->m_pktdat);
4320 return (m->m_data - m->m_dat);
4321 }
4322
4323 /*
4324 * Compute the amount of space available after the end of data in an mbuf.
4325 */
4326 int
4327 m_trailingspace(struct mbuf *m)
4328 {
4329 if (m->m_flags & M_EXT) {
4330 if (MCLHASREFERENCE(m))
4331 return (0);
4332 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4333 (m->m_data + m->m_len));
4334 }
4335 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4336 }
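
/*
 * Editor's illustrative sketch (not part of the original source): appending
 * data in place only when m_trailingspace() says there is room; the routine
 * above returns 0 for a referenced cluster, so a writer never touches data
 * that another mbuf can still see.  The function name is hypothetical.
 */
static int
example_append_inplace(struct mbuf *m, caddr_t cp, int len)
{
	if (m_trailingspace(m) < len)
		return (0);

	bcopy(cp, mtod(m, caddr_t) + m->m_len, (unsigned)len);
	m->m_len += len;
	if (m->m_flags & M_PKTHDR)
		m->m_pkthdr.len += len;
	return (1);
}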
4337
4338 /*
4339 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4340 * copy junk along. Does not adjust packet header length.
4341 */
4342 struct mbuf *
4343 m_prepend(struct mbuf *m, int len, int how)
4344 {
4345 struct mbuf *mn;
4346
4347 _MGET(mn, how, m->m_type);
4348 if (mn == NULL) {
4349 m_freem(m);
4350 return (NULL);
4351 }
4352 if (m->m_flags & M_PKTHDR) {
4353 M_COPY_PKTHDR(mn, m);
4354 m->m_flags &= ~M_PKTHDR;
4355 }
4356 mn->m_next = m;
4357 m = mn;
4358 if (len < MHLEN)
4359 MH_ALIGN(m, len);
4360 m->m_len = len;
4361 return (m);
4362 }
4363
4364 /*
4365 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4366 * chain, copy junk along, and adjust length.
4367 */
4368 struct mbuf *
4369 m_prepend_2(struct mbuf *m, int len, int how)
4370 {
4371 if (M_LEADINGSPACE(m) >= len) {
4372 m->m_data -= len;
4373 m->m_len += len;
4374 } else {
4375 m = m_prepend(m, len, how);
4376 }
4377 if ((m) && (m->m_flags & M_PKTHDR))
4378 m->m_pkthdr.len += len;
4379 return (m);
4380 }
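
/*
 * Editor's illustrative sketch (not part of the original source): using
 * m_prepend_2() above to make room for a small encapsulation header.
 * EXAMPLE_HDR_LEN and the function name are invented for the example.
 */
#define EXAMPLE_HDR_LEN	8

static struct mbuf *
example_prepend_header(struct mbuf *m, int how)
{
	m = m_prepend_2(m, EXAMPLE_HDR_LEN, how);
	if (m == NULL) {
		/* m_prepend() has already freed the original chain */
		return (NULL);
	}
	/* New header bytes start at m->m_data; the caller fills them in */
	bzero(mtod(m, caddr_t), EXAMPLE_HDR_LEN);
	return (m);
}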
4381
4382 /*
4383 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4384 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4385 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4386 */
4387 int MCFail;
4388
4389 struct mbuf *
4390 m_copym(struct mbuf *m, int off0, int len, int wait)
4391 {
4392 struct mbuf *n, *mhdr = NULL, **np;
4393 int off = off0;
4394 struct mbuf *top;
4395 int copyhdr = 0;
4396
4397 if (off < 0 || len < 0)
4398 panic("m_copym: invalid offset %d or len %d", off, len);
4399
4400 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4401 mhdr = m;
4402 copyhdr = 1;
4403 }
4404
4405 while (off >= m->m_len) {
4406 if (m->m_next == NULL)
4407 panic("m_copym: invalid mbuf chain");
4408 off -= m->m_len;
4409 m = m->m_next;
4410 }
4411 np = &top;
4412 top = NULL;
4413
4414 while (len > 0) {
4415 if (m == NULL) {
4416 if (len != M_COPYALL)
4417 panic("m_copym: len != M_COPYALL");
4418 break;
4419 }
4420
4421 n = _M_RETRY(wait, m->m_type);
4422 *np = n;
4423
4424 if (n == NULL)
4425 goto nospace;
4426
4427 if (copyhdr != 0) {
4428 M_COPY_PKTHDR(n, mhdr);
4429 if (len == M_COPYALL)
4430 n->m_pkthdr.len -= off0;
4431 else
4432 n->m_pkthdr.len = len;
4433 copyhdr = 0;
4434 }
4435 if (len == M_COPYALL) {
4436 if (MIN(len, (m->m_len - off)) == len) {
4437 printf("m->m_len %d - off %d = %d, %d\n",
4438 m->m_len, off, m->m_len - off,
4439 MIN(len, (m->m_len - off)));
4440 }
4441 }
4442 n->m_len = MIN(len, (m->m_len - off));
4443 if (n->m_len == M_COPYALL) {
4444 printf("n->m_len == M_COPYALL, fixing\n");
4445 n->m_len = MHLEN;
4446 }
4447 if (m->m_flags & M_EXT) {
4448 n->m_ext = m->m_ext;
4449 m_incref(m);
4450 n->m_data = m->m_data + off;
4451 n->m_flags |= M_EXT;
4452 } else {
4453 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4454 (unsigned)n->m_len);
4455 }
4456 if (len != M_COPYALL)
4457 len -= n->m_len;
4458 off = 0;
4459 m = m->m_next;
4460 np = &n->m_next;
4461 }
4462
4463 if (top == NULL)
4464 MCFail++;
4465
4466 return (top);
4467 nospace:
4468
4469 m_freem(top);
4470 MCFail++;
4471 return (NULL);
4472 }
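
/*
 * Editor's illustrative sketch (not part of the original source): taking a
 * reference-counted copy of a whole packet with m_copym() and M_COPYALL.
 * As the code above shows, cluster-backed data is shared via m_incref()
 * rather than copied, so the copy should be treated as read-only.
 */
static struct mbuf *
example_snapshot_packet(struct mbuf *m, int wait)
{
	return (m_copym(m, 0, M_COPYALL, wait));
}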
4473
4474 /*
4475  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4476  * within this routine. Also, the last mbuf and offset accessed are passed
4477  * out and can be passed back in to avoid having to rescan the entire mbuf
4478  * list (normally hung off of the socket).
4479 */
4480 struct mbuf *
4481 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4482 struct mbuf **m_lastm, int *m_off)
4483 {
4484 struct mbuf *n, **np = NULL;
4485 int off = off0, len = len0;
4486 struct mbuf *top = NULL;
4487 int mcflags = MSLEEPF(wait);
4488 int copyhdr = 0;
4489 int type = 0;
4490 mcache_obj_t *list = NULL;
4491 int needed = 0;
4492
4493 if (off == 0 && (m->m_flags & M_PKTHDR))
4494 copyhdr = 1;
4495
4496 if (*m_lastm != NULL) {
4497 m = *m_lastm;
4498 off = *m_off;
4499 } else {
4500 while (off >= m->m_len) {
4501 off -= m->m_len;
4502 m = m->m_next;
4503 }
4504 }
4505
4506 n = m;
4507 while (len > 0) {
4508 needed++;
4509 ASSERT(n != NULL);
4510 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4511 n = n->m_next;
4512 }
4513 needed++;
4514 len = len0;
4515
4516 /*
4517 * If the caller doesn't want to be put to sleep, mark it with
4518 * MCR_TRYHARD so that we may reclaim buffers from other places
4519 * before giving up.
4520 */
4521 if (mcflags & MCR_NOSLEEP)
4522 mcflags |= MCR_TRYHARD;
4523
4524 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4525 mcflags) != needed)
4526 goto nospace;
4527
4528 needed = 0;
4529 while (len > 0) {
4530 n = (struct mbuf *)list;
4531 list = list->obj_next;
4532 ASSERT(n != NULL && m != NULL);
4533
4534 type = (top == NULL) ? MT_HEADER : m->m_type;
4535 MBUF_INIT(n, (top == NULL), type);
4536 #if CONFIG_MACF_NET
4537 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4538 mtype_stat_inc(MT_HEADER);
4539 mtype_stat_dec(MT_FREE);
4540 m_free(n);
4541 goto nospace;
4542 }
4543 #endif /* MAC_NET */
4544
4545 if (top == NULL) {
4546 top = n;
4547 np = &top->m_next;
4548 continue;
4549 } else {
4550 needed++;
4551 *np = n;
4552 }
4553
4554 if (copyhdr) {
4555 M_COPY_PKTHDR(n, m);
4556 n->m_pkthdr.len = len;
4557 copyhdr = 0;
4558 }
4559 n->m_len = MIN(len, (m->m_len - off));
4560
4561 if (m->m_flags & M_EXT) {
4562 n->m_ext = m->m_ext;
4563 m_incref(m);
4564 n->m_data = m->m_data + off;
4565 n->m_flags |= M_EXT;
4566 } else {
4567 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4568 (unsigned)n->m_len);
4569 }
4570 len -= n->m_len;
4571
4572 if (len == 0) {
4573 if ((off + n->m_len) == m->m_len) {
4574 *m_lastm = m->m_next;
4575 *m_off = 0;
4576 } else {
4577 *m_lastm = m;
4578 *m_off = off + n->m_len;
4579 }
4580 break;
4581 }
4582 off = 0;
4583 m = m->m_next;
4584 np = &n->m_next;
4585 }
4586
4587 mtype_stat_inc(MT_HEADER);
4588 mtype_stat_add(type, needed);
4589 mtype_stat_sub(MT_FREE, needed + 1);
4590
4591 ASSERT(list == NULL);
4592 return (top);
4593
4594 nospace:
4595 if (list != NULL)
4596 mcache_free_ext(m_cache(MC_MBUF), list);
4597 if (top != NULL)
4598 m_freem(top);
4599 MCFail++;
4600 return (NULL);
4601 }
4602
4603 /*
4604 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4605 * continuing for "len" bytes, into the indicated buffer.
4606 */
4607 void
4608 m_copydata(struct mbuf *m, int off, int len, void *vp)
4609 {
4610 unsigned count;
4611 char *cp = vp;
4612
4613 if (off < 0 || len < 0)
4614 panic("m_copydata: invalid offset %d or len %d", off, len);
4615
4616 while (off > 0) {
4617 if (m == NULL)
4618 panic("m_copydata: invalid mbuf chain");
4619 if (off < m->m_len)
4620 break;
4621 off -= m->m_len;
4622 m = m->m_next;
4623 }
4624 while (len > 0) {
4625 if (m == NULL)
4626 panic("m_copydata: invalid mbuf chain");
4627 count = MIN(m->m_len - off, len);
4628 bcopy(MTOD(m, caddr_t) + off, cp, count);
4629 len -= count;
4630 cp += count;
4631 off = 0;
4632 m = m->m_next;
4633 }
4634 }
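
/*
 * Editor's illustrative sketch (not part of the original source): using
 * m_copydata() above to linearize a fixed-size header into a caller buffer
 * without touching the chain.  EXAMPLE_HDR_BYTES and the function name are
 * invented for the example.
 */
#define EXAMPLE_HDR_BYTES	20

static int
example_peek_header(struct mbuf *m, char *buf)
{
	/* Only packet-header mbufs carry a total length we can check */
	if (!(m->m_flags & M_PKTHDR) || m->m_pkthdr.len < EXAMPLE_HDR_BYTES)
		return (0);

	/* Copies across mbuf boundaries; the chain itself is unchanged */
	m_copydata(m, 0, EXAMPLE_HDR_BYTES, buf);
	return (1);
}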
4635
4636 /*
4637 * Concatenate mbuf chain n to m. Both chains must be of the same type
4638 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4639 */
4640 void
4641 m_cat(struct mbuf *m, struct mbuf *n)
4642 {
4643 while (m->m_next)
4644 m = m->m_next;
4645 while (n) {
4646 if ((m->m_flags & M_EXT) ||
4647 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4648 /* just join the two chains */
4649 m->m_next = n;
4650 return;
4651 }
4652 /* splat the data from one into the other */
4653 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4654 (u_int)n->m_len);
4655 m->m_len += n->m_len;
4656 n = m_free(n);
4657 }
4658 }
4659
4660 void
4661 m_adj(struct mbuf *mp, int req_len)
4662 {
4663 int len = req_len;
4664 struct mbuf *m;
4665 int count;
4666
4667 if ((m = mp) == NULL)
4668 return;
4669 if (len >= 0) {
4670 /*
4671 * Trim from head.
4672 */
4673 while (m != NULL && len > 0) {
4674 if (m->m_len <= len) {
4675 len -= m->m_len;
4676 m->m_len = 0;
4677 m = m->m_next;
4678 } else {
4679 m->m_len -= len;
4680 m->m_data += len;
4681 len = 0;
4682 }
4683 }
4684 m = mp;
4685 if (m->m_flags & M_PKTHDR)
4686 m->m_pkthdr.len -= (req_len - len);
4687 } else {
4688 /*
4689 * Trim from tail. Scan the mbuf chain,
4690 * calculating its length and finding the last mbuf.
4691 * If the adjustment only affects this mbuf, then just
4692 * adjust and return. Otherwise, rescan and truncate
4693 * after the remaining size.
4694 */
4695 len = -len;
4696 count = 0;
4697 for (;;) {
4698 count += m->m_len;
4699 if (m->m_next == (struct mbuf *)0)
4700 break;
4701 m = m->m_next;
4702 }
4703 if (m->m_len >= len) {
4704 m->m_len -= len;
4705 m = mp;
4706 if (m->m_flags & M_PKTHDR)
4707 m->m_pkthdr.len -= len;
4708 return;
4709 }
4710 count -= len;
4711 if (count < 0)
4712 count = 0;
4713 /*
4714 * Correct length for chain is "count".
4715 * Find the mbuf with last data, adjust its length,
4716 * and toss data from remaining mbufs on chain.
4717 */
4718 m = mp;
4719 if (m->m_flags & M_PKTHDR)
4720 m->m_pkthdr.len = count;
4721 for (; m; m = m->m_next) {
4722 if (m->m_len >= count) {
4723 m->m_len = count;
4724 break;
4725 }
4726 count -= m->m_len;
4727 }
4728 while ((m = m->m_next))
4729 m->m_len = 0;
4730 }
4731 }
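
/*
 * Editor's illustrative sketch (not part of the original source): m_adj()
 * above trims from the head for a positive request and from the tail for a
 * negative one; a receive path might strip framing like this.  The lengths
 * and function name are hypothetical.
 */
static void
example_strip_framing(struct mbuf *m, int hdrlen, int trailerlen)
{
	m_adj(m, hdrlen);	/* drop hdrlen bytes from the front */
	m_adj(m, -trailerlen);	/* drop trailerlen bytes from the back */
}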
4732
4733 /*
4734  * Rearrange an mbuf chain so that len bytes are contiguous
4735 * and in the data area of an mbuf (so that mtod and dtom
4736 * will work for a structure of size len). Returns the resulting
4737 * mbuf chain on success, frees it and returns null on failure.
4738 * If there is room, it will add up to max_protohdr-len extra bytes to the
4739 * contiguous region in an attempt to avoid being called next time.
4740 */
4741 int MPFail;
4742
4743 struct mbuf *
4744 m_pullup(struct mbuf *n, int len)
4745 {
4746 struct mbuf *m;
4747 int count;
4748 int space;
4749
4750 /*
4751 * If first mbuf has no cluster, and has room for len bytes
4752 * without shifting current data, pullup into it,
4753 * otherwise allocate a new mbuf to prepend to the chain.
4754 */
4755 if ((n->m_flags & M_EXT) == 0 &&
4756 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4757 if (n->m_len >= len)
4758 return (n);
4759 m = n;
4760 n = n->m_next;
4761 len -= m->m_len;
4762 } else {
4763 if (len > MHLEN)
4764 goto bad;
4765 _MGET(m, M_DONTWAIT, n->m_type);
4766 if (m == 0)
4767 goto bad;
4768 m->m_len = 0;
4769 if (n->m_flags & M_PKTHDR) {
4770 M_COPY_PKTHDR(m, n);
4771 n->m_flags &= ~M_PKTHDR;
4772 }
4773 }
4774 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4775 do {
4776 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4777 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4778 (unsigned)count);
4779 len -= count;
4780 m->m_len += count;
4781 n->m_len -= count;
4782 space -= count;
4783 if (n->m_len)
4784 n->m_data += count;
4785 else
4786 n = m_free(n);
4787 } while (len > 0 && n);
4788 if (len > 0) {
4789 (void) m_free(m);
4790 goto bad;
4791 }
4792 m->m_next = n;
4793 return (m);
4794 bad:
4795 m_freem(n);
4796 MPFail++;
4797 return (0);
4798 }
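
/*
 * Editor's illustrative sketch (not part of the original source): the usual
 * m_pullup() pattern before casting m_data to a header structure.  Note that
 * on failure the chain has already been freed, so the caller must not touch
 * it again.  The function name is hypothetical.
 */
static struct mbuf *
example_pullup_header(struct mbuf *m, int hlen)
{
	if (m->m_len < hlen && (m = m_pullup(m, hlen)) == NULL)
		return (NULL);

	/* The first hlen bytes are now contiguous at mtod(m, caddr_t) */
	return (m);
}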
4799
4800 /*
4801 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4802 * the amount of empty space before the data in the new mbuf to be specified
4803 * (in the event that the caller expects to prepend later).
4804 */
4805 __private_extern__ int MSFail = 0;
4806
4807 __private_extern__ struct mbuf *
4808 m_copyup(struct mbuf *n, int len, int dstoff)
4809 {
4810 struct mbuf *m;
4811 int count, space;
4812
4813 if (len > (MHLEN - dstoff))
4814 goto bad;
4815 MGET(m, M_DONTWAIT, n->m_type);
4816 if (m == NULL)
4817 goto bad;
4818 m->m_len = 0;
4819 if (n->m_flags & M_PKTHDR) {
4820 m_copy_pkthdr(m, n);
4821 n->m_flags &= ~M_PKTHDR;
4822 }
4823 m->m_data += dstoff;
4824 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4825 do {
4826 count = min(min(max(len, max_protohdr), space), n->m_len);
4827 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4828 (unsigned)count);
4829 len -= count;
4830 m->m_len += count;
4831 n->m_len -= count;
4832 space -= count;
4833 if (n->m_len)
4834 n->m_data += count;
4835 else
4836 n = m_free(n);
4837 } while (len > 0 && n);
4838 if (len > 0) {
4839 (void) m_free(m);
4840 goto bad;
4841 }
4842 m->m_next = n;
4843 return (m);
4844 bad:
4845 m_freem(n);
4846 MSFail++;
4847 return (NULL);
4848 }
4849
4850 /*
4851 * Partition an mbuf chain in two pieces, returning the tail --
4852 * all but the first len0 bytes. In case of failure, it returns NULL and
4853 * attempts to restore the chain to its original state.
4854 */
4855 struct mbuf *
4856 m_split(struct mbuf *m0, int len0, int wait)
4857 {
4858 return (m_split0(m0, len0, wait, 1));
4859 }
4860
4861 static struct mbuf *
4862 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4863 {
4864 struct mbuf *m, *n;
4865 unsigned len = len0, remain;
4866
4867 for (m = m0; m && len > m->m_len; m = m->m_next)
4868 len -= m->m_len;
4869 if (m == NULL)
4870 return (NULL);
4871 remain = m->m_len - len;
4872 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4873 _MGETHDR(n, wait, m0->m_type);
4874 if (n == NULL)
4875 return (NULL);
4876 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4877 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4878 m0->m_pkthdr.len = len0;
4879 if (m->m_flags & M_EXT)
4880 goto extpacket;
4881 if (remain > MHLEN) {
4882 /* m can't be the lead packet */
4883 MH_ALIGN(n, 0);
4884 n->m_next = m_split(m, len, wait);
4885 if (n->m_next == NULL) {
4886 (void) m_free(n);
4887 return (NULL);
4888 } else
4889 return (n);
4890 } else
4891 MH_ALIGN(n, remain);
4892 } else if (remain == 0) {
4893 n = m->m_next;
4894 m->m_next = NULL;
4895 return (n);
4896 } else {
4897 _MGET(n, wait, m->m_type);
4898 if (n == NULL)
4899 return (NULL);
4900 M_ALIGN(n, remain);
4901 }
4902 extpacket:
4903 if (m->m_flags & M_EXT) {
4904 n->m_flags |= M_EXT;
4905 n->m_ext = m->m_ext;
4906 m_incref(m);
4907 n->m_data = m->m_data + len;
4908 } else {
4909 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4910 }
4911 n->m_len = remain;
4912 m->m_len = len;
4913 n->m_next = m->m_next;
4914 m->m_next = NULL;
4915 return (n);
4916 }
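
/*
 * Editor's illustrative sketch (not part of the original source): m_split()
 * hands back the tail of a chain beyond the first len0 bytes and leaves the
 * head in place, e.g. to separate a header from its payload.  On failure it
 * returns NULL and attempts to restore the original chain.
 */
static struct mbuf *
example_split_payload(struct mbuf *pkt, int hdrlen, int wait)
{
	struct mbuf *payload;

	payload = m_split(pkt, hdrlen, wait);	/* pkt keeps first hdrlen bytes */
	return (payload);
}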
4917
4918 /*
4919 * Routine to copy from device local memory into mbufs.
4920 */
4921 struct mbuf *
4922 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4923 void (*copy)(const void *, void *, size_t))
4924 {
4925 struct mbuf *m;
4926 struct mbuf *top = NULL, **mp = &top;
4927 int off = off0, len;
4928 char *cp;
4929 char *epkt;
4930
4931 cp = buf;
4932 epkt = cp + totlen;
4933 if (off) {
4934 /*
4935 * If 'off' is non-zero, packet is trailer-encapsulated,
4936 * so we have to skip the type and length fields.
4937 */
4938 cp += off + 2 * sizeof (u_int16_t);
4939 totlen -= 2 * sizeof (u_int16_t);
4940 }
4941 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4942 if (m == NULL)
4943 return (NULL);
4944 m->m_pkthdr.rcvif = ifp;
4945 m->m_pkthdr.len = totlen;
4946 m->m_len = MHLEN;
4947
4948 while (totlen > 0) {
4949 if (top != NULL) {
4950 _MGET(m, M_DONTWAIT, MT_DATA);
4951 if (m == NULL) {
4952 m_freem(top);
4953 return (NULL);
4954 }
4955 m->m_len = MLEN;
4956 }
4957 len = MIN(totlen, epkt - cp);
4958 if (len >= MINCLSIZE) {
4959 MCLGET(m, M_DONTWAIT);
4960 if (m->m_flags & M_EXT) {
4961 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4962 } else {
4963 /* give up when it's out of cluster mbufs */
4964 if (top != NULL)
4965 m_freem(top);
4966 m_freem(m);
4967 return (NULL);
4968 }
4969 } else {
4970 /*
4971 * Place initial small packet/header at end of mbuf.
4972 */
4973 if (len < m->m_len) {
4974 if (top == NULL &&
4975 len + max_linkhdr <= m->m_len)
4976 m->m_data += max_linkhdr;
4977 m->m_len = len;
4978 } else {
4979 len = m->m_len;
4980 }
4981 }
4982 if (copy)
4983 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4984 else
4985 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4986 cp += len;
4987 *mp = m;
4988 mp = &m->m_next;
4989 totlen -= len;
4990 if (cp == epkt)
4991 cp = buf;
4992 }
4993 return (top);
4994 }
4995
4996 #ifndef MBUF_GROWTH_NORMAL_THRESH
4997 #define MBUF_GROWTH_NORMAL_THRESH 25
4998 #endif
4999
5000 /*
5001 * Cluster freelist allocation check.
5002 */
5003 static int
5004 m_howmany(int num, size_t bufsize)
5005 {
5006 int i = 0, j = 0;
5007 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5008 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5009 u_int32_t sumclusters, freeclusters;
5010 u_int32_t percent_pool, percent_kmem;
5011 u_int32_t mb_growth, mb_growth_thresh;
5012
5013 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5014 bufsize == m_maxsize(MC_16KCL));
5015
5016 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5017
5018 /* Numbers in 2K cluster units */
5019 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5020 m_clusters = m_total(MC_CL);
5021 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5022 m_16kclusters = m_total(MC_16KCL);
5023 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5024
5025 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5026 m_clfree = m_infree(MC_CL);
5027 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5028 m_16kclfree = m_infree(MC_16KCL);
5029 freeclusters = m_mbfree + m_clfree + m_bigclfree;
5030
5031 /* Bail if we've maxed out the mbuf memory map */
5032 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5033 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5034 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5035 return (0);
5036 }
5037
5038 if (bufsize == m_maxsize(MC_BIGCL)) {
5039 /* Under minimum */
5040 if (m_bigclusters < m_minlimit(MC_BIGCL))
5041 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5042
5043 percent_pool =
5044 ((sumclusters - freeclusters) * 100) / sumclusters;
5045 percent_kmem = (sumclusters * 100) / nclusters;
5046
5047 /*
5048  * If a light/normal user, grow conservatively (75%);
5049  * if a heavy user, grow aggressively (50%).
5050 */
5051 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5052 mb_growth = MB_GROWTH_NORMAL;
5053 else
5054 mb_growth = MB_GROWTH_AGGRESSIVE;
5055
5056 if (percent_kmem < 5) {
5057 /* For initial allocations */
5058 i = num;
5059 } else {
5060 /* Return if >= MBIGCL_LOWAT clusters available */
5061 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5062 m_total(MC_BIGCL) >=
5063 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5064 return (0);
5065
5066 /* Ensure at least num clusters are accessible */
5067 if (num >= m_infree(MC_BIGCL))
5068 i = num - m_infree(MC_BIGCL);
5069 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5070 j = num - (m_total(MC_BIGCL) -
5071 m_minlimit(MC_BIGCL));
5072
5073 i = MAX(i, j);
5074
5075 /*
5076 * Grow pool if percent_pool > 75 (normal growth)
5077 * or percent_pool > 50 (aggressive growth).
5078 */
5079 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5080 if (percent_pool > mb_growth_thresh)
5081 j = ((sumclusters + num) >> mb_growth) -
5082 freeclusters;
5083 i = MAX(i, j);
5084 }
5085
5086 /* Check to ensure we didn't go over limits */
5087 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5088 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5089 if ((i << 1) + sumclusters >= nclusters)
5090 i = (nclusters - sumclusters) >> 1;
5091 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5092 VERIFY(sumclusters + (i << 1) <= nclusters);
5093
5094 } else { /* 16K CL */
5095 VERIFY(njcl > 0);
5096 /* Under minimum */
5097 if (m_16kclusters < MIN16KCL)
5098 return (MIN16KCL - m_16kclusters);
5099 if (m_16kclfree >= M16KCL_LOWAT)
5100 return (0);
5101
5102 /* Ensure at least num clusters are available */
5103 if (num >= m_16kclfree)
5104 i = num - m_16kclfree;
5105
5106 /* Always grow 16KCL pool aggressively */
5107 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5108 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5109 i = MAX(i, j);
5110
5111 /* Check to ensure we don't go over limit */
5112 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5113 i = m_maxlimit(MC_16KCL) - m_16kclusters;
5114 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5115 }
5116 return (i);
5117 }
5118 /*
5119 * Return the number of bytes in the mbuf chain, m.
5120 */
5121 unsigned int
5122 m_length(struct mbuf *m)
5123 {
5124 struct mbuf *m0;
5125 unsigned int pktlen;
5126
5127 if (m->m_flags & M_PKTHDR)
5128 return (m->m_pkthdr.len);
5129
5130 pktlen = 0;
5131 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5132 pktlen += m0->m_len;
5133 return (pktlen);
5134 }
5135
5136 /*
5137 * Copy data from a buffer back into the indicated mbuf chain,
5138 * starting "off" bytes from the beginning, extending the mbuf
5139 * chain if necessary.
5140 */
5141 void
5142 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5143 {
5144 #if DEBUG
5145 struct mbuf *origm = m0;
5146 int error;
5147 #endif /* DEBUG */
5148
5149 if (m0 == NULL)
5150 return;
5151
5152 #if DEBUG
5153 error =
5154 #endif /* DEBUG */
5155 m_copyback0(&m0, off, len, cp,
5156 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5157
5158 #if DEBUG
5159 if (error != 0 || (m0 != NULL && origm != m0))
5160 panic("m_copyback");
5161 #endif /* DEBUG */
5162 }
5163
5164 struct mbuf *
5165 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5166 {
5167 int error;
5168
5169 /* don't support chain expansion */
5170 VERIFY(off + len <= m_length(m0));
5171
5172 error = m_copyback0(&m0, off, len, cp,
5173 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5174 if (error) {
5175 /*
5176 * no way to recover from partial success.
5177 * just free the chain.
5178 */
5179 m_freem(m0);
5180 return (NULL);
5181 }
5182 return (m0);
5183 }
5184
5185 /*
5186  * m_makewritable: ensure the specified range is writable.
5187 */
5188 int
5189 m_makewritable(struct mbuf **mp, int off, int len, int how)
5190 {
5191 int error;
5192 #if DEBUG
5193 struct mbuf *n;
5194 int origlen, reslen;
5195
5196 origlen = m_length(*mp);
5197 #endif /* DEBUG */
5198
5199 #if 0 /* M_COPYALL is large enough */
5200 if (len == M_COPYALL)
5201 len = m_length(*mp) - off; /* XXX */
5202 #endif
5203
5204 error = m_copyback0(mp, off, len, NULL,
5205 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5206
5207 #if DEBUG
5208 reslen = 0;
5209 for (n = *mp; n; n = n->m_next)
5210 reslen += n->m_len;
5211 if (origlen != reslen)
5212 panic("m_makewritable: length changed");
5213 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5214 panic("m_makewritable: inconsist");
5215 #endif /* DEBUG */
5216
5217 return (error);
5218 }
5219
5220 static int
5221 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5222 int how)
5223 {
5224 int mlen;
5225 struct mbuf *m, *n;
5226 struct mbuf **mp;
5227 int totlen = 0;
5228 const char *cp = vp;
5229
5230 VERIFY(mp0 != NULL);
5231 VERIFY(*mp0 != NULL);
5232 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5233 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5234
5235 /*
5236 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5237 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5238 */
5239
5240 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5241
5242 mp = mp0;
5243 m = *mp;
5244 while (off > (mlen = m->m_len)) {
5245 off -= mlen;
5246 totlen += mlen;
5247 if (m->m_next == NULL) {
5248 int tspace;
5249 extend:
5250 if (!(flags & M_COPYBACK0_EXTEND))
5251 goto out;
5252
5253 /*
5254 * try to make some space at the end of "m".
5255 */
5256
5257 mlen = m->m_len;
5258 if (off + len >= MINCLSIZE &&
5259 !(m->m_flags & M_EXT) && m->m_len == 0) {
5260 MCLGET(m, how);
5261 }
5262 tspace = M_TRAILINGSPACE(m);
5263 if (tspace > 0) {
5264 tspace = MIN(tspace, off + len);
5265 VERIFY(tspace > 0);
5266 bzero(mtod(m, char *) + m->m_len,
5267 MIN(off, tspace));
5268 m->m_len += tspace;
5269 off += mlen;
5270 totlen -= mlen;
5271 continue;
5272 }
5273
5274 /*
5275 * need to allocate an mbuf.
5276 */
5277
5278 if (off + len >= MINCLSIZE) {
5279 n = m_getcl(how, m->m_type, 0);
5280 } else {
5281 n = _M_GET(how, m->m_type);
5282 }
5283 if (n == NULL) {
5284 goto out;
5285 }
5286 n->m_len = 0;
5287 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5288 bzero(mtod(n, char *), MIN(n->m_len, off));
5289 m->m_next = n;
5290 }
5291 mp = &m->m_next;
5292 m = m->m_next;
5293 }
5294 while (len > 0) {
5295 mlen = m->m_len - off;
5296 if (mlen != 0 && m_mclhasreference(m)) {
5297 char *datap;
5298 int eatlen;
5299
5300 /*
5301 * this mbuf is read-only.
5302 * allocate a new writable mbuf and try again.
5303 */
5304
5305 #if defined(DIAGNOSTIC)
5306 if (!(flags & M_COPYBACK0_COW))
5307 panic("m_copyback0: read-only");
5308 #endif /* defined(DIAGNOSTIC) */
5309
5310 /*
5311 * if we're going to write into the middle of
5312 * a mbuf, split it first.
5313 */
5314 if (off > 0 && len < mlen) {
5315 n = m_split0(m, off, how, 0);
5316 if (n == NULL)
5317 goto enobufs;
5318 m->m_next = n;
5319 mp = &m->m_next;
5320 m = n;
5321 off = 0;
5322 continue;
5323 }
5324
5325 /*
5326 * XXX TODO coalesce into the trailingspace of
5327 * the previous mbuf when possible.
5328 */
5329
5330 /*
5331 * allocate a new mbuf. copy packet header if needed.
5332 */
5333 n = _M_GET(how, m->m_type);
5334 if (n == NULL)
5335 goto enobufs;
5336 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5337 M_COPY_PKTHDR(n, m);
5338 n->m_len = MHLEN;
5339 } else {
5340 if (len >= MINCLSIZE)
5341 MCLGET(n, M_DONTWAIT);
5342 n->m_len =
5343 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5344 }
5345 if (n->m_len > len)
5346 n->m_len = len;
5347
5348 /*
5349  * free the region which has been overwritten,
5350  * copying data from old mbufs if requested.
5351 */
5352 if (flags & M_COPYBACK0_PRESERVE)
5353 datap = mtod(n, char *);
5354 else
5355 datap = NULL;
5356 eatlen = n->m_len;
5357 VERIFY(off == 0 || eatlen >= mlen);
5358 if (off > 0) {
5359 VERIFY(len >= mlen);
5360 m->m_len = off;
5361 m->m_next = n;
5362 if (datap) {
5363 m_copydata(m, off, mlen, datap);
5364 datap += mlen;
5365 }
5366 eatlen -= mlen;
5367 mp = &m->m_next;
5368 m = m->m_next;
5369 }
5370 while (m != NULL && m_mclhasreference(m) &&
5371 n->m_type == m->m_type && eatlen > 0) {
5372 mlen = MIN(eatlen, m->m_len);
5373 if (datap) {
5374 m_copydata(m, 0, mlen, datap);
5375 datap += mlen;
5376 }
5377 m->m_data += mlen;
5378 m->m_len -= mlen;
5379 eatlen -= mlen;
5380 if (m->m_len == 0)
5381 *mp = m = m_free(m);
5382 }
5383 if (eatlen > 0)
5384 n->m_len -= eatlen;
5385 n->m_next = m;
5386 *mp = m = n;
5387 continue;
5388 }
5389 mlen = MIN(mlen, len);
5390 if (flags & M_COPYBACK0_COPYBACK) {
5391 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5392 cp += mlen;
5393 }
5394 len -= mlen;
5395 mlen += off;
5396 off = 0;
5397 totlen += mlen;
5398 if (len == 0)
5399 break;
5400 if (m->m_next == NULL) {
5401 goto extend;
5402 }
5403 mp = &m->m_next;
5404 m = m->m_next;
5405 }
5406 out:
5407 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5408 VERIFY(flags & M_COPYBACK0_EXTEND);
5409 m->m_pkthdr.len = totlen;
5410 }
5411
5412 return (0);
5413
5414 enobufs:
5415 return (ENOBUFS);
5416 }
5417
5418 char *
5419 mcl_to_paddr(char *addr)
5420 {
5421 vm_offset_t base_phys;
5422
5423 if (!MBUF_IN_MAP(addr))
5424 return (NULL);
5425 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
5426
5427 if (base_phys == 0)
5428 return (NULL);
5429 return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
5430 }
5431
5432 /*
5433 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5434 * And really copy the thing. That way, we don't "precompute" checksums
5435 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5436 * small packets, don't dup into a cluster. That way received packets
5437 * don't take up too much room in the sockbuf (cf. sbspace()).
5438 */
5439 int MDFail;
5440
5441 struct mbuf *
5442 m_dup(struct mbuf *m, int how)
5443 {
5444 struct mbuf *n, **np;
5445 struct mbuf *top;
5446 int copyhdr = 0;
5447
5448 np = &top;
5449 top = NULL;
5450 if (m->m_flags & M_PKTHDR)
5451 copyhdr = 1;
5452
5453 /*
5454 * Quick check: if we have one mbuf and its data fits in an
5455 * mbuf with packet header, just copy and go.
5456 */
5457 if (m->m_next == NULL) {
5458 /* Then just move the data into an mbuf and be done... */
5459 if (copyhdr) {
5460 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5461 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5462 return (NULL);
5463 n->m_len = m->m_len;
5464 m_dup_pkthdr(n, m, how);
5465 bcopy(m->m_data, n->m_data, m->m_len);
5466 return (n);
5467 }
5468 } else if (m->m_len <= MLEN) {
5469 if ((n = _M_GET(how, m->m_type)) == NULL)
5470 return (NULL);
5471 bcopy(m->m_data, n->m_data, m->m_len);
5472 n->m_len = m->m_len;
5473 return (n);
5474 }
5475 }
5476 while (m != NULL) {
5477 #if BLUE_DEBUG
5478 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5479 m->m_data);
5480 #endif
5481 if (copyhdr)
5482 n = _M_GETHDR(how, m->m_type);
5483 else
5484 n = _M_GET(how, m->m_type);
5485 if (n == NULL)
5486 goto nospace;
5487 if (m->m_flags & M_EXT) {
5488 if (m->m_len <= m_maxsize(MC_CL))
5489 MCLGET(n, how);
5490 else if (m->m_len <= m_maxsize(MC_BIGCL))
5491 n = m_mbigget(n, how);
5492 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5493 n = m_m16kget(n, how);
5494 if (!(n->m_flags & M_EXT)) {
5495 (void) m_free(n);
5496 goto nospace;
5497 }
5498 }
5499 *np = n;
5500 if (copyhdr) {
5501 /* Don't use M_COPY_PKTHDR: preserve m_data */
5502 m_dup_pkthdr(n, m, how);
5503 copyhdr = 0;
5504 if (!(n->m_flags & M_EXT))
5505 n->m_data = n->m_pktdat;
5506 }
5507 n->m_len = m->m_len;
5508 /*
5509  * Get the dup on the same boundary as the original.
5510  * Assume that the two mbufs have the same offset to the data area
5511  * (up to word boundaries).
5512 */
5513 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5514 m = m->m_next;
5515 np = &n->m_next;
5516 #if BLUE_DEBUG
5517 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5518 n->m_data);
5519 #endif
5520 }
5521
5522 if (top == NULL)
5523 MDFail++;
5524 return (top);
5525
5526 nospace:
5527 m_freem(top);
5528 MDFail++;
5529 return (NULL);
5530 }
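
/*
 * Editor's illustrative sketch (not part of the original source): unlike
 * m_copym(), m_dup() above makes a genuinely private copy of the data, which
 * is what a caller wants before modifying a packet whose clusters may still
 * be shared.  The function name is hypothetical.
 */
static struct mbuf *
example_private_copy(struct mbuf *m, int how)
{
	struct mbuf *dup;

	dup = m_dup(m, how);
	if (dup != NULL)
		m_freem(m);	/* replace the shared original with the copy */
	return (dup);
}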
5531
5532 #define MBUF_MULTIPAGES(m) \
5533 (((m)->m_flags & M_EXT) && \
5534 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5535 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5536 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5537
5538 static struct mbuf *
5539 m_expand(struct mbuf *m, struct mbuf **last)
5540 {
5541 struct mbuf *top = NULL;
5542 struct mbuf **nm = &top;
5543 uintptr_t data0, data;
5544 unsigned int len0, len;
5545
5546 VERIFY(MBUF_MULTIPAGES(m));
5547 VERIFY(m->m_next == NULL);
5548 data0 = (uintptr_t)m->m_data;
5549 len0 = m->m_len;
5550 *last = top;
5551
5552 for (;;) {
5553 struct mbuf *n;
5554
5555 data = data0;
5556 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5557 len = NBPG;
5558 else if (!IS_P2ALIGNED(data, NBPG) &&
5559 P2ROUNDUP(data, NBPG) < (data + len0))
5560 len = P2ROUNDUP(data, NBPG) - data;
5561 else
5562 len = len0;
5563
5564 VERIFY(len > 0);
5565 VERIFY(m->m_flags & M_EXT);
5566 m->m_data = (void *)data;
5567 m->m_len = len;
5568
5569 *nm = *last = m;
5570 nm = &m->m_next;
5571 m->m_next = NULL;
5572
5573 data0 += len;
5574 len0 -= len;
5575 if (len0 == 0)
5576 break;
5577
5578 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5579 if (n == NULL) {
5580 m_freem(top);
5581 top = *last = NULL;
5582 break;
5583 }
5584
5585 n->m_ext = m->m_ext;
5586 m_incref(m);
5587 n->m_flags |= M_EXT;
5588 m = n;
5589 }
5590 return (top);
5591 }
5592
5593 struct mbuf *
5594 m_normalize(struct mbuf *m)
5595 {
5596 struct mbuf *top = NULL;
5597 struct mbuf **nm = &top;
5598 boolean_t expanded = FALSE;
5599
5600 while (m != NULL) {
5601 struct mbuf *n;
5602
5603 n = m->m_next;
5604 m->m_next = NULL;
5605
5606 /* Does the data cross one or more page boundaries? */
5607 if (MBUF_MULTIPAGES(m)) {
5608 struct mbuf *last;
5609 if ((m = m_expand(m, &last)) == NULL) {
5610 m_freem(n);
5611 m_freem(top);
5612 top = NULL;
5613 break;
5614 }
5615 *nm = m;
5616 nm = &last->m_next;
5617 expanded = TRUE;
5618 } else {
5619 *nm = m;
5620 nm = &m->m_next;
5621 }
5622 m = n;
5623 }
5624 if (expanded)
5625 atomic_add_32(&mb_normalized, 1);
5626 return (top);
5627 }
5628
5629 /*
5630  * Append the specified data to the indicated mbuf chain,
5631  * extending the mbuf chain if the new data does not fit in
5632 * existing space.
5633 *
5634 * Return 1 if able to complete the job; otherwise 0.
5635 */
5636 int
5637 m_append(struct mbuf *m0, int len, caddr_t cp)
5638 {
5639 struct mbuf *m, *n;
5640 int remainder, space;
5641
5642 for (m = m0; m->m_next != NULL; m = m->m_next)
5643 ;
5644 remainder = len;
5645 space = M_TRAILINGSPACE(m);
5646 if (space > 0) {
5647 /*
5648 * Copy into available space.
5649 */
5650 if (space > remainder)
5651 space = remainder;
5652 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5653 m->m_len += space;
5654 cp += space, remainder -= space;
5655 }
5656 while (remainder > 0) {
5657 /*
5658 * Allocate a new mbuf; could check space
5659 * and allocate a cluster instead.
5660 */
5661 n = m_get(M_WAITOK, m->m_type);
5662 if (n == NULL)
5663 break;
5664 n->m_len = min(MLEN, remainder);
5665 bcopy(cp, mtod(n, caddr_t), n->m_len);
5666 cp += n->m_len;
5667 remainder -= n->m_len;
5668 m->m_next = n;
5669 m = n;
5670 }
5671 if (m0->m_flags & M_PKTHDR)
5672 m0->m_pkthdr.len += len - remainder;
5673 return (remainder == 0);
5674 }
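
/*
 * Usage sketch (illustrative only; "trailer" is a hypothetical local
 * buffer):
 *
 *	if (m_append(m0, sizeof (trailer), (caddr_t)&trailer) == 0) {
 *		m_freem(m0);
 *		return (ENOBUFS);
 *	}
 *
 * On failure the chain may hold a partial copy of the data, but
 * m_pkthdr.len is kept consistent with what was actually appended.
 */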
5675
5676 struct mbuf *
5677 m_last(struct mbuf *m)
5678 {
5679 while (m->m_next != NULL)
5680 m = m->m_next;
5681 return (m);
5682 }
5683
5684 unsigned int
5685 m_fixhdr(struct mbuf *m0)
5686 {
5687 u_int len;
5688
5689 len = m_length2(m0, NULL);
5690 m0->m_pkthdr.len = len;
5691 return (len);
5692 }
5693
5694 unsigned int
5695 m_length2(struct mbuf *m0, struct mbuf **last)
5696 {
5697 struct mbuf *m;
5698 u_int len;
5699
5700 len = 0;
5701 for (m = m0; m != NULL; m = m->m_next) {
5702 len += m->m_len;
5703 if (m->m_next == NULL)
5704 break;
5705 }
5706 if (last != NULL)
5707 *last = m;
5708 return (len);
5709 }
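
/*
 * Usage sketch (illustrative only): fetch the total length of a chain
 * and a pointer to its final mbuf in one pass, e.g. before appending:
 *
 *	struct mbuf *last;
 *	unsigned int totlen = m_length2(m0, &last);
 *
 * "last" can then be used with M_TRAILINGSPACE() to see how much room
 * remains at the tail of the chain.
 */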
5710
5711 /*
5712 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5713 * and clusters. If allocation fails and this cannot be completed, NULL will
5714 * be returned, but the passed-in chain will be unchanged. Upon success,
5715 * the original chain will be freed, and the new chain will be returned.
5716 *
5717 * If an mbuf without a packet header is passed in, the original mbuf
5718 * chain will be returned unharmed.
5719 *
5720 * If an offset is specified, the first mbuf in the chain will have a
5721 * leading space of the amount given by the "off" parameter.
5722 *
5723 * This routine requires that the m_pkthdr.header field of the original
5724 * mbuf chain is cleared by the caller.
5725 */
5726 struct mbuf *
5727 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5728 {
5729 struct mbuf *m_new = NULL, *m_final = NULL;
5730 int progress = 0, length, pktlen;
5731
5732 if (!(m0->m_flags & M_PKTHDR))
5733 return (m0);
5734
5735 VERIFY(off < MHLEN);
5736 m_fixhdr(m0); /* Needed sanity check */
5737
5738 pktlen = m0->m_pkthdr.len + off;
5739 if (pktlen > MHLEN)
5740 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5741 else
5742 m_final = m_gethdr(how, MT_DATA);
5743
5744 if (m_final == NULL)
5745 goto nospace;
5746
5747 if (off > 0) {
5748 pktlen -= off;
5749 m_final->m_len -= off;
5750 m_final->m_data += off;
5751 }
5752
5753 /*
5754 * The caller must have handled the contents pointed to by this
5755 * pointer before coming here, as otherwise it will point to
5756 * the original mbuf, which will be freed upon success.
5757 */
5758 VERIFY(m0->m_pkthdr.header == NULL);
5759
5760 if (m_dup_pkthdr(m_final, m0, how) == 0)
5761 goto nospace;
5762
5763 m_new = m_final;
5764
5765 while (progress < pktlen) {
5766 length = pktlen - progress;
5767 if (length > MCLBYTES)
5768 length = MCLBYTES;
5769
5770 if (m_new == NULL) {
5771 if (length > MLEN)
5772 m_new = m_getcl(how, MT_DATA, 0);
5773 else
5774 m_new = m_get(how, MT_DATA);
5775 if (m_new == NULL)
5776 goto nospace;
5777 }
5778
5779 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5780 progress += length;
5781 m_new->m_len = length;
5782 if (m_new != m_final)
5783 m_cat(m_final, m_new);
5784 m_new = NULL;
5785 }
5786 m_freem(m0);
5787 m0 = m_final;
5788 return (m0);
5789 nospace:
5790 if (m_final)
5791 m_freem(m_final);
5792 return (NULL);
5793 }
5794
5795 struct mbuf *
5796 m_defrag(struct mbuf *m0, int how)
5797 {
5798 return (m_defrag_offset(m0, 0, how));
5799 }
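
/*
 * Usage sketch (illustrative only): collapse a long chain into as few
 * mbufs/clusters as possible, e.g. for hardware with a small
 * scatter-gather limit:
 *
 *	struct mbuf *n = m_defrag(m0, M_DONTWAIT);
 *	if (n != NULL)
 *		m0 = n;
 *
 * On success the original chain has already been freed; on failure
 * (NULL) the original chain is left untouched and is still owned by
 * the caller.
 */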
5800
5801 void
5802 m_mchtype(struct mbuf *m, int t)
5803 {
5804 mtype_stat_inc(t);
5805 mtype_stat_dec(m->m_type);
5806 (m)->m_type = t;
5807 }
5808
5809 void *
5810 m_mtod(struct mbuf *m)
5811 {
5812 return (MTOD(m, void *));
5813 }
5814
5815 struct mbuf *
5816 m_dtom(void *x)
5817 {
5818 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5819 }
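
/*
 * Illustrative note: m_dtom() relies on mbufs being allocated on
 * MSIZE-aligned boundaries, so masking off the low bits of a pointer
 * into the mbuf's internal storage recovers the mbuf header itself.
 * It is only meaningful for data stored inside the mbuf; a pointer
 * into an external cluster does not map back to its owning mbuf.
 */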
5820
5821 void
5822 m_mcheck(struct mbuf *m)
5823 {
5824 _MCHECK(m);
5825 }
5826
5827 /*
5828 * Return a pointer to mbuf/offset of location in mbuf chain.
5829 */
5830 struct mbuf *
5831 m_getptr(struct mbuf *m, int loc, int *off)
5832 {
5833
5834 while (loc >= 0) {
5835 /* Normal end of search. */
5836 if (m->m_len > loc) {
5837 *off = loc;
5838 return (m);
5839 } else {
5840 loc -= m->m_len;
5841 if (m->m_next == NULL) {
5842 if (loc == 0) {
5843 /* Point at the end of valid data. */
5844 *off = m->m_len;
5845 return (m);
5846 }
5847 return (NULL);
5848 }
5849 m = m->m_next;
5850 }
5851 }
5852 return (NULL);
5853 }
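
/*
 * Usage sketch (illustrative only): locate the byte at offset 100 of a
 * chain, e.g. to patch a field in place:
 *
 *	int off;
 *	struct mbuf *n = m_getptr(m0, 100, &off);
 *	if (n != NULL && off < n->m_len)
 *		*(mtod(n, u_int8_t *) + off) = 0;
 *
 * The extra "off < n->m_len" check matters because m_getptr() may
 * return the end-of-data position when the offset equals the chain
 * length.
 */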
5854
5855 /*
5856 * Inform the corresponding mcache(s) that there's a waiter below.
5857 */
5858 static void
5859 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5860 {
5861 mcache_waiter_inc(m_cache(class));
5862 if (comp) {
5863 if (class == MC_CL) {
5864 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5865 } else if (class == MC_BIGCL) {
5866 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5867 } else if (class == MC_16KCL) {
5868 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5869 } else {
5870 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5871 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5872 }
5873 }
5874 }
5875
5876 /*
5877 * Inform the corresponding mcache(s) that there's no more waiter below.
5878 */
5879 static void
5880 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5881 {
5882 mcache_waiter_dec(m_cache(class));
5883 if (comp) {
5884 if (class == MC_CL) {
5885 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5886 } else if (class == MC_BIGCL) {
5887 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5888 } else if (class == MC_16KCL) {
5889 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5890 } else {
5891 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5892 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5893 }
5894 }
5895 }
5896
5897 /*
5898 * Called during slab (blocking and non-blocking) allocation. If there
5899 * is at least one waiter, and the time since the first waiter was blocked
5900 * is greater than the watchdog timeout, panic the system.
5901 */
5902 static void
5903 mbuf_watchdog(void)
5904 {
5905 struct timeval now;
5906 unsigned int since;
5907
5908 if (mb_waiters == 0 || !mb_watchdog)
5909 return;
5910
5911 microuptime(&now);
5912 since = now.tv_sec - mb_wdtstart.tv_sec;
5913 if (since >= MB_WDT_MAXTIME) {
5914 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
5915 mb_waiters, since, mbuf_dump());
5916 /* NOTREACHED */
5917 }
5918 }
5919
5920 /*
5921 * Called during blocking allocation. Returns TRUE if one or more objects
5922 * are available at the per-CPU cache layer and that allocation should be
5923 * retried at that level.
5924 */
5925 static boolean_t
5926 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5927 {
5928 boolean_t mcache_retry = FALSE;
5929
5930 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5931
5932 /* Check if there's anything at the cache layer */
5933 if (mbuf_cached_above(class, wait)) {
5934 mcache_retry = TRUE;
5935 goto done;
5936 }
5937
5938 /* Nothing? Then try hard to get it from somewhere */
5939 m_reclaim(class, num, (wait & MCR_COMP));
5940
5941 /* We tried hard and got something? */
5942 if (m_infree(class) > 0) {
5943 mbstat.m_wait++;
5944 goto done;
5945 } else if (mbuf_cached_above(class, wait)) {
5946 mbstat.m_wait++;
5947 mcache_retry = TRUE;
5948 goto done;
5949 } else if (wait & MCR_TRYHARD) {
5950 mcache_retry = TRUE;
5951 goto done;
5952 }
5953
5954 /*
5955 * There's really nothing for us right now; inform the
5956 * cache(s) that there is a waiter below and go to sleep.
5957 */
5958 mbuf_waiter_inc(class, (wait & MCR_COMP));
5959
5960 VERIFY(!(wait & MCR_NOSLEEP));
5961
5962 /*
5963 * If this is the first waiter, arm the watchdog timer. Otherwise
5964 * check if we need to panic the system due to watchdog timeout.
5965 */
5966 if (mb_waiters == 0)
5967 microuptime(&mb_wdtstart);
5968 else
5969 mbuf_watchdog();
5970
5971 mb_waiters++;
5972 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5973
5974 /* We are now up; stop getting notified until next round */
5975 mbuf_waiter_dec(class, (wait & MCR_COMP));
5976
5977 /* We waited and got something */
5978 if (m_infree(class) > 0) {
5979 mbstat.m_wait++;
5980 goto done;
5981 } else if (mbuf_cached_above(class, wait)) {
5982 mbstat.m_wait++;
5983 mcache_retry = TRUE;
5984 }
5985 done:
5986 return (mcache_retry);
5987 }
5988
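/*
 * Worker thread that grows the mbuf and cluster freelists on demand.
 * It sleeps on mbuf_worker_run and, when woken, populates the 2 KB,
 * 4 KB and 16 KB cluster pools according to the pending
 * mbuf_expand_mcl/mbuf_expand_big/mbuf_expand_16k requests, keeping
 * the mbuf count at least on par with the cluster count.
 */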
5989 static void
5990 mbuf_worker_thread(void)
5991 {
5992 int mbuf_expand;
5993
5994 while (1) {
5995 lck_mtx_lock(mbuf_mlock);
5996
5997 mbuf_expand = 0;
5998 if (mbuf_expand_mcl) {
5999 int n;
6000
6001 /* Adjust to the current number of 2 KB clusters in use */
6002 n = mbuf_expand_mcl -
6003 (m_total(MC_CL) - m_infree(MC_CL));
6004 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6005 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6006 mbuf_expand_mcl = 0;
6007
6008 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6009 mbuf_expand++;
6010 }
6011 if (mbuf_expand_big) {
6012 int n;
6013
6014 /* Adjust to the current number of 4 KB clusters in use */
6015 n = mbuf_expand_big -
6016 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6017 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6018 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6019 mbuf_expand_big = 0;
6020
6021 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6022 mbuf_expand++;
6023 }
6024 if (mbuf_expand_16k) {
6025 int n;
6026
6027 /* Adjust to the current number of 16 KB clusters in use */
6028 n = mbuf_expand_16k -
6029 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6030 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6031 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6032 mbuf_expand_16k = 0;
6033
6034 if (n > 0)
6035 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6036 }
6037
6038 /*
6039 * Because we can run out of memory before filling the mbuf
6040 * map, we should not allocate more clusters than there are
6041 * mbufs -- otherwise we could have a large number of useless
6042 * clusters allocated.
6043 */
6044 if (mbuf_expand) {
6045 while (m_total(MC_MBUF) <
6046 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6047 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6048 break;
6049 }
6050 }
6051
6052 lck_mtx_unlock(mbuf_mlock);
6053
6054 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6055 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6056 }
6057 }
6058
6059 static void
6060 mbuf_worker_thread_init(void)
6061 {
6062 mbuf_worker_ready++;
6063 mbuf_worker_thread();
6064 }
6065
6066 static mcl_slab_t *
6067 slab_get(void *buf)
6068 {
6069 mcl_slabg_t *slg;
6070 unsigned int ix, k;
6071
6072 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6073
6074 VERIFY(MBUF_IN_MAP(buf));
6075 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6076 VERIFY(ix < maxslabgrp);
6077
6078 if ((slg = slabstbl[ix]) == NULL) {
6079 /*
6080 * In the current implementation, we never shrink the memory
6081 * pool (hence the cluster map); if we attempt to reallocate
6082 * a cluster group when it's already allocated, panic since
6083 * this is a sign of memory corruption (slabstbl[ix] got
6084 * nullified). This also means that there shouldn't be any
6085 * hole in the kernel sub-map for the mbuf pool.
6086 */
6087 ++slabgrp;
6088 VERIFY(ix < slabgrp);
6089 /*
6090 * Slab expansion can only be done single-threaded; when
6091 * we get here, it must be as a result of m_clalloc() which
6092 * is serialized and therefore mb_clalloc_busy must be set.
6093 */
6094 VERIFY(mb_clalloc_busy);
6095 lck_mtx_unlock(mbuf_mlock);
6096
6097 /* This is a new buffer; create the slab group for it */
6098 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6099 M_WAITOK | M_ZERO);
6100 VERIFY(slg != NULL);
6101
6102 lck_mtx_lock(mbuf_mlock);
6103 /*
6104 * No other thread could have gone into m_clalloc() after
6105 * we dropped the lock above, so verify that it's true.
6106 */
6107 VERIFY(mb_clalloc_busy);
6108
6109 slabstbl[ix] = slg;
6110
6111 /* Chain each slab in the group to its forward neighbor */
6112 for (k = 1; k < NSLABSPMB; k++)
6113 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6114 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6115
6116 /* And chain the last slab in the previous group to this */
6117 if (ix > 0) {
6118 VERIFY(slabstbl[ix - 1]->
6119 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6120 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6121 &slg->slg_slab[0];
6122 }
6123 }
6124
6125 ix = MTOBG(buf) % NSLABSPMB;
6126 VERIFY(ix < NSLABSPMB);
6127
6128 return (&slg->slg_slab[ix]);
6129 }
6130
6131 static void
6132 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6133 void *base, void *head, unsigned int len, int refcnt, int chunks)
6134 {
6135 sp->sl_class = class;
6136 sp->sl_flags = flags;
6137 sp->sl_base = base;
6138 sp->sl_head = head;
6139 sp->sl_len = len;
6140 sp->sl_refcnt = refcnt;
6141 sp->sl_chunks = chunks;
6142 slab_detach(sp);
6143 }
6144
6145 static void
6146 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6147 {
6148 VERIFY(slab_is_detached(sp));
6149 m_slab_cnt(class)++;
6150 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6151 sp->sl_flags &= ~SLF_DETACHED;
6152 if (class == MC_16KCL) {
6153 int k;
6154 for (k = 1; k < NSLABSP16KB; k++) {
6155 sp = sp->sl_next;
6156 /* Next slab must already be present */
6157 VERIFY(sp != NULL);
6158 VERIFY(slab_is_detached(sp));
6159 sp->sl_flags &= ~SLF_DETACHED;
6160 }
6161 }
6162 }
6163
6164 static void
6165 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6166 {
6167 VERIFY(!slab_is_detached(sp));
6168 VERIFY(m_slab_cnt(class) > 0);
6169 m_slab_cnt(class)--;
6170 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6171 slab_detach(sp);
6172 if (class == MC_16KCL) {
6173 int k;
6174 for (k = 1; k < NSLABSP16KB; k++) {
6175 sp = sp->sl_next;
6176 /* Next slab must already be present */
6177 VERIFY(sp != NULL);
6178 VERIFY(!slab_is_detached(sp));
6179 slab_detach(sp);
6180 }
6181 }
6182 }
6183
6184 static boolean_t
6185 slab_inrange(mcl_slab_t *sp, void *buf)
6186 {
6187 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6188 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6189 }
6190
6191 #undef panic
6192
6193 static void
6194 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6195 {
6196 int i;
6197 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6198 uintptr_t buf = (uintptr_t)sp->sl_base;
6199
6200 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6201 void *next = ((mcache_obj_t *)buf)->obj_next;
6202 if (next != addr)
6203 continue;
6204 if (!mclverify) {
6205 if (next != NULL && !MBUF_IN_MAP(next)) {
6206 mcache_t *cp = m_cache(sp->sl_class);
6207 panic("%s: %s buffer %p in slab %p modified "
6208 "after free at offset 0: %p out of range "
6209 "[%p-%p)\n", __func__, cp->mc_name,
6210 (void *)buf, sp, next, mbutl, embutl);
6211 /* NOTREACHED */
6212 }
6213 } else {
6214 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6215 (mcache_obj_t *)buf);
6216 mcl_audit_verify_nextptr(next, mca);
6217 }
6218 }
6219 }
6220
6221 static void
6222 slab_detach(mcl_slab_t *sp)
6223 {
6224 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6225 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6226 sp->sl_flags |= SLF_DETACHED;
6227 }
6228
6229 static boolean_t
6230 slab_is_detached(mcl_slab_t *sp)
6231 {
6232 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6233 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6234 (sp->sl_flags & SLF_DETACHED));
6235 }
6236
6237 static void
6238 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6239 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6240 {
6241 mcache_audit_t *mca, *mca_tail;
6242 mcache_obj_t *con = NULL;
6243 boolean_t save_contents = (con_list != NULL);
6244 unsigned int i, ix;
6245
6246 ASSERT(num <= NMBPBG);
6247 ASSERT(con_list == NULL || con_size != 0);
6248
6249 ix = MTOBG(buf);
6250 VERIFY(ix < maxclaudit);
6251
6252 /* Make sure we haven't been here before */
6253 for (i = 0; i < NMBPBG; i++)
6254 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6255
6256 mca = mca_tail = *mca_list;
6257 if (save_contents)
6258 con = *con_list;
6259
6260 for (i = 0; i < num; i++) {
6261 mcache_audit_t *next;
6262
6263 next = mca->mca_next;
6264 bzero(mca, sizeof (*mca));
6265 mca->mca_next = next;
6266 mclaudit[ix].cl_audit[i] = mca;
6267
6268 /* Attach the contents buffer if requested */
6269 if (save_contents) {
6270 VERIFY(con != NULL);
6271 mca->mca_contents_size = con_size;
6272 mca->mca_contents = con;
6273 con = con->obj_next;
6274 bzero(mca->mca_contents, mca->mca_contents_size);
6275 }
6276
6277 mca_tail = mca;
6278 mca = mca->mca_next;
6279 }
6280
6281 if (save_contents)
6282 *con_list = con;
6283
6284 *mca_list = mca_tail->mca_next;
6285 mca_tail->mca_next = NULL;
6286 }
6287
6288 /*
6289 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6290 * the corresponding audit structure for that buffer.
6291 */
6292 static mcache_audit_t *
6293 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6294 {
6295 mcache_audit_t *mca = NULL;
6296 int ix = MTOBG(o);
6297
6298 VERIFY(ix < maxclaudit);
6299 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6300
6301 switch (class) {
6302 case MC_MBUF:
6303 /*
6304 * For the mbuf case, find the index of the page
6305 * used by the mbuf and use that index to locate the
6306 * base address of the page. Then find out the
6307 * mbuf index relative to the page base and use
6308 * it to locate the audit structure.
6309 */
6310 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6311 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6312 break;
6313
6314 case MC_CL:
6315 /*
6316 * Same thing as above, but for 2KB clusters in a page.
6317 */
6318 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6319 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6320 break;
6321
6322 case MC_BIGCL:
6323 case MC_16KCL:
6324 /*
6325 * Same as above, but only return the first element.
6326 */
6327 mca = mclaudit[ix].cl_audit[0];
6328 break;
6329
6330 default:
6331 VERIFY(0);
6332 /* NOTREACHED */
6333 }
6334
6335 return (mca);
6336 }
6337
6338 static void
6339 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6340 boolean_t alloc)
6341 {
6342 struct mbuf *m = addr;
6343 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6344
6345 VERIFY(mca->mca_contents != NULL &&
6346 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6347
6348 if (mclverify)
6349 mcl_audit_verify_nextptr(next, mca);
6350
6351 if (!alloc) {
6352 /* Save constructed mbuf fields */
6353 mcl_audit_save_mbuf(m, mca);
6354 if (mclverify) {
6355 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6356 m_maxsize(MC_MBUF));
6357 }
6358 ((mcache_obj_t *)m)->obj_next = next;
6359 return;
6360 }
6361
6362 /* Check if the buffer has been corrupted while in freelist */
6363 if (mclverify) {
6364 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6365 }
6366 /* Restore constructed mbuf fields */
6367 mcl_audit_restore_mbuf(m, mca, composite);
6368 }
6369
6370 static void
6371 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6372 {
6373 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
6374
6375 if (composite) {
6376 struct mbuf *next = m->m_next;
6377 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6378 MBUF_IS_COMPOSITE(ms));
6379 /*
6380 * We could have hand-picked the mbuf fields and restored
6381 * them individually, but that would be a maintenance
6382 * headache. Instead, restore everything that was saved;
6383 * the mbuf layer will recheck and reinitialize anyway.
6384 */
6385 bcopy(ms, m, mca->mca_contents_size);
6386 m->m_next = next;
6387 } else {
6388 /*
6389 * For a regular mbuf (no cluster attached) there's nothing
6390 * to restore other than the type field, which is expected
6391 * to be MT_FREE.
6392 */
6393 m->m_type = ms->m_type;
6394 }
6395 _MCHECK(m);
6396 }
6397
6398 static void
6399 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6400 {
6401 _MCHECK(m);
6402 bcopy(m, mca->mca_contents, mca->mca_contents_size);
6403 }
6404
6405 static void
6406 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6407 boolean_t save_next)
6408 {
6409 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6410
6411 if (!alloc) {
6412 if (mclverify) {
6413 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6414 }
6415 if (save_next) {
6416 mcl_audit_verify_nextptr(next, mca);
6417 ((mcache_obj_t *)addr)->obj_next = next;
6418 }
6419 } else if (mclverify) {
6420 /* Check if the buffer has been corrupted while in freelist */
6421 mcl_audit_verify_nextptr(next, mca);
6422 mcache_audit_free_verify_set(mca, addr, 0, size);
6423 }
6424 }
6425
6426 static void
6427 mcl_audit_mcheck_panic(struct mbuf *m)
6428 {
6429 mcache_audit_t *mca;
6430
6431 MRANGE(m);
6432 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6433
6434 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6435 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6436 /* NOTREACHED */
6437 }
6438
6439 static void
6440 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6441 {
6442 if (next != NULL && !MBUF_IN_MAP(next) &&
6443 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6444 panic("mcl_audit: buffer %p modified after free at offset 0: "
6445 "%p out of range [%p-%p)\n%s\n",
6446 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6447 /* NOTREACHED */
6448 }
6449 }
6450
6451 /* This function turns on mbuf leak detection */
6452 static void
6453 mleak_activate(void)
6454 {
6455 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6456 PE_parse_boot_argn("mleak_sample_factor",
6457 &mleak_table.mleak_sample_factor,
6458 sizeof (mleak_table.mleak_sample_factor));
6459
6460 if (mleak_table.mleak_sample_factor == 0)
6461 mclfindleak = 0;
6462
6463 if (mclfindleak == 0)
6464 return;
6465
6466 vm_size_t alloc_size =
6467 mleak_alloc_buckets * sizeof (struct mallocation);
6468 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6469
6470 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6471 M_TEMP, M_WAITOK | M_ZERO);
6472 VERIFY(mleak_allocations != NULL);
6473
6474 MALLOC(mleak_traces, struct mtrace *, trace_size,
6475 M_TEMP, M_WAITOK | M_ZERO);
6476 VERIFY(mleak_traces != NULL);
6477
6478 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6479 M_TEMP, M_WAITOK | M_ZERO);
6480 VERIFY(mleak_stat != NULL);
6481 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6482 #ifdef __LP64__
6483 mleak_stat->ml_isaddr64 = 1;
6484 #endif /* __LP64__ */
6485 }
6486
6487 static void
6488 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6489 {
6490 int temp;
6491
6492 if (mclfindleak == 0)
6493 return;
6494
6495 if (!alloc)
6496 return (mleak_free(addr));
6497
6498 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6499
6500 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6501 uintptr_t bt[MLEAK_STACK_DEPTH];
6502 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6503 mleak_log(bt, addr, logged, num);
6504 }
6505 }
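
/*
 * Illustrative note: only one in every mleak_table.mleak_sample_factor
 * allocations is backtraced and handed to mleak_log(), which keeps the
 * overhead of leak detection low.  For example, with a (hypothetical)
 * factor of 500, roughly 0.2% of allocations end up in the tables; the
 * factor can be tuned with the "mleak_sample_factor" boot-arg parsed in
 * mleak_activate() above.
 */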
6506
6507 /*
6508 * This function records the allocation in the mleak_allocations table
6509 * and the backtrace in the mleak_traces table.  If the allocation slot is
6510 * in use, the old record is replaced; if the trace slot holds a different
6511 * trace, the function bails out (the refcount is bumped if it is the same).
6512 */
6513 static boolean_t
6514 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6515 {
6516 struct mallocation *allocation;
6517 struct mtrace *trace;
6518 uint32_t trace_index;
6519
6520 /* Quit if someone else is modifying the tables */
6521 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6522 mleak_table.total_conflicts++;
6523 return (FALSE);
6524 }
6525
6526 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6527 mleak_alloc_buckets)];
6528 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6529 trace = &mleak_traces[trace_index];
6530
6531 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6532 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6533
6534 allocation->hitcount++;
6535 trace->hitcount++;
6536
6537 /*
6538 * If the allocation bucket we want is occupied
6539 * and the occupier has the same trace, just bail.
6540 */
6541 if (allocation->element != NULL &&
6542 trace_index == allocation->trace_index) {
6543 mleak_table.alloc_collisions++;
6544 lck_mtx_unlock(mleak_lock);
6545 return (TRUE);
6546 }
6547
6548 /*
6549 * Store the backtrace in the traces array;
6550 * an allocs count of zero means the trace bucket is free.
6551 */
6552 if (trace->allocs > 0 &&
6553 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6554 /* Different, unique trace, but the same hash! Bail out. */
6555 trace->collisions++;
6556 mleak_table.trace_collisions++;
6557 lck_mtx_unlock(mleak_lock);
6558 return (TRUE);
6559 } else if (trace->allocs > 0) {
6560 /* Same trace, already added, so increment refcount */
6561 trace->allocs++;
6562 } else {
6563 /* Found an unused trace bucket, so record the trace here */
6564 if (trace->depth != 0) {
6565 /* this slot was previously used but is not currently in use */
6566 mleak_table.trace_overwrites++;
6567 }
6568 mleak_table.trace_recorded++;
6569 trace->allocs = 1;
6570 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6571 trace->depth = depth;
6572 trace->collisions = 0;
6573 }
6574
6575 /* Step 2: Store the allocation record in the allocations array */
6576 if (allocation->element != NULL) {
6577 /*
6578 * Replace an existing allocation. No need to preserve it
6579 * because only a subset of the allocations are being
6580 * recorded anyway.
6581 */
6582 mleak_table.alloc_collisions++;
6583 } else if (allocation->trace_index != 0) {
6584 mleak_table.alloc_overwrites++;
6585 }
6586 allocation->element = addr;
6587 allocation->trace_index = trace_index;
6588 allocation->count = num;
6589 mleak_table.alloc_recorded++;
6590 mleak_table.outstanding_allocs++;
6591
6592 lck_mtx_unlock(mleak_lock);
6593 return (TRUE);
6594 }
6595
6596 static void
6597 mleak_free(mcache_obj_t *addr)
6598 {
6599 while (addr != NULL) {
6600 struct mallocation *allocation = &mleak_allocations
6601 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6602
6603 if (allocation->element == addr &&
6604 allocation->trace_index < mleak_trace_buckets) {
6605 lck_mtx_lock_spin(mleak_lock);
6606 if (allocation->element == addr &&
6607 allocation->trace_index < mleak_trace_buckets) {
6608 struct mtrace *trace;
6609 trace = &mleak_traces[allocation->trace_index];
6610 /* allocs = 0 means trace bucket is unused */
6611 if (trace->allocs > 0)
6612 trace->allocs--;
6613 if (trace->allocs == 0)
6614 trace->depth = 0;
6615 /* NULL element means alloc bucket is unused */
6616 allocation->element = NULL;
6617 mleak_table.outstanding_allocs--;
6618 }
6619 lck_mtx_unlock(mleak_lock);
6620 }
6621 addr = addr->obj_next;
6622 }
6623 }
6624
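/*
 * Keep the traces with the most outstanding allocations at the front of
 * mleak_top_trace[] (at most MLEAK_NUM_TRACES entries, in descending
 * order of allocs), using a simple insertion sort over the trace table.
 */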
6625 static void
6626 mleak_sort_traces(void)
6627 {
6628 int i, j, k;
6629 struct mtrace *swap;
6630
6631 for (i = 0; i < MLEAK_NUM_TRACES; i++)
6632 mleak_top_trace[i] = NULL;
6633
6634 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6635 {
6636 if (mleak_traces[i].allocs <= 0)
6637 continue;
6638
6639 mleak_top_trace[j] = &mleak_traces[i];
6640 for (k = j; k > 0; k--) {
6641 if (mleak_top_trace[k]->allocs <=
6642 mleak_top_trace[k-1]->allocs)
6643 break;
6644
6645 swap = mleak_top_trace[k-1];
6646 mleak_top_trace[k-1] = mleak_top_trace[k];
6647 mleak_top_trace[k] = swap;
6648 }
6649 j++;
6650 }
6651
6652 j--;
6653 for (; i < mleak_trace_buckets; i++) {
6654 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6655 continue;
6656
6657 mleak_top_trace[j] = &mleak_traces[i];
6658
6659 for (k = j; k > 0; k--) {
6660 if (mleak_top_trace[k]->allocs <=
6661 mleak_top_trace[k-1]->allocs)
6662 break;
6663
6664 swap = mleak_top_trace[k-1];
6665 mleak_top_trace[k-1] = mleak_top_trace[k];
6666 mleak_top_trace[k] = swap;
6667 }
6668 }
6669 }
6670
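/*
 * Refresh mleak_stat (the exported statistics snapshot) from the current
 * top traces, for consumers such as mbuf_dump() below.
 */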
6671 static void
6672 mleak_update_stats(void)
6673 {
6674 mleak_trace_stat_t *mltr;
6675 int i;
6676
6677 VERIFY(mleak_stat != NULL);
6678 #ifdef __LP64__
6679 VERIFY(mleak_stat->ml_isaddr64);
6680 #else
6681 VERIFY(!mleak_stat->ml_isaddr64);
6682 #endif /* !__LP64__ */
6683 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6684
6685 mleak_sort_traces();
6686
6687 mltr = &mleak_stat->ml_trace[0];
6688 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6689 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6690 int j;
6691
6692 if (mleak_top_trace[i] == NULL ||
6693 mleak_top_trace[i]->allocs == 0)
6694 continue;
6695
6696 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
6697 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
6698 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
6699 mltr->mltr_depth = mleak_top_trace[i]->depth;
6700
6701 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6702 for (j = 0; j < mltr->mltr_depth; j++)
6703 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6704
6705 mltr++;
6706 }
6707 }
6708
6709 static struct mbtypes {
6710 int mt_type;
6711 const char *mt_name;
6712 } mbtypes[] = {
6713 { MT_DATA, "data" },
6714 { MT_OOBDATA, "oob data" },
6715 { MT_CONTROL, "ancillary data" },
6716 { MT_HEADER, "packet headers" },
6717 { MT_SOCKET, "socket structures" },
6718 { MT_PCB, "protocol control blocks" },
6719 { MT_RTABLE, "routing table entries" },
6720 { MT_HTABLE, "IMP host table entries" },
6721 { MT_ATABLE, "address resolution tables" },
6722 { MT_FTABLE, "fragment reassembly queue headers" },
6723 { MT_SONAME, "socket names and addresses" },
6724 { MT_SOOPTS, "socket options" },
6725 { MT_RIGHTS, "access rights" },
6726 { MT_IFADDR, "interface addresses" },
6727 { MT_TAG, "packet tags" },
6728 { 0, NULL }
6729 };
6730
6731 #define MBUF_DUMP_BUF_CHK() { \
6732 clen -= k; \
6733 if (clen < 1) \
6734 goto done; \
6735 c += k; \
6736 }
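
/*
 * Illustrative note: each snprintf() in mbuf_dump() below is followed by
 * MBUF_DUMP_BUF_CHK(), which consumes "k" bytes of the remaining space
 * "clen", bails out to the "done" label once the buffer is effectively
 * full, and otherwise advances the output cursor "c".
 */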
6737
6738 static char *
6739 mbuf_dump(void)
6740 {
6741 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6742 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6743 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6744 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6745 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6746 uint8_t seen[256];
6747 struct mbtypes *mp;
6748 mb_class_stat_t *sp;
6749 mleak_trace_stat_t *mltr;
6750 char *c = mbuf_dump_buf;
6751 int i, k, clen = MBUF_DUMP_BUF_SIZE;
6752
6753 mbuf_dump_buf[0] = '\0';
6754
6755 /* synchronize all statistics in the mbuf table */
6756 mbuf_stat_sync();
6757 mbuf_mtypes_sync(TRUE);
6758
6759 sp = &mb_stat->mbs_class[0];
6760 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6761 u_int32_t mem;
6762
6763 if (m_class(i) == MC_MBUF) {
6764 m_mbufs = sp->mbcl_active;
6765 } else if (m_class(i) == MC_CL) {
6766 m_clfree = sp->mbcl_total - sp->mbcl_active;
6767 } else if (m_class(i) == MC_BIGCL) {
6768 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6769 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6770 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6771 m_16kclusters = sp->mbcl_total;
6772 } else if (m_class(i) == MC_MBUF_CL) {
6773 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6774 } else if (m_class(i) == MC_MBUF_BIGCL) {
6775 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6776 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6777 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6778 }
6779
6780 mem = sp->mbcl_ctotal * sp->mbcl_size;
6781 totmem += mem;
6782 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6783 sp->mbcl_size;
6784
6785 }
6786
6787 /* adjust free counts to include composite caches */
6788 m_clfree += m_mbufclfree;
6789 m_bigclfree += m_mbufbigclfree;
6790 m_16kclfree += m_mbuf16kclfree;
6791
6792 totmbufs = 0;
6793 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6794 totmbufs += mbstat.m_mtypes[mp->mt_type];
6795 if (totmbufs > m_mbufs)
6796 totmbufs = m_mbufs;
6797 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6798 MBUF_DUMP_BUF_CHK();
6799
6800 bzero(&seen, sizeof (seen));
6801 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6802 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6803 seen[mp->mt_type] = 1;
6804 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6805 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6806 MBUF_DUMP_BUF_CHK();
6807 }
6808 }
6809 seen[MT_FREE] = 1;
6810 for (i = 0; i < nmbtypes; i++)
6811 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6812 k = snprintf(c, clen, "\t%u mbufs allocated to "
6813 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6814 MBUF_DUMP_BUF_CHK();
6815 }
6816 if ((m_mbufs - totmbufs) > 0) {
6817 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6818 m_mbufs - totmbufs);
6819 MBUF_DUMP_BUF_CHK();
6820 }
6821 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6822 "%u/%u mbuf 4KB clusters in use\n",
6823 (unsigned int)(mbstat.m_clusters - m_clfree),
6824 (unsigned int)mbstat.m_clusters,
6825 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6826 (unsigned int)mbstat.m_bigclusters);
6827 MBUF_DUMP_BUF_CHK();
6828
6829 if (njcl > 0) {
6830 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6831 m_16kclusters - m_16kclfree, m_16kclusters,
6832 njclbytes / 1024);
6833 MBUF_DUMP_BUF_CHK();
6834 }
6835 totused = totmem - totfree;
6836 if (totmem == 0) {
6837 totpct = 0;
6838 } else if (totused < (ULONG_MAX / 100)) {
6839 totpct = (totused * 100) / totmem;
6840 } else {
6841 u_long totmem1 = totmem / 100;
6842 u_long totused1 = totused / 100;
6843 totpct = (totused1 * 100) / totmem1;
6844 }
6845 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
6846 "in use)\n", totmem / 1024, totpct);
6847 MBUF_DUMP_BUF_CHK();
6848
6849 /* mbuf leak detection statistics */
6850 mleak_update_stats();
6851
6852 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
6853 MBUF_DUMP_BUF_CHK();
6854 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
6855 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
6856 mleak_table.mleak_sample_factor);
6857 MBUF_DUMP_BUF_CHK();
6858 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
6859 mleak_table.outstanding_allocs);
6860 MBUF_DUMP_BUF_CHK();
6861 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
6862 mleak_table.alloc_recorded, mleak_table.trace_recorded);
6863 MBUF_DUMP_BUF_CHK();
6864 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
6865 mleak_table.alloc_collisions, mleak_table.trace_collisions);
6866 MBUF_DUMP_BUF_CHK();
6867 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
6868 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
6869 MBUF_DUMP_BUF_CHK();
6870 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
6871 mleak_table.total_conflicts);
6872 MBUF_DUMP_BUF_CHK();
6873
6874 k = snprintf(c, clen, "top %d outstanding traces:\n",
6875 mleak_stat->ml_cnt);
6876 MBUF_DUMP_BUF_CHK();
6877 for (i = 0; i < mleak_stat->ml_cnt; i++) {
6878 mltr = &mleak_stat->ml_trace[i];
6879 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
6880 "%llu hit(s), %llu collision(s)\n", (i + 1),
6881 mltr->mltr_allocs, mltr->mltr_hitcount,
6882 mltr->mltr_collisions);
6883 MBUF_DUMP_BUF_CHK();
6884 }
6885
6886 if (mleak_stat->ml_isaddr64)
6887 k = snprintf(c, clen, MB_LEAK_HDR_64);
6888 else
6889 k = snprintf(c, clen, MB_LEAK_HDR_32);
6890 MBUF_DUMP_BUF_CHK();
6891
6892 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
6893 int j;
6894 k = snprintf(c, clen, "%2d: ", (i + 1));
6895 MBUF_DUMP_BUF_CHK();
6896 for (j = 0; j < mleak_stat->ml_cnt; j++) {
6897 mltr = &mleak_stat->ml_trace[j];
6898 if (i < mltr->mltr_depth) {
6899 if (mleak_stat->ml_isaddr64) {
6900 k = snprintf(c, clen, "0x%0llx ",
6901 mltr->mltr_addr[i]);
6902 } else {
6903 k = snprintf(c, clen,
6904 "0x%08x ",
6905 (u_int32_t)mltr->mltr_addr[i]);
6906 }
6907 } else {
6908 if (mleak_stat->ml_isaddr64)
6909 k = snprintf(c, clen,
6910 MB_LEAK_SPACING_64);
6911 else
6912 k = snprintf(c, clen,
6913 MB_LEAK_SPACING_32);
6914 }
6915 MBUF_DUMP_BUF_CHK();
6916 }
6917 k = snprintf(c, clen, "\n");
6918 MBUF_DUMP_BUF_CHK();
6919 }
6920 done:
6921 return (mbuf_dump_buf);
6922 }
6923
6924 #undef MBUF_DUMP_BUF_CHK
6925
6926 SYSCTL_DECL(_kern_ipc);
6927 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
6928 CTLFLAG_RD | CTLFLAG_LOCKED,
6929 0, 0, mbstat_sysctl, "S,mbstat", "");
6930 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
6931 CTLFLAG_RD | CTLFLAG_LOCKED,
6932 0, 0, mb_stat_sysctl, "S,mb_stat", "");
6933 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
6934 CTLFLAG_RD | CTLFLAG_LOCKED,
6935 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
6936 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
6937 CTLFLAG_RD | CTLFLAG_LOCKED,
6938 0, 0, mleak_table_sysctl, "S,mleak_table", "");
6939 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
6940 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
6941 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
6942 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
6943 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
6944 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");