apple/xnu (xnu-1699.26.8) - bsd/kern/uipc_mbuf.c
1 /*
2 * Copyright (c) 1998-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <kern/kern_types.h>
83 #include <kern/simple_lock.h>
84 #include <kern/queue.h>
85 #include <kern/sched_prim.h>
86 #include <kern/cpu_number.h>
87 #include <kern/zalloc.h>
88
89 #include <libkern/OSAtomic.h>
90 #include <libkern/libkern.h>
91
92 #include <IOKit/IOMapper.h>
93
94 #include <machine/limits.h>
95 #include <machine/machine_routines.h>
96
97 #if CONFIG_MACF_NET
98 #include <security/mac_framework.h>
99 #endif /* CONFIG_MACF_NET */
100
101 #include <sys/mcache.h>
102
103 /*
104 * MBUF IMPLEMENTATION NOTES.
105 *
106 * There is a total of 5 per-CPU caches:
107 *
108 * MC_MBUF:
109 * This is a cache of rudimentary objects of MSIZE in size; each
110 * object represents an mbuf structure. This cache preserves only
111 * the m_type field of the mbuf during its transactions.
112 *
113 * MC_CL:
114 * This is a cache of rudimentary objects of MCLBYTES in size; each
115 * object represents an mcluster structure. This cache does not
116 * preserve the contents of the objects during its transactions.
117 *
118 * MC_BIGCL:
119 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
120 * object represents an mbigcluster structure. This cache does not
121 * preserve the contents of the objects during its transactions.
122 *
123 * MC_MBUF_CL:
124 * This is a cache of mbufs each having a cluster attached to it.
125 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
126 * fields of the mbuf related to the external cluster are preserved
127 * during transactions.
128 *
129 * MC_MBUF_BIGCL:
130 * This is a cache of mbufs each having a big cluster attached to it.
131 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
132 * fields of the mbuf related to the external cluster are preserved
133 * during transactions.
134 *
135 * OBJECT ALLOCATION:
136 *
137 * Allocation requests are handled first at the per-CPU (mcache) layer
138 * before falling back to the slab layer. Performance is optimal when
139 * the request is satisfied at the CPU layer because global data/lock
140 * never gets accessed. When the slab layer is entered for allocation,
141 * the slab freelist will be checked first for available objects before
142 * the VM backing store is invoked. Slab layer operations are serialized
143 * for all of the caches as the mbuf global lock is held most of the time.
144 * Allocation paths are different depending on the class of objects:
145 *
146 * a. Rudimentary object:
147 *
148 * { m_get_common(), m_clattach(), m_mclget(),
149 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
150 * composite object allocation }
151 * | ^
152 * | |
153 * | +-----------------------+
154 * v |
155 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
156 * | ^
157 * v |
158 * [CPU cache] -------> (found?) -------+
159 * | |
160 * v |
161 * mbuf_slab_alloc() |
162 * | |
163 * v |
164 * +---------> [freelist] -------> (found?) -------+
165 * | |
166 * | v
167 * | m_clalloc()
168 * | |
169 * | v
170 * +---<<---- kmem_mb_alloc()
171 *
172 * b. Composite object:
173 *
174 * { m_getpackets_internal(), m_allocpacket_internal() }
175 * | ^
176 * | |
177 * | +------ (done) ---------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_cslab_alloc() |
186 * | |
187 * v |
188 * [freelist] -------> (found?) -------+
189 * | |
190 * v |
191 * (rudimentary object) |
192 * mcache_alloc/mcache_alloc_ext() ------>>-----+
193 *
194 * Auditing notes: If auditing is enabled, buffers will be subjected to
195 * integrity checks by the audit routine. This is done by verifying their
196 * contents against the DEADBEEF (free) pattern before returning them to the caller.
197 * As part of this step, the routine will also record the transaction and
198 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
199 * also restore any constructed data structure fields if necessary.
200 *
201 * OBJECT DEALLOCATION:
202 *
203 * Freeing an object simply involves placing it into the CPU cache; this
204 * pollutes the cache to benefit subsequent allocations. The slab layer
205 * will only be entered if the object is to be purged out of the cache.
206 * During normal operations, this happens only when the CPU layer resizes
207 * its bucket while it's adjusting to the allocation load. Deallocation
208 * paths are different depending on the class of objects:
209 *
210 * a. Rudimentary object:
211 *
212 * { m_free(), m_freem_list(), composite object deallocation }
213 * | ^
214 * | |
215 * | +------ (done) ---------+
216 * v |
217 * mcache_free/mcache_free_ext() |
218 * | |
219 * v |
220 * mbuf_slab_audit() |
221 * | |
222 * v |
223 * [CPU cache] ---> (not purging?) -----+
224 * | |
225 * v |
226 * mbuf_slab_free() |
227 * | |
228 * v |
229 * [freelist] ----------->>------------+
230 * (objects never get purged to VM)
231 *
232 * b. Composite object:
233 *
234 * { m_free(), m_freem_list() }
235 * | ^
236 * | |
237 * | +------ (done) ---------+
238 * v |
239 * mcache_free/mcache_free_ext() |
240 * | |
241 * v |
242 * mbuf_cslab_audit() |
243 * | |
244 * v |
245 * [CPU cache] ---> (not purging?) -----+
246 * | |
247 * v |
248 * mbuf_cslab_free() |
249 * | |
250 * v |
251 * [freelist] ---> (not purging?) -----+
252 * | |
253 * v |
254 * (rudimentary object) |
255 * mcache_free/mcache_free_ext() ------->>------+
256 *
257 * Auditing notes: If auditing is enabled, the audit routine will save
258 * any constructed data structure fields (if necessary) before filling the
259 * contents of the buffers with DEADBEEF (free) pattern and recording the
260 * transaction. Buffers that are freed (whether at CPU or slab layer) are
261 * expected to contain the free pattern.
262 *
263 * DEBUGGING:
264 *
265 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
266 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
267 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
268 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
269 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
270 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
271 *
272 * Each object is associated with exactly one mcache_audit_t structure that
273 * contains the information related to its last buffer transaction. Given
274 * an address of an object, the audit structure can be retrieved by finding
275 * the position of the object relative to the base address of the cluster:
276 *
277 * +------------+ +=============+
278 * | mbuf addr | | mclaudit[i] |
279 * +------------+ +=============+
280 * | | cl_audit[0] |
281 * i = MTOBG(addr) +-------------+
282 * | +-----> | cl_audit[1] | -----> mcache_audit_t
283 * b = BGTOM(i) | +-------------+
284 * | | | ... |
285 * x = MCLIDX(b, addr) | +-------------+
286 * | | | cl_audit[7] |
287 * +-----------------+ +-------------+
288 * (e.g. x == 1)
289 *
290 * The mclaudit[] array is allocated at initialization time, but its contents
291 * get populated when the corresponding cluster is created. Because a page
292 * can be turned into NMBPBG mbufs, we preserve enough space for the mbufs
293 * so that there is a 1-to-1 mapping between them. A page that has never
294 * (or not yet) been turned into mbufs will use only cl_audit[0], with the
295 * remaining entries unused. For a 16KB cluster, only one entry from the first
296 * page is allocated and used for the entire object.
297 */
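/*
 * Editor's illustration (not part of the original source): the lookup in
 * the diagram above amounts to the following, which is essentially what
 * mcl_audit_buf2mca() further below is expected to do for objects backed
 * by a 4KB page:
 *
 *	unsigned int i = MTOBG(addr);		(index of the 4KB cluster)
 *	union mbigcluster *b = BGTOM(i);	(base address of that cluster)
 *	unsigned int x = MCLIDX(b, addr);	(slot of the object within it)
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[x];
 *
 * MTOBG(), BGTOM() and MCLIDX() are defined later in this file.
 */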
298
299 /* TODO: should be in header file */
300 /* kernel translator */
301 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
302 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
303 extern vm_map_t mb_map; /* special map */
304
305 /* Global lock */
306 static lck_mtx_t *mbuf_mlock;
307 static lck_attr_t *mbuf_mlock_attr;
308 static lck_grp_t *mbuf_mlock_grp;
309 static lck_grp_attr_t *mbuf_mlock_grp_attr;
310
311 /* Back-end (common) layer */
312 static void *mbuf_worker_run; /* wait channel for worker thread */
313 static int mbuf_worker_ready; /* worker thread is runnable */
314 static int mbuf_expand_mcl; /* number of cluster creation requests */
315 static int mbuf_expand_big; /* number of big cluster creation requests */
316 static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
317 static int ncpu; /* number of CPUs */
318 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
319 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
320 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
321 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
322 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
323 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
324 static unsigned int mb_normalized; /* number of packets "normalized" */
325
326 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
327 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
328
329 typedef enum {
330 MC_MBUF = 0, /* Regular mbuf */
331 MC_CL, /* Cluster */
332 MC_BIGCL, /* Large (4KB) cluster */
333 MC_16KCL, /* Jumbo (16KB) cluster */
334 MC_MBUF_CL, /* mbuf + cluster */
335 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
336 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
337 } mbuf_class_t;
338
339 #define MBUF_CLASS_MIN MC_MBUF
340 #define MBUF_CLASS_MAX MC_MBUF_16KCL
341 #define MBUF_CLASS_LAST MC_16KCL
342 #define MBUF_CLASS_VALID(c) \
343 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
344 #define MBUF_CLASS_COMPOSITE(c) \
345 ((int)(c) > MBUF_CLASS_LAST)
346
347
348 /*
349 * mbuf specific mcache allocation request flags.
350 */
351 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
352
353 /*
354 * Per-cluster slab structure.
355 *
356 * A slab is a cluster control structure that contains one or more object
357 * chunks; the available chunks are chained in the slab's freelist (sl_head).
358 * Each time a chunk is taken out of the slab, the slab's reference count
359 * gets incremented. When all chunks have been taken out, the empty slab
360 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
361 * returned to a slab causes the slab's reference count to be decremented;
362 * it also causes the slab to be reinserted into the class's slab list, if
363 * it is not already there.
364 *
365 * Compartmentalizing the object chunks into slabs allows us to easily
366 * merge one or more slabs together when the adjacent slabs are idle, as
367 * well as to convert or move a slab from one class to another; e.g. the
368 * mbuf cluster slab can be converted to a regular cluster slab when all
369 * mbufs in the slab have been freed.
370 *
371 * A slab may also span multiple clusters for chunks larger than
372 * a cluster's size. In this case, only the slab of the first cluster is
373 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
374 * that they are part of the larger slab.
375 *
376 * Each slab controls a page of memory.
377 */
378 typedef struct mcl_slab {
379 struct mcl_slab *sl_next; /* neighboring slab */
380 u_int8_t sl_class; /* controlling mbuf class */
381 int8_t sl_refcnt; /* outstanding allocations */
382 int8_t sl_chunks; /* chunks (bufs) in this slab */
383 u_int16_t sl_flags; /* slab flags (see below) */
384 u_int16_t sl_len; /* slab length */
385 void *sl_base; /* base of allocated memory */
386 void *sl_head; /* first free buffer */
387 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
388 } mcl_slab_t;
389
390 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
391 #define SLF_PARTIAL 0x0002 /* part of another slab */
392 #define SLF_DETACHED 0x0004 /* not in slab freelist */
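/*
 * Editor's sketch of the lifecycle described above (illustration only; the
 * authoritative logic is in slab_alloc()/slab_free() below).  For classes
 * whose slabs hold several chunks (mbufs, 2KB clusters), taking a chunk
 * out looks roughly like:
 *
 *	buf = sp->sl_head;
 *	sp->sl_head = buf->obj_next;	(advance the slab's freelist)
 *	sp->sl_refcnt++;		(one more outstanding chunk)
 *	if (sp->sl_head == NULL)
 *		slab_remove(sp, class);	(empty slab leaves the class list)
 *
 * Returning a chunk is the mirror image: the chunk is rechained onto
 * sl_head, sl_refcnt is decremented, and a detached slab is reinserted
 * into the class's slab list.
 */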
393
394 /*
395 * The array of slabs is broken into groups of arrays, one per 1MB of kernel
396 * memory to reduce the footprint. Each group is allocated on demand
397 * whenever a new piece of memory mapped in from the VM crosses the 1MB
398 * boundary.
399 */
400 #define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
401
402 typedef struct mcl_slabg {
403 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
404 } mcl_slabg_t;
405
406 /*
407 * Number of slabs needed to control a 16KB cluster object.
408 */
409 #define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
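/*
 * Editor's note: assuming 4KB pages (PGSHIFT == 12) and 16KB jumbo
 * clusters, NSLABSP16KB evaluates to 16384 >> 12 == 4, i.e. one 16KB
 * cluster is described by four consecutive page slabs.
 */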
410
411 /*
412 * Per-cluster audit structure.
413 */
414 typedef struct {
415 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
416 } mcl_audit_t;
417
418 /*
419 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
420 * and m_ext structures. If auditing is enabled, we allocate a shadow
421 * mbuf structure of this size inside each audit structure, and the
422 * contents of the real mbuf get copied into it when the mbuf is freed.
423 * This allows us to pattern-fill the mbuf for integrity check, and to
424 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
425 * Note that we don't save the contents of clusters when they are freed;
426 * we simply pattern-fill them.
427 */
428 #define AUDIT_CONTENTS_SIZE ((MSIZE - MHLEN) + sizeof (_m_ext_t))
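/*
 * Editor's note: assuming the usual definition of MHLEN in <sys/mbuf.h>
 * (MSIZE minus the m_hdr and pkthdr overhead), (MSIZE - MHLEN) works out
 * to sizeof (struct m_hdr) + sizeof (struct pkthdr), so the shadow area
 * covers exactly the m_hdr + pkthdr + m_ext portion described above.
 */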
429
430 /*
431 * mbuf specific mcache audit flags
432 */
433 #define MB_INUSE 0x01 /* object has not been returned to slab */
434 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
435 #define MB_SCVALID 0x04 /* object has valid saved contents */
436
437 /*
438 * Each of the following two arrays hold up to nmbclusters elements.
439 */
440 static mcl_audit_t *mclaudit; /* array of cluster audit information */
441 static unsigned int maxclaudit; /* max # of entries in audit table */
442 static mcl_slabg_t **slabstbl; /* cluster slabs table */
443 static unsigned int maxslabgrp; /* max # of entries in slabs table */
444 static unsigned int slabgrp; /* # of entries in slabs table */
445
446 /* Globals */
447 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
448 int njcl; /* # of clusters for jumbo sizes */
449 int njclbytes; /* size of a jumbo cluster */
450 union mbigcluster *mbutl; /* first mapped cluster address */
451 union mbigcluster *embutl; /* ending virtual address of mclusters */
452 int max_linkhdr; /* largest link-level header */
453 int max_protohdr; /* largest protocol header */
454 int max_hdr; /* largest link+protocol header */
455 int max_datalen; /* MHLEN - max_hdr */
456
457 static boolean_t mclverify; /* debug: pattern-checking */
458 static boolean_t mcltrace; /* debug: stack tracing */
459 static boolean_t mclfindleak; /* debug: leak detection */
460
461 /* mbuf leak detection variables */
462 static struct mleak_table mleak_table;
463 static mleak_stat_t *mleak_stat;
464
465 #define MLEAK_STAT_SIZE(n) \
466 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
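/*
 * Editor's note: this is the usual "address of the n-th trailing array
 * element in a NULL-based struct" idiom; it yields the number of bytes
 * needed for an mleak_stat_t carrying n ml_trace entries.  For example,
 * the sysctl handler below exports MLEAK_STAT_SIZE(MLEAK_NUM_TRACES)
 * bytes starting at mleak_stat.  MB_STAT_SIZE(), OMB_STAT_SIZE() and
 * MBUF_MTYPES_SIZE() further below use the same pattern.
 */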
467
468 struct mallocation {
469 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
470 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
471 u_int32_t count; /* How many objects were requested */
472 u_int64_t hitcount; /* for determining hash effectiveness */
473 };
474
475 struct mtrace {
476 u_int64_t collisions;
477 u_int64_t hitcount;
478 u_int64_t allocs;
479 u_int64_t depth;
480 uintptr_t addr[MLEAK_STACK_DEPTH];
481 };
482
483 /* Size must be a power of two for the zhash to be able to just mask off bits */
484 #define MLEAK_ALLOCATION_MAP_NUM 512
485 #define MLEAK_TRACE_MAP_NUM 256
486
487 /*
488 * Sample factor for how often to record a trace. This can be overridden
489 * via the boot-arg mleak_sample_factor.
490 */
491 #define MLEAK_SAMPLE_FACTOR 500
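/*
 * Editor's sketch (illustration only, with hypothetical local names; the
 * real logic lives in mleak_logger()/mleak_log()): because the table sizes
 * above are powers of two, a bucket can be picked with a simple mask, and
 * only every MLEAK_SAMPLE_FACTOR-th allocation needs to be recorded:
 *
 *	if ((nallocs++ % sample_factor) != 0)
 *		return;				(not sampled this time)
 *	i = hash & (mleak_alloc_buckets - 1);	(power-of-two mask)
 *	... file the allocation under mleak_allocations[i] ...
 */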
492
493 /*
494 * Number of top leakers recorded.
495 */
496 #define MLEAK_NUM_TRACES 5
497
498 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
499 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
500
501 /* Hashmaps of allocations and their corresponding traces */
502 static struct mallocation *mleak_allocations;
503 static struct mtrace *mleak_traces;
504 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
505
506 /* Lock to protect mleak tables from concurrent modification */
507 static lck_mtx_t *mleak_lock;
508 static lck_attr_t *mleak_lock_attr;
509 static lck_grp_t *mleak_lock_grp;
510 static lck_grp_attr_t *mleak_lock_grp_attr;
511
512 extern u_int32_t high_sb_max;
513
514 /* TODO: should be in header file */
515 int do_reclaim = 0;
516
517 /* The minimum number of objects that are allocated, to start. */
518 #define MINCL 32
519 #define MINBIGCL (MINCL >> 1)
520 #define MIN16KCL (MINCL >> 2)
521
522 /* Low watermarks (only map in pages once free counts go below) */
523 #define MBIGCL_LOWAT MINBIGCL
524 #define M16KCL_LOWAT MIN16KCL
525
526 typedef struct {
527 mbuf_class_t mtbl_class; /* class type */
528 mcache_t *mtbl_cache; /* mcache for this buffer class */
529 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
530 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
531 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
532 u_int32_t mtbl_maxsize; /* maximum buffer size */
533 int mtbl_minlimit; /* minimum allowed */
534 int mtbl_maxlimit; /* maximum allowed */
535 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
536 } mbuf_table_t;
537
538 #define m_class(c) mbuf_table[c].mtbl_class
539 #define m_cache(c) mbuf_table[c].mtbl_cache
540 #define m_slablist(c) mbuf_table[c].mtbl_slablist
541 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
542 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
543 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
544 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
545 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
546 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
547 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
548 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
549 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
550 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
551 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
552 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
553 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
554 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
555 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
556 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
557 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
558
559 static mbuf_table_t mbuf_table[] = {
560 /*
561 * The caches for mbufs, regular clusters and big clusters.
562 */
563 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
564 NULL, NULL, 0, 0, 0, 0 },
565 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
566 NULL, NULL, 0, 0, 0, 0 },
567 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
568 NULL, NULL, 0, 0, 0, 0 },
569 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
570 NULL, NULL, 0, 0, 0, 0 },
571 /*
572 * The following are special caches; they serve as intermediate
573 * caches backed by the above rudimentary caches. Each object
574 * in the cache is an mbuf with a cluster attached to it. Unlike
575 * the above caches, these intermediate caches do not directly
576 * deal with the slab structures; instead, the constructed
577 * cached elements are simply stored in the freelists.
578 */
579 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
580 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
581 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
582 };
583
584 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
585
586 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
587 static int mb_waiters; /* number of waiters */
588
589 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
590 static struct timeval mb_wdtstart; /* watchdog start timestamp */
591 static char mbuf_dump_buf[256];
592
593 /*
594 * The mbuf watchdog is enabled by default on embedded platforms. It is
595 * also toggleable via the kern.ipc.mb_watchdog sysctl.
596 */
597 #if CONFIG_EMBEDDED
598 static unsigned int mb_watchdog = 1;
599 #else
600 static unsigned int mb_watchdog = 0;
601 #endif /* CONFIG_EMBEDDED */
602
603 /* The following are used to serialize m_clalloc() */
604 static boolean_t mb_clalloc_busy;
605 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
606 static int mb_clalloc_waiters;
607
608 static void mbuf_mtypes_sync(boolean_t);
609 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
610 static void mbuf_stat_sync(void);
611 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
612 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
613 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
614 static char *mbuf_dump(void);
615 static void mbuf_table_init(void);
616 static inline void m_incref(struct mbuf *);
617 static inline u_int32_t m_decref(struct mbuf *);
618 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
619 static void mbuf_worker_thread_init(void);
620 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
621 static void slab_free(mbuf_class_t, mcache_obj_t *);
622 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
623 unsigned int, int);
624 static void mbuf_slab_free(void *, mcache_obj_t *, int);
625 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
626 static void mbuf_slab_notify(void *, u_int32_t);
627 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
628 unsigned int);
629 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
630 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
631 unsigned int, int);
632 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
633 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
634 static int freelist_populate(mbuf_class_t, unsigned int, int);
635 static void freelist_init(mbuf_class_t);
636 static boolean_t mbuf_cached_above(mbuf_class_t, int);
637 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
638 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
639 static int m_howmany(int, size_t);
640 static void mbuf_worker_thread(void);
641 static void mbuf_watchdog(void);
642 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
643
644 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
645 size_t, unsigned int);
646 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
647 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
648 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
649 boolean_t);
650 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
651 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
652 static void mcl_audit_mcheck_panic(struct mbuf *);
653 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
654
655 static void mleak_activate(void);
656 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
657 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
658 static void mleak_free(mcache_obj_t *);
659
660 static mcl_slab_t *slab_get(void *);
661 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
662 void *, void *, unsigned int, int, int);
663 static void slab_insert(mcl_slab_t *, mbuf_class_t);
664 static void slab_remove(mcl_slab_t *, mbuf_class_t);
665 static boolean_t slab_inrange(mcl_slab_t *, void *);
666 static void slab_nextptr_panic(mcl_slab_t *, void *);
667 static void slab_detach(mcl_slab_t *);
668 static boolean_t slab_is_detached(mcl_slab_t *);
669
670 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
671 static struct mbuf *m_split0(struct mbuf *, int, int, int);
672
673 /* flags for m_copyback0 */
674 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
675 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
676 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
677 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
678
679 /*
680 * This flag is set for all mbufs that come out of and into the composite
681 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
682 * are marked with such a flag have clusters attached to them, and will be
683 * treated differently when they are freed; instead of being placed back
684 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
685 * are placed back into the appropriate composite cache's freelist, and the
686 * actual freeing is deferred until the composite objects are purged. At
687 * such a time, this flag will be cleared from the mbufs and the objects
688 * will be freed into their own separate freelists.
689 */
690 #define EXTF_COMPOSITE 0x1
691
692 /*
693 * This flag indicates that the external cluster is read-only, i.e. it is
694 * or was referred to by more than one mbuf. Once set, this flag is never
695 * cleared.
696 */
697 #define EXTF_READONLY 0x2
698 #define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
699
700 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
701 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
702 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
703 #define MBUF_IS_COMPOSITE(m) \
704 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
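/*
 * Editor's sketch of the free path implied by the comments above
 * (illustration only; the authoritative logic is in m_free() and the
 * cslab routines):
 *
 *	if (m_decref(m) == 0) {
 *		if (MBUF_IS_COMPOSITE(m))
 *			... mbuf and cluster go back, still attached, to
 *			    their composite cache's freelist ...
 *		else
 *			... mbuf and cluster are freed separately into
 *			    their own caches ...
 *	}
 */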
705
706 /*
707 * Macros used to verify the integrity of the mbuf.
708 */
709 #define _MCHECK(m) { \
710 if ((m)->m_type != MT_FREE) { \
711 if (mclaudit == NULL) \
712 panic("MCHECK: m_type=%d m=%p", \
713 (u_int16_t)(m)->m_type, m); \
714 else \
715 mcl_audit_mcheck_panic(m); \
716 } \
717 }
718
719 #define MBUF_IN_MAP(addr) \
720 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
721
722 #define MRANGE(addr) { \
723 if (!MBUF_IN_MAP(addr)) \
724 panic("MRANGE: address out of range 0x%p", addr); \
725 }
726
727 /*
728 * Macro version of mtod.
729 */
730 #define MTOD(m, t) ((t)((m)->m_data))
731
732 /*
733 * Macros to obtain (4KB) cluster index and base cluster address.
734 */
735
736 #define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
737 #define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
738
739 /*
740 * Macro to find the mbuf index relative to a base.
741 */
742 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
743
744 /*
745 * Same thing for 2KB cluster index.
746 */
747 #define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
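/*
 * Editor's worked example (assumes MSIZE == 256, MCLBYTES == 2048 and 4KB
 * big clusters): for an mbuf sitting 1536 bytes into its 4KB cluster,
 * MCLIDX() yields 1536 >> MSIZESHIFT == 6 (the 7th of the NMBPBG mbuf
 * slots), while CLBGIDX() yields 1536 >> MCLSHIFT == 0 (the first of the
 * two 2KB halves).
 */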
748
749 /*
750 * Macros used during mbuf and cluster initialization.
751 */
752 #define MBUF_INIT(m, pkthdr, type) { \
753 _MCHECK(m); \
754 (m)->m_next = (m)->m_nextpkt = NULL; \
755 (m)->m_len = 0; \
756 (m)->m_type = type; \
757 if ((pkthdr) == 0) { \
758 (m)->m_data = (m)->m_dat; \
759 (m)->m_flags = 0; \
760 } else { \
761 (m)->m_data = (m)->m_pktdat; \
762 (m)->m_flags = M_PKTHDR; \
763 (m)->m_pkthdr.rcvif = NULL; \
764 (m)->m_pkthdr.len = 0; \
765 (m)->m_pkthdr.header = NULL; \
766 (m)->m_pkthdr.csum_flags = 0; \
767 (m)->m_pkthdr.csum_data = 0; \
768 (m)->m_pkthdr.tso_segsz = 0; \
769 (m)->m_pkthdr.vlan_tag = 0; \
770 (m)->m_pkthdr.socket_id = 0; \
771 (m)->m_pkthdr.vt_nrecs = 0; \
772 m_tag_init(m); \
773 m_prio_init(m); \
774 } \
775 }
776
777 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
778 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
779 (m)->m_flags |= M_EXT; \
780 (m)->m_ext.ext_size = (size); \
781 (m)->m_ext.ext_free = (free); \
782 (m)->m_ext.ext_arg = (arg); \
783 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
784 &(m)->m_ext.ext_refs; \
785 MEXT_RFA(m) = (rfa); \
786 MEXT_REF(m) = (ref); \
787 MEXT_FLAGS(m) = (flag); \
788 }
789
790 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
791 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
792
793 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
794 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
795
796 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
797 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
798
799 /*
800 * Macro to convert BSD malloc sleep flag to mcache's
801 */
802 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
803
804 /*
805 * The structure that holds all mbuf class statistics exportable via sysctl.
806 * Similar to mbstat structure, the mb_stat structure is protected by the
807 * global mbuf lock. It contains additional information about the classes
808 * that allows for a more accurate view of the state of the allocator.
809 */
810 struct mb_stat *mb_stat;
811 struct omb_stat *omb_stat; /* For backwards compatibility */
812
813 #define MB_STAT_SIZE(n) \
814 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
815 #define OMB_STAT_SIZE(n) \
816 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
817
818 /*
819 * The legacy structure holding all of the mbuf allocation statistics.
820 * The actual statistics used by the kernel are stored in the mbuf_table
821 * instead, and are updated atomically while the global mbuf lock is held.
822 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
823 * Unlike before, the kernel no longer relies on the contents of mbstat for
824 * its operations (e.g. cluster expansion) because the structure is exposed
825 * to outside and could possibly be modified, therefore making it unsafe.
826 * With the exception of the mbstat.m_mtypes array (see below), all of the
827 * statistics are updated as they change.
828 */
829 struct mbstat mbstat;
830
831 #define MBSTAT_MTYPES_MAX \
832 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
833
834 /*
835 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
836 * atomically and stored in a per-CPU structure which is lock-free; this is
837 * done in order to avoid writing to the global mbstat data structure which
838 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
839 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
840 * array and returned to the application. Any updates for types greater than
841 * or equal to MT_MAX are done atomically on mbstat itself; this slows down
842 * performance but is okay since the kernel uses only up to MT_MAX-1 while
843 * anything beyond that (up to type 255) is considered a corner case.
844 */
845 typedef struct {
846 unsigned int cpu_mtypes[MT_MAX];
847 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
848
849 typedef struct {
850 mtypes_cpu_t mbs_cpu[1];
851 } mbuf_mtypes_t;
852
853 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
854
855 #define MBUF_MTYPES_SIZE(n) \
856 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
857
858 #define MTYPES_CPU(p) \
859 ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
860
861 #define mtype_stat_add(type, n) { \
862 if ((unsigned)(type) < MT_MAX) { \
863 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
864 atomic_add_32(&mbs->cpu_mtypes[type], n); \
865 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
866 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
867 } \
868 }
869
870 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
871 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
872 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
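/*
 * Editor's note, usage illustration only: an allocation path typically
 * bumps the per-CPU counter for its mbuf type and the free path undoes
 * it, e.g. roughly:
 *
 *	mtype_stat_inc(MT_DATA);	(when handing out an MT_DATA mbuf)
 *	...
 *	mtype_stat_dec(MT_DATA);	(in m_free(), before marking MT_FREE)
 *	mtype_stat_inc(MT_FREE);
 */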
873
874 static void
875 mbuf_mtypes_sync(boolean_t locked)
876 {
877 int m, n;
878 mtypes_cpu_t mtc;
879
880 if (locked)
881 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
882
883 bzero(&mtc, sizeof (mtc));
884 for (m = 0; m < ncpu; m++) {
885 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
886 mtypes_cpu_t temp;
887
888 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
889 sizeof (temp.cpu_mtypes));
890
891 for (n = 0; n < MT_MAX; n++)
892 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
893 }
894 if (!locked)
895 lck_mtx_lock(mbuf_mlock);
896 for (n = 0; n < MT_MAX; n++)
897 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
898 if (!locked)
899 lck_mtx_unlock(mbuf_mlock);
900 }
901
902 static int
903 mbstat_sysctl SYSCTL_HANDLER_ARGS
904 {
905 #pragma unused(oidp, arg1, arg2)
906 mbuf_mtypes_sync(FALSE);
907
908 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
909 }
910
911 static void
912 mbuf_stat_sync(void)
913 {
914 mb_class_stat_t *sp;
915 mcache_cpu_t *ccp;
916 mcache_t *cp;
917 int k, m, bktsize;
918
919 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
920
921 for (k = 0; k < NELEM(mbuf_table); k++) {
922 cp = m_cache(k);
923 ccp = &cp->mc_cpu[0];
924 bktsize = ccp->cc_bktsize;
925 sp = mbuf_table[k].mtbl_stats;
926
927 if (cp->mc_flags & MCF_NOCPUCACHE)
928 sp->mbcl_mc_state = MCS_DISABLED;
929 else if (cp->mc_purge_cnt > 0)
930 sp->mbcl_mc_state = MCS_PURGING;
931 else if (bktsize == 0)
932 sp->mbcl_mc_state = MCS_OFFLINE;
933 else
934 sp->mbcl_mc_state = MCS_ONLINE;
935
936 sp->mbcl_mc_cached = 0;
937 for (m = 0; m < ncpu; m++) {
938 ccp = &cp->mc_cpu[m];
939 if (ccp->cc_objs > 0)
940 sp->mbcl_mc_cached += ccp->cc_objs;
941 if (ccp->cc_pobjs > 0)
942 sp->mbcl_mc_cached += ccp->cc_pobjs;
943 }
944 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
945 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
946 sp->mbcl_infree;
947
948 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
949 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
950 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
951
952 /* Calculate total count specific to each class */
953 sp->mbcl_ctotal = sp->mbcl_total;
954 switch (m_class(k)) {
955 case MC_MBUF:
956 /* Deduct mbufs used in composite caches */
957 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
958 m_total(MC_MBUF_BIGCL));
959 break;
960
961 case MC_CL:
962 /* Deduct clusters used in composite cache */
963 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
964 break;
965
966 case MC_BIGCL:
967 /* Deduct clusters used in composite cache */
968 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
969 break;
970
971 case MC_16KCL:
972 /* Deduct clusters used in composite cache */
973 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
974 break;
975
976 default:
977 break;
978 }
979 }
980 }
981
982 static int
983 mb_stat_sysctl SYSCTL_HANDLER_ARGS
984 {
985 #pragma unused(oidp, arg1, arg2)
986 void *statp;
987 int k, statsz, proc64 = proc_is64bit(req->p);
988
989 lck_mtx_lock(mbuf_mlock);
990 mbuf_stat_sync();
991
992 if (!proc64) {
993 struct omb_class_stat *oc;
994 struct mb_class_stat *c;
995
996 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
997 oc = &omb_stat->mbs_class[0];
998 c = &mb_stat->mbs_class[0];
999 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1000 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1001 "%s", c->mbcl_cname);
1002 oc->mbcl_size = c->mbcl_size;
1003 oc->mbcl_total = c->mbcl_total;
1004 oc->mbcl_active = c->mbcl_active;
1005 oc->mbcl_infree = c->mbcl_infree;
1006 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1007 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1008 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1009 oc->mbcl_notified = c->mbcl_notified;
1010 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1011 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1012 oc->mbcl_ctotal = c->mbcl_ctotal;
1013 oc->mbcl_mc_state = c->mbcl_mc_state;
1014 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1015 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1016 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1017 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1018 }
1019 statp = omb_stat;
1020 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1021 } else {
1022 statp = mb_stat;
1023 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1024 }
1025
1026 lck_mtx_unlock(mbuf_mlock);
1027
1028 return (SYSCTL_OUT(req, statp, statsz));
1029 }
1030
1031 static int
1032 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1033 {
1034 #pragma unused(oidp, arg1, arg2)
1035 mleak_trace_stat_t *mltr;
1036 int i;
1037
1038 /* Ensure leak tracing turned on */
1039 if (!mclfindleak)
1040 return (ENXIO);
1041
1042 VERIFY(mleak_stat != NULL);
1043 #ifdef __LP64__
1044 VERIFY(mleak_stat->ml_isaddr64);
1045 #else
1046 VERIFY(!mleak_stat->ml_isaddr64);
1047 #endif /* !__LP64__ */
1048 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
1049
1050 lck_mtx_lock(mleak_lock);
1051 mltr = &mleak_stat->ml_trace[0];
1052 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
1053 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
1054 int j;
1055
1056 if (mleak_top_trace[i] == NULL ||
1057 mleak_top_trace[i]->allocs == 0)
1058 continue;
1059
1060 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
1061 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
1062 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
1063 mltr->mltr_depth = mleak_top_trace[i]->depth;
1064
1065 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
1066 for (j = 0; j < mltr->mltr_depth; j++)
1067 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
1068
1069 mltr++;
1070 }
1071 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1072 lck_mtx_unlock(mleak_lock);
1073
1074 return (i);
1075 }
1076
1077 static int
1078 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1079 {
1080 #pragma unused(oidp, arg1, arg2)
1081 int i = 0;
1082
1083 /* Ensure leak tracing turned on */
1084 if (!mclfindleak)
1085 return (ENXIO);
1086
1087 lck_mtx_lock(mleak_lock);
1088 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1089 lck_mtx_unlock(mleak_lock);
1090
1091 return (i);
1092 }
1093
1094 static inline void
1095 m_incref(struct mbuf *m)
1096 {
1097 UInt32 old, new;
1098 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1099
1100 do {
1101 old = *addr;
1102 new = old + 1;
1103 ASSERT(new != 0);
1104 } while (!OSCompareAndSwap(old, new, addr));
1105
1106 /*
1107 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1108 * we don't clear the flag when the refcount goes back to 1
1109 * to simplify code calling m_mclhasreference().
1110 */
1111 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1112 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1113 }
1114
1115 static inline u_int32_t
1116 m_decref(struct mbuf *m)
1117 {
1118 UInt32 old, new;
1119 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1120
1121 do {
1122 old = *addr;
1123 new = old - 1;
1124 ASSERT(old != 0);
1125 } while (!OSCompareAndSwap(old, new, addr));
1126
1127 return (new);
1128 }
1129
1130 static void
1131 mbuf_table_init(void)
1132 {
1133 unsigned int b, c, s;
1134 int m;
1135
1136 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1137 M_TEMP, M_WAITOK | M_ZERO);
1138 VERIFY(omb_stat != NULL);
1139
1140 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1141 M_TEMP, M_WAITOK | M_ZERO);
1142 VERIFY(mb_stat != NULL);
1143
1144 mb_stat->mbs_cnt = NELEM(mbuf_table);
1145 for (m = 0; m < NELEM(mbuf_table); m++)
1146 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1147
1148 #if CONFIG_MBUF_JUMBO
1149 /*
1150 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1151 * this only on platforms where the jumbo cluster pool is enabled.
1152 */
1153 njcl = nmbclusters / 3;
1154 njclbytes = M16KCLBYTES;
1155 #endif /* CONFIG_MBUF_JUMBO */
1156
1157 /*
1158 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1159 * a multiple of 4KB clusters.
1160 */
1161 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1162 if (njcl > 0) {
1163 /*
1164 * Each jumbo cluster takes 8 2KB clusters, so make
1165 * sure that the pool size is evenly divisible by 8;
1166 * njcl is in 2KB units, hence treated as such.
1167 */
1168 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1169
1170 /* Update nclusters with rounded down value of njcl */
1171 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1172 }
1173
1174 /*
1175 * njcl is valid only on platforms with 16KB jumbo clusters, where
1176 * it is configured to 1/3 of the pool size. On these platforms,
1177 * the remainder is used for 2KB and 4KB clusters. On platforms
1178 * without 16KB jumbo clusters, the entire pool is used for both
1179 * 2KB and 4KB clusters. A 4KB cluster can either be split into
1180 * 16 mbufs, or into 2 2KB clusters.
1181 *
1182 * +---+---+------------ ... -----------+------- ... -------+
1183 * | c | b | s | njcl |
1184 * +---+---+------------ ... -----------+------- ... -------+
1185 *
1186 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1187 * clusters (1/64th each.)
1188 */
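/*
 * Editor's worked example (illustration only; assumes nmbclusters == 32768,
 * i.e. a 64MB pool, no jumbo pool and NCLPBG == 2): c becomes 512 2KB
 * clusters (1MB), b becomes 256 4KB clusters (1MB), and s becomes
 * 32768 - (512 + 512) == 31744 2KB-cluster equivalents (62MB), matching
 * the 1/64 + 1/64 + 31/32 split described above.
 */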
1189 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1190 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1191 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
1192
1193 /*
1194 * 1/64th (c) is reserved for 2KB clusters.
1195 */
1196 m_minlimit(MC_CL) = c;
1197 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1198 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1199 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1200
1201 /*
1202 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1203 * It cannot be turned into 2KB clusters or mbufs.
1204 */
1205 m_minlimit(MC_BIGCL) = b;
1206 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1207 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1208 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1209
1210 /*
1211 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
1212 */
1213 m_minlimit(MC_MBUF) = 0;
1214 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1215 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1216 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1217
1218 /*
1219 * Set limits for the composite classes.
1220 */
1221 m_minlimit(MC_MBUF_CL) = 0;
1222 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1223 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1224 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1225 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1226
1227 m_minlimit(MC_MBUF_BIGCL) = 0;
1228 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1229 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1230 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1231 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1232
1233 /*
1234 * And for jumbo classes.
1235 */
1236 m_minlimit(MC_16KCL) = 0;
1237 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1238 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1239 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1240
1241 m_minlimit(MC_MBUF_16KCL) = 0;
1242 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1243 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1244 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1245 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1246
1247 /*
1248 * Initialize the legacy mbstat structure.
1249 */
1250 bzero(&mbstat, sizeof (mbstat));
1251 mbstat.m_msize = m_maxsize(MC_MBUF);
1252 mbstat.m_mclbytes = m_maxsize(MC_CL);
1253 mbstat.m_minclsize = MINCLSIZE;
1254 mbstat.m_mlen = MLEN;
1255 mbstat.m_mhlen = MHLEN;
1256 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1257 }
1258
1259 #if defined(__LP64__)
1260 typedef struct ncl_tbl {
1261 uint64_t nt_maxmem; /* memory (sane) size */
1262 uint32_t nt_mbpool; /* mbuf pool size */
1263 } ncl_tbl_t;
1264
1265 /* Non-server */
1266 static ncl_tbl_t ncl_table[] = {
1267 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1268 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1269 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1270 { 0, 0 }
1271 };
1272
1273 /* Server */
1274 static ncl_tbl_t ncl_table_srv[] = {
1275 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1276 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1277 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1278 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1279 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1280 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1281 { 0, 0 }
1282 };
1283 #endif /* __LP64__ */
1284
1285 __private_extern__ unsigned int
1286 mbuf_default_ncl(int server, uint64_t mem)
1287 {
1288 #if !defined(__LP64__)
1289 #pragma unused(server)
1290 unsigned int n;
1291 /*
1292 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1293 */
1294 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1295 n = 32768;
1296 #else
1297 unsigned int n, i;
1298 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1299 /*
1300 * 64-bit kernel (mbuf pool size based on table).
1301 */
1302 n = tbl[0].nt_mbpool;
1303 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1304 if (mem < tbl[i].nt_maxmem)
1305 break;
1306 n = tbl[i].nt_mbpool;
1307 }
1308 n >>= MCLSHIFT;
1309 #endif /* !__LP64__ */
1310 return (n);
1311 }
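/*
 * Editor's worked example (illustration only): on a 64-bit non-server
 * configuration with 12GB of memory, the table walk above stops at the
 * 16GB row, leaving n at the 8GB row's 96MB; shifting right by MCLSHIFT
 * (2KB clusters) then returns 96MB / 2KB == 49152 clusters.
 */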
1312
1313 __private_extern__ void
1314 mbinit(void)
1315 {
1316 unsigned int m;
1317 unsigned int initmcl = 0;
1318 void *buf;
1319 thread_t thread = THREAD_NULL;
1320
1321 if (nmbclusters == 0)
1322 nmbclusters = NMBCLUSTERS;
1323
1324 /* This should be a sane (at least even) value by now */
1325 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1326
1327 /* Setup the mbuf table */
1328 mbuf_table_init();
1329
1330 /* Global lock for common layer */
1331 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1332 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1333 mbuf_mlock_attr = lck_attr_alloc_init();
1334 mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);
1335
1336 /*
1337 * Allocate cluster slabs table:
1338 *
1339 * maxslabgrp = (N * 2048) / (1024 * 1024)
1340 *
1341 * Where N is nmbclusters rounded up to the nearest 512. This yields
1342 * mcl_slabg_t units, each one representing 1MB of memory.
1343 */
1344 maxslabgrp =
1345 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1346 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1347 M_TEMP, M_WAITOK | M_ZERO);
1348 VERIFY(slabstbl != NULL);
1349
1350 /*
1351 * Allocate audit structures, if needed:
1352 *
1353 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1354 *
1355 * This yields mcl_audit_t units, each one representing a page.
1356 */
1357 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1358 mbuf_debug |= mcache_getflags();
1359 if (mbuf_debug & MCF_DEBUG) {
1360 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1361 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1362 M_TEMP, M_WAITOK | M_ZERO);
1363 VERIFY(mclaudit != NULL);
1364
1365 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1366 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1367 VERIFY(mcl_audit_con_cache != NULL);
1368 }
1369 mclverify = (mbuf_debug & MCF_VERIFY);
1370 mcltrace = (mbuf_debug & MCF_TRACE);
1371 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1372
1373 /* Enable mbuf leak logging, with a lock to protect the tables */
1374
1375 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1376 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1377 mleak_lock_attr = lck_attr_alloc_init();
1378 mleak_lock = lck_mtx_alloc_init(mleak_lock_grp, mleak_lock_attr);
1379
1380 mleak_activate();
1381
1382 /* Calculate the number of pages assigned to the cluster pool */
1383 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1384 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1385 M_TEMP, M_WAITOK);
1386 VERIFY(mcl_paddr != NULL);
1387
1388 /* Register with the I/O Bus mapper */
1389 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1390 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1391
1392 embutl = (union mbigcluster *)
1393 ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));
1394 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1395
1396 /* Prime up the freelist */
1397 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1398 if (initmcl != 0) {
1399 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1400 if (initmcl > m_maxlimit(MC_BIGCL))
1401 initmcl = m_maxlimit(MC_BIGCL);
1402 }
1403 if (initmcl < m_minlimit(MC_BIGCL))
1404 initmcl = m_minlimit(MC_BIGCL);
1405
1406 lck_mtx_lock(mbuf_mlock);
1407
1408 /*
1409 * For classes with non-zero minimum limits, populate their freelists
1410 * so that m_total(class) is at least m_minlimit(class).
1411 */
1412 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1413 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1414 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1415 freelist_init(m_class(MC_CL));
1416
1417 for (m = 0; m < NELEM(mbuf_table); m++) {
1418 /* Make sure we didn't miss any */
1419 VERIFY(m_minlimit(m_class(m)) == 0 ||
1420 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1421 }
1422
1423 lck_mtx_unlock(mbuf_mlock);
1424
1425 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1426 NULL, &thread);
1427 thread_deallocate(thread);
1428
1429 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1430 0, 0, MCR_SLEEP);
1431
1432 /* Create the cache for each class */
1433 for (m = 0; m < NELEM(mbuf_table); m++) {
1434 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1435 u_int32_t flags;
1436
1437 flags = mbuf_debug;
1438 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1439 m_class(m) == MC_MBUF_16KCL) {
1440 allocfunc = mbuf_cslab_alloc;
1441 freefunc = mbuf_cslab_free;
1442 auditfunc = mbuf_cslab_audit;
1443 logfunc = mleak_logger;
1444 } else {
1445 allocfunc = mbuf_slab_alloc;
1446 freefunc = mbuf_slab_free;
1447 auditfunc = mbuf_slab_audit;
1448 logfunc = mleak_logger;
1449 }
1450
1451 /*
1452 * Disable per-CPU caches for jumbo classes if there
1453 * is no jumbo cluster pool available in the system.
1454 * The cache itself is still created (but will never
1455 * be populated) since it simplifies the code.
1456 */
1457 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1458 njcl == 0)
1459 flags |= MCF_NOCPUCACHE;
1460
1461 if (!mclfindleak)
1462 flags |= MCF_NOLEAKLOG;
1463
1464 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1465 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1466 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1467 }
1468
1469 /*
1470 * Allocate structure for per-CPU statistics that's aligned
1471 * on the CPU cache boundary; this code assumes that we never
1472 * uninitialize this framework, since the original address
1473 * before alignment is not saved.
1474 */
1475 ncpu = ml_get_max_cpus();
1476 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1477 M_TEMP, M_WAITOK);
1478 VERIFY(buf != NULL);
1479
1480 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1481 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1482
1483 /*
1484 * Set the max limit on sb_max to be 1/16th of the size of
1485 * memory allocated for mbuf clusters.
1486 */
1487 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1488 if (high_sb_max < sb_max) {
1489 /* sb_max is too large for this configuration, scale it down */
1490 if (high_sb_max > (1 << MBSHIFT)) {
1491 /* We have at least 16MB of mbuf pool */
1492 sb_max = high_sb_max;
1493 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1494 /*
1495 * If we have more than 1MB of mbuf pool, cap the size of
1496 * the max sock buf at 1MB
1497 */
1498 sb_max = high_sb_max = (1 << MBSHIFT);
1499 } else {
1500 sb_max = high_sb_max;
1501 }
1502 }
1503
1504 printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
1505 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1506 (nclusters << MCLSHIFT) >> MBSHIFT,
1507 (njcl << MCLSHIFT) >> MBSHIFT);
1508 }
1509
1510 /*
1511 * Obtain a slab of object(s) from the class's freelist.
1512 */
1513 static mcache_obj_t *
1514 slab_alloc(mbuf_class_t class, int wait)
1515 {
1516 mcl_slab_t *sp;
1517 mcache_obj_t *buf;
1518
1519 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1520
1521 VERIFY(class != MC_16KCL || njcl > 0);
1522
1523 /* This should always be NULL for us */
1524 VERIFY(m_cobjlist(class) == NULL);
1525
1526 /*
1527 * Treat composite objects as having a longer lifespan by using
1528 * a slab from the reverse direction, in the hope that this could
1529 * reduce the probability of fragmentation for slabs that hold
1530 * more than one buffer chunk (e.g. mbuf slabs). For other
1531 * slabs, this probably doesn't make much of a difference.
1532 */
1533 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1534 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1535 else
1536 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1537
1538 if (sp == NULL) {
1539 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1540 /* The slab list for this class is empty */
1541 return (NULL);
1542 }
1543
1544 VERIFY(m_infree(class) > 0);
1545 VERIFY(!slab_is_detached(sp));
1546 VERIFY(sp->sl_class == class &&
1547 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1548 buf = sp->sl_head;
1549 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1550
1551 if (class == MC_MBUF) {
1552 sp->sl_head = buf->obj_next;
1553 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1554 } else if (class == MC_CL) {
1555 sp->sl_head = buf->obj_next;
1556 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1557 } else {
1558 sp->sl_head = NULL;
1559 }
1560 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1561 slab_nextptr_panic(sp, sp->sl_head);
1562 /* In case sl_head is in the map but not in the slab */
1563 VERIFY(slab_inrange(sp, sp->sl_head));
1564 /* NOTREACHED */
1565 }
1566
1567 /* Increment slab reference */
1568 sp->sl_refcnt++;
1569
1570 if (mclaudit != NULL) {
1571 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1572 mca->mca_uflags = 0;
1573 /* Save contents on mbuf objects only */
1574 if (class == MC_MBUF)
1575 mca->mca_uflags |= MB_SCVALID;
1576 }
1577
1578 if (class == MC_CL) {
1579 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1580 /*
1581 * A 2K cluster slab can have at most NCLPBG references.
1582 */
1583 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1584 sp->sl_chunks == NCLPBG &&
1585 sp->sl_len == m_maxsize(MC_BIGCL));
1586 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1587 } else if (class == MC_BIGCL) {
1588 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1589 m_infree(MC_MBUF_BIGCL);
1590 /*
1591 * A 4K cluster slab can have at most 1 reference.
1592 */
1593 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1594 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1595 } else if (class == MC_16KCL) {
1596 mcl_slab_t *nsp;
1597 int k;
1598
1599 --m_infree(MC_16KCL);
1600 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1601 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1602 /*
1603 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1604 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1605 * most 1 reference.
1606 */
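/*
 * Concretely (a sketch assuming 4KB pages, so NSLABSP16KB == 4): a
 * single 16KB cluster spans 4 page-sized slabs.  Only the first slab
 * carries the real metadata (SLF_MAPPED, sl_len covering the whole
 * cluster); the trailing slabs are SLF_MAPPED|SLF_PARTIAL placeholders
 * whose sl_base points back at the same cluster, and the loop below
 * bumps each of their reference counts from 0 to 1.
 */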
1607 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1608 nsp = nsp->sl_next;
1609 /* Next slab must already be present */
1610 VERIFY(nsp != NULL);
1611 nsp->sl_refcnt++;
1612 VERIFY(!slab_is_detached(nsp));
1613 VERIFY(nsp->sl_class == MC_16KCL &&
1614 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1615 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1616 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1617 nsp->sl_head == NULL);
1618 }
1619 } else {
1620 VERIFY(class == MC_MBUF);
1621 --m_infree(MC_MBUF);
1622 /*
1623 * If auditing is turned on, this check is
1624 * deferred until later in mbuf_slab_audit().
1625 */
1626 if (mclaudit == NULL)
1627 _MCHECK((struct mbuf *)buf);
1628 /*
1629 * Since we have incremented the reference count above,
1630 * an mbuf slab (formerly a 4KB cluster slab that was cut
1631 * up into mbufs) must have a reference count between 1
1632 * and NMBPBG at this point.
1633 */
1634 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1635 sp->sl_chunks == NMBPBG &&
1636 sp->sl_len == m_maxsize(MC_BIGCL));
1637 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1638 }
1639
1640 /* If empty, remove this slab from the class's freelist */
1641 if (sp->sl_head == NULL) {
1642 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1643 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1644 slab_remove(sp, class);
1645 }
1646
1647 return (buf);
1648 }
1649
1650 /*
1651 * Place a slab of object(s) back into a class's slab list.
1652 */
1653 static void
1654 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1655 {
1656 mcl_slab_t *sp;
1657
1658 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1659
1660 VERIFY(class != MC_16KCL || njcl > 0);
1661 VERIFY(buf->obj_next == NULL);
1662 sp = slab_get(buf);
1663 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1664 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1665
1666 /* Decrement slab reference */
1667 sp->sl_refcnt--;
1668
1669 if (class == MC_CL) {
1670 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1671 /*
1672 * A slab that has been split into 2KB clusters can have at
1673 * most NCLPBG - 1 outstanding references at this point.
1674 */
1675 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1676 sp->sl_chunks == NCLPBG &&
1677 sp->sl_len == m_maxsize(MC_BIGCL));
1678 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1679 (slab_is_detached(sp) && sp->sl_head == NULL));
1680 } else if (class == MC_BIGCL) {
1681 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1682 /*
1683 * A 4KB cluster slab can have at most 1 reference
1684 * which must be 0 at this point.
1685 */
1686 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1687 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1688 VERIFY(slab_is_detached(sp));
1689 } else if (class == MC_16KCL) {
1690 mcl_slab_t *nsp;
1691 int k;
1692 /*
1693 * A 16KB cluster takes NSLABSP16KB slabs, all of which
1694 * must now have a reference count of 0.
1695 */
1696 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1697 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1698 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1699 VERIFY(slab_is_detached(sp));
1700 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1701 nsp = nsp->sl_next;
1702 /* Next slab must already be present */
1703 VERIFY(nsp != NULL);
1704 nsp->sl_refcnt--;
1705 VERIFY(slab_is_detached(nsp));
1706 VERIFY(nsp->sl_class == MC_16KCL &&
1707 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1708 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1709 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1710 nsp->sl_head == NULL);
1711 }
1712 } else {
1713 /*
1714 * A slab that has been split into mbufs can have at most NMBPBG
1715 * references. Since we have decremented one reference
1716 * above, it must now be between 0 and NMBPBG-1.
1717 */
1718 VERIFY(class == MC_MBUF);
1719 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1720 sp->sl_chunks == NMBPBG &&
1721 sp->sl_len == m_maxsize(MC_BIGCL));
1722 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1723 (slab_is_detached(sp) && sp->sl_head == NULL));
1724 }
1725
1726 /*
1727 * When auditing is enabled, ensure that the buffer still
1728 * contains the free pattern. Otherwise it got corrupted
1729 * while at the CPU cache layer.
1730 */
1731 if (mclaudit != NULL) {
1732 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1733 if (mclverify) {
1734 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1735 }
1736 mca->mca_uflags &= ~MB_SCVALID;
1737 }
1738
1739 if (class == MC_CL) {
1740 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1741 buf->obj_next = sp->sl_head;
1742 } else if (class == MC_BIGCL) {
1743 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1744 m_infree(MC_MBUF_BIGCL);
1745 } else if (class == MC_16KCL) {
1746 ++m_infree(MC_16KCL);
1747 } else {
1748 ++m_infree(MC_MBUF);
1749 buf->obj_next = sp->sl_head;
1750 }
1751 sp->sl_head = buf;
1752
1753 /*
1754 * If a slab has been split into either 2KB clusters or mbufs,
1755 * turn it back into one that holds a single 4KB
1756 * cluster.
1757 */
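/*
 * For scale (a sketch assuming 4KB pages, 256-byte mbufs and 2KB
 * clusters, i.e. NMBPBG == 16 and NCLPBG == 2): the conversion below
 * only fires once every mbuf (or every 2KB cluster) carved out of the
 * page is back on the slab, at which point the page is coalesced into
 * a single free 4KB cluster and the per-class totals are moved from
 * MC_MBUF/MC_CL over to MC_BIGCL.
 */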
1758 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1759 m_total(class) > m_minlimit(class) &&
1760 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1761 int i = NMBPBG;
1762
1763 m_total(MC_BIGCL)++;
1764 mbstat.m_bigclusters = m_total(MC_BIGCL);
1765 m_total(MC_MBUF) -= NMBPBG;
1766 mbstat.m_mbufs = m_total(MC_MBUF);
1767 m_infree(MC_MBUF) -= NMBPBG;
1768 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1769
1770 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1771 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1772
1773 while (i--) {
1774 struct mbuf *m = sp->sl_head;
1775 VERIFY(m != NULL);
1776 sp->sl_head = m->m_next;
1777 m->m_next = NULL;
1778 }
1779 VERIFY(sp->sl_head == NULL);
1780
1781 /* Remove the slab from the mbuf class's slab list */
1782 slab_remove(sp, class);
1783
1784 /* Reinitialize it as a 4KB cluster slab */
1785 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1786 sp->sl_len, 0, 1);
1787
1788 if (mclverify) {
1789 mcache_set_pattern(MCACHE_FREE_PATTERN,
1790 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1791 }
1792 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1793 m_infree(MC_MBUF_BIGCL);
1794
1795 VERIFY(slab_is_detached(sp));
1796 /* And finally switch class */
1797 class = MC_BIGCL;
1798 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1799 m_total(class) > m_minlimit(class) &&
1800 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1801 int i = NCLPBG;
1802
1803 m_total(MC_BIGCL)++;
1804 mbstat.m_bigclusters = m_total(MC_BIGCL);
1805 m_total(MC_CL) -= NCLPBG;
1806 mbstat.m_clusters = m_total(MC_CL);
1807 m_infree(MC_CL) -= NCLPBG;
1808 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1809 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1810
1811 while (i--) {
1812 union mcluster *c = sp->sl_head;
1813 VERIFY(c != NULL);
1814 sp->sl_head = c->mcl_next;
1815 c->mcl_next = NULL;
1816 }
1817 VERIFY(sp->sl_head == NULL);
1818
1819 /* Remove the slab from the 2KB cluster class's slab list */
1820 slab_remove(sp, class);
1821
1822 /* Reinitialize it as a 4KB cluster slab */
1823 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1824 sp->sl_len, 0, 1);
1825
1826 if (mclverify) {
1827 mcache_set_pattern(MCACHE_FREE_PATTERN,
1828 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1829 }
1830 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1831 m_infree(MC_MBUF_BIGCL);
1832
1833 VERIFY(slab_is_detached(sp));
1834 /* And finally switch class */
1835 class = MC_BIGCL;
1836 }
1837
1838 /* Reinsert the slab to the class's slab list */
1839 if (slab_is_detached(sp))
1840 slab_insert(sp, class);
1841 }
1842
1843 /*
1844 * Common allocator for rudimentary objects called by the CPU cache layer
1845 * during an allocation request whenever there is no available element in the
1846 * bucket layer. It returns one or more elements from the appropriate global
1847 * freelist. If the freelist is empty, it will attempt to populate it and
1848 * retry the allocation.
1849 */
1850 static unsigned int
1851 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1852 {
1853 mbuf_class_t class = (mbuf_class_t)arg;
1854 unsigned int need = num;
1855 mcache_obj_t **list = *plist;
1856
1857 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1858 ASSERT(need > 0);
1859
1860 lck_mtx_lock(mbuf_mlock);
1861
1862 for (;;) {
1863 if ((*list = slab_alloc(class, wait)) != NULL) {
1864 (*list)->obj_next = NULL;
1865 list = *plist = &(*list)->obj_next;
1866
1867 if (--need == 0) {
1868 /*
1869 * If the number of elements in the freelist has
1870 * dropped below the low watermark, asynchronously
1871 * populate the freelist now rather than doing
1872 * it later when we run out of elements.
1873 */
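/*
 * Illustrative numbers only (assumed, not from any real
 * configuration): with m_total(class) == 4096, the low
 * watermark given by the >> 5 below is 4096 >> 5 == 128,
 * so dropping under 128 free objects triggers an
 * asynchronous freelist_populate().
 */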
1874 if (!mbuf_cached_above(class, wait) &&
1875 m_infree(class) < m_total(class) >> 5) {
1876 (void) freelist_populate(class, 1,
1877 M_DONTWAIT);
1878 }
1879 break;
1880 }
1881 } else {
1882 VERIFY(m_infree(class) == 0 || class == MC_CL);
1883
1884 (void) freelist_populate(class, 1,
1885 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1886
1887 if (m_infree(class) > 0)
1888 continue;
1889
1890 /* Check if there's anything at the cache layer */
1891 if (mbuf_cached_above(class, wait))
1892 break;
1893
1894 /* watchdog checkpoint */
1895 mbuf_watchdog();
1896
1897 /* We have nothing and cannot block; give up */
1898 if (wait & MCR_NOSLEEP) {
1899 if (!(wait & MCR_TRYHARD)) {
1900 m_fail_cnt(class)++;
1901 mbstat.m_drops++;
1902 break;
1903 }
1904 }
1905
1906 /*
1907 * If the freelist is still empty and the caller is
1908 * willing to be blocked, sleep on the wait channel
1909 * until an element is available. Otherwise, if
1910 * MCR_TRYHARD is set, do our best to satisfy the
1911 * request without having to go to sleep.
1912 */
1913 if (mbuf_worker_ready &&
1914 mbuf_sleep(class, need, wait))
1915 break;
1916
1917 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1918 }
1919 }
1920
1921 m_alloc_cnt(class) += num - need;
1922 lck_mtx_unlock(mbuf_mlock);
1923
1924 return (num - need);
1925 }
1926
1927 /*
1928 * Common de-allocator for rudimentary objects called by the CPU cache
1929 * layer when one or more elements need to be returned to the appropriate
1930 * global freelist.
1931 */
1932 static void
1933 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1934 {
1935 mbuf_class_t class = (mbuf_class_t)arg;
1936 mcache_obj_t *nlist;
1937 unsigned int num = 0;
1938 int w;
1939
1940 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1941
1942 lck_mtx_lock(mbuf_mlock);
1943
1944 for (;;) {
1945 nlist = list->obj_next;
1946 list->obj_next = NULL;
1947 slab_free(class, list);
1948 ++num;
1949 if ((list = nlist) == NULL)
1950 break;
1951 }
1952 m_free_cnt(class) += num;
1953
1954 if ((w = mb_waiters) > 0)
1955 mb_waiters = 0;
1956
1957 lck_mtx_unlock(mbuf_mlock);
1958
1959 if (w != 0)
1960 wakeup(mb_waitchan);
1961 }
1962
1963 /*
1964 * Common auditor for rudimentary objects called by the CPU cache layer
1965 * during an allocation or free request. For the former, this is called
1966 * after the objects are obtained from either the bucket or slab layer
1967 * and before they are returned to the caller. For the latter, this is
1968 * called immediately during free and before placing the objects into
1969 * the bucket or slab layer.
1970 */
1971 static void
1972 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1973 {
1974 mbuf_class_t class = (mbuf_class_t)arg;
1975 mcache_audit_t *mca;
1976
1977 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1978
1979 while (list != NULL) {
1980 lck_mtx_lock(mbuf_mlock);
1981 mca = mcl_audit_buf2mca(class, list);
1982
1983 /* Do the sanity checks */
1984 if (class == MC_MBUF) {
1985 mcl_audit_mbuf(mca, list, FALSE, alloc);
1986 ASSERT(mca->mca_uflags & MB_SCVALID);
1987 } else {
1988 mcl_audit_cluster(mca, list, m_maxsize(class),
1989 alloc, TRUE);
1990 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1991 }
1992 /* Record this transaction */
1993 if (mcltrace)
1994 mcache_buffer_log(mca, list, m_cache(class));
1995
1996 if (alloc)
1997 mca->mca_uflags |= MB_INUSE;
1998 else
1999 mca->mca_uflags &= ~MB_INUSE;
2000 /* Unpair the object (unconditionally) */
2001 mca->mca_uptr = NULL;
2002 lck_mtx_unlock(mbuf_mlock);
2003
2004 list = list->obj_next;
2005 }
2006 }
2007
2008 /*
2009 * Common notify routine for all caches. It is called by mcache when
2010 * one or more objects get freed. We use this indication to trigger
2011 * the wakeup of any sleeping threads so that they can retry their
2012 * allocation requests.
2013 */
2014 static void
2015 mbuf_slab_notify(void *arg, u_int32_t reason)
2016 {
2017 mbuf_class_t class = (mbuf_class_t)arg;
2018 int w;
2019
2020 ASSERT(MBUF_CLASS_VALID(class));
2021
2022 if (reason != MCN_RETRYALLOC)
2023 return;
2024
2025 lck_mtx_lock(mbuf_mlock);
2026 if ((w = mb_waiters) > 0) {
2027 m_notified(class)++;
2028 mb_waiters = 0;
2029 }
2030 lck_mtx_unlock(mbuf_mlock);
2031
2032 if (w != 0)
2033 wakeup(mb_waitchan);
2034 }
2035
2036 /*
2037 * Obtain object(s) from the composite class's freelist.
2038 */
2039 static unsigned int
2040 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2041 {
2042 unsigned int need = num;
2043 mcl_slab_t *sp, *clsp, *nsp;
2044 struct mbuf *m;
2045 mcache_obj_t **list = *plist;
2046 void *cl;
2047
2048 VERIFY(need > 0);
2049 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2050 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2051
2052 /* Get what we can from the freelist */
2053 while ((*list = m_cobjlist(class)) != NULL) {
2054 MRANGE(*list);
2055
2056 m = (struct mbuf *)*list;
2057 sp = slab_get(m);
2058 cl = m->m_ext.ext_buf;
2059 clsp = slab_get(cl);
2060 VERIFY(m->m_flags == M_EXT && cl != NULL);
2061 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2062
2063 if (class == MC_MBUF_CL) {
2064 VERIFY(clsp->sl_refcnt >= 1 &&
2065 clsp->sl_refcnt <= NCLPBG);
2066 } else {
2067 VERIFY(clsp->sl_refcnt == 1);
2068 }
2069
2070 if (class == MC_MBUF_16KCL) {
2071 int k;
2072 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2073 nsp = nsp->sl_next;
2074 /* Next slab must already be present */
2075 VERIFY(nsp != NULL);
2076 VERIFY(nsp->sl_refcnt == 1);
2077 }
2078 }
2079
2080 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2081 !MBUF_IN_MAP(m_cobjlist(class))) {
2082 slab_nextptr_panic(sp, m_cobjlist(class));
2083 /* NOTREACHED */
2084 }
2085 (*list)->obj_next = NULL;
2086 list = *plist = &(*list)->obj_next;
2087
2088 if (--need == 0)
2089 break;
2090 }
2091 m_infree(class) -= (num - need);
2092
2093 return (num - need);
2094 }
2095
2096 /*
2097 * Place object(s) back into a composite class's freelist.
2098 */
2099 static unsigned int
2100 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2101 {
2102 mcache_obj_t *o, *tail;
2103 unsigned int num = 0;
2104 struct mbuf *m, *ms;
2105 mcache_audit_t *mca = NULL;
2106 mcache_obj_t *ref_list = NULL;
2107 mcl_slab_t *clsp, *nsp;
2108 void *cl;
2109 mbuf_class_t cl_class;
2110
2111 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2112 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2113 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2114
2115 if (class == MC_MBUF_CL) {
2116 cl_class = MC_CL;
2117 } else if (class == MC_MBUF_BIGCL) {
2118 cl_class = MC_BIGCL;
2119 } else {
2120 VERIFY(class == MC_MBUF_16KCL);
2121 cl_class = MC_16KCL;
2122 }
2123
2124 o = tail = list;
2125
2126 while ((m = ms = (struct mbuf *)o) != NULL) {
2127 mcache_obj_t *rfa, *nexto = o->obj_next;
2128
2129 /* Do the mbuf sanity checks */
2130 if (mclaudit != NULL) {
2131 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2132 if (mclverify) {
2133 mcache_audit_free_verify(mca, m, 0,
2134 m_maxsize(MC_MBUF));
2135 }
2136 ms = (struct mbuf *)mca->mca_contents;
2137 }
2138
2139 /* Do the cluster sanity checks */
2140 cl = ms->m_ext.ext_buf;
2141 clsp = slab_get(cl);
2142 if (mclverify) {
2143 size_t size = m_maxsize(cl_class);
2144 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2145 (mcache_obj_t *)cl), cl, 0, size);
2146 }
2147 VERIFY(ms->m_type == MT_FREE);
2148 VERIFY(ms->m_flags == M_EXT);
2149 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2150 if (cl_class == MC_CL) {
2151 VERIFY(clsp->sl_refcnt >= 1 &&
2152 clsp->sl_refcnt <= NCLPBG);
2153 } else {
2154 VERIFY(clsp->sl_refcnt == 1);
2155 }
2156 if (cl_class == MC_16KCL) {
2157 int k;
2158 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2159 nsp = nsp->sl_next;
2160 /* Next slab must already be present */
2161 VERIFY(nsp != NULL);
2162 VERIFY(nsp->sl_refcnt == 1);
2163 }
2164 }
2165
2166 /*
2167 * If we're asked to purge, restore the actual mbuf using the
2168 * contents of the shadow structure (if auditing is enabled)
2169 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
2170 * about to return it and the attached cluster to their caches.
2171 */
2172 if (purged) {
2173 /* Restore constructed mbuf fields */
2174 if (mclaudit != NULL)
2175 mcl_audit_restore_mbuf(m, mca, TRUE);
2176
2177 MEXT_REF(m) = 0;
2178 MEXT_FLAGS(m) = 0;
2179
2180 rfa = (mcache_obj_t *)MEXT_RFA(m);
2181 rfa->obj_next = ref_list;
2182 ref_list = rfa;
2183 MEXT_RFA(m) = NULL;
2184
2185 m->m_type = MT_FREE;
2186 m->m_flags = m->m_len = 0;
2187 m->m_next = m->m_nextpkt = NULL;
2188
2189 /* Save mbuf fields and make auditing happy */
2190 if (mclaudit != NULL)
2191 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2192
2193 VERIFY(m_total(class) > 0);
2194 m_total(class)--;
2195
2196 /* Free the mbuf */
2197 o->obj_next = NULL;
2198 slab_free(MC_MBUF, o);
2199
2200 /* And free the cluster */
2201 ((mcache_obj_t *)cl)->obj_next = NULL;
2202 if (class == MC_MBUF_CL)
2203 slab_free(MC_CL, cl);
2204 else if (class == MC_MBUF_BIGCL)
2205 slab_free(MC_BIGCL, cl);
2206 else
2207 slab_free(MC_16KCL, cl);
2208 }
2209
2210 ++num;
2211 tail = o;
2212 o = nexto;
2213 }
2214
2215 if (!purged) {
2216 tail->obj_next = m_cobjlist(class);
2217 m_cobjlist(class) = list;
2218 m_infree(class) += num;
2219 } else if (ref_list != NULL) {
2220 mcache_free_ext(ref_cache, ref_list);
2221 }
2222
2223 return (num);
2224 }
2225
2226 /*
2227 * Common allocator for composite objects called by the CPU cache layer
2228 * during an allocation request whenever there is no available element in
2229 * the bucket layer. It returns one or more composite elements from the
2230 * appropriate global freelist. If the freelist is empty, it will attempt
2231 * to obtain the rudimentary objects from their caches and construct them
2232 * into composite mbuf + cluster objects.
2233 */
2234 static unsigned int
2235 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2236 int wait)
2237 {
2238 mbuf_class_t class = (mbuf_class_t)arg;
2239 mbuf_class_t cl_class = 0;
2240 unsigned int num = 0, cnum = 0, want = needed;
2241 mcache_obj_t *ref_list = NULL;
2242 mcache_obj_t *mp_list = NULL;
2243 mcache_obj_t *clp_list = NULL;
2244 mcache_obj_t **list;
2245 struct ext_ref *rfa;
2246 struct mbuf *m;
2247 void *cl;
2248
2249 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2250 ASSERT(needed > 0);
2251
2252 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2253
2254 /* There should not be any slab for this class */
2255 VERIFY(m_slab_cnt(class) == 0 &&
2256 m_slablist(class).tqh_first == NULL &&
2257 m_slablist(class).tqh_last == NULL);
2258
2259 lck_mtx_lock(mbuf_mlock);
2260
2261 /* Try using the freelist first */
2262 num = cslab_alloc(class, plist, needed);
2263 list = *plist;
2264 if (num == needed) {
2265 m_alloc_cnt(class) += num;
2266 lck_mtx_unlock(mbuf_mlock);
2267 return (needed);
2268 }
2269
2270 lck_mtx_unlock(mbuf_mlock);
2271
2272 /*
2273 * We could not satisfy the request using the freelist alone;
2274 * allocate from the appropriate rudimentary caches and use
2275 * whatever we can get to construct the composite objects.
2276 */
2277 needed -= num;
2278
2279 /*
2280 * Mark these allocation requests as coming from a composite cache.
2281 * Also, if the caller is willing to be blocked, mark the request
2282 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2283 * slab layer waiting for the individual object when one or more
2284 * of the already-constructed composite objects are available.
2285 */
2286 wait |= MCR_COMP;
2287 if (!(wait & MCR_NOSLEEP))
2288 wait |= MCR_FAILOK;
2289
2290 /* allocate mbufs */
2291 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2292 if (needed == 0) {
2293 ASSERT(mp_list == NULL);
2294 goto fail;
2295 }
2296
2297 /* allocate clusters */
2298 if (class == MC_MBUF_CL) {
2299 cl_class = MC_CL;
2300 } else if (class == MC_MBUF_BIGCL) {
2301 cl_class = MC_BIGCL;
2302 } else {
2303 VERIFY(class == MC_MBUF_16KCL);
2304 cl_class = MC_16KCL;
2305 }
2306 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2307 if (needed == 0) {
2308 ASSERT(clp_list == NULL);
2309 goto fail;
2310 }
2311
2312 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2313 if (needed == 0) {
2314 ASSERT(ref_list == NULL);
2315 goto fail;
2316 }
2317
2318 /*
2319 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2320 * leftovers will get freed accordingly before we return to the caller.
2321 */
2322 for (cnum = 0; cnum < needed; cnum++) {
2323 struct mbuf *ms;
2324
2325 m = ms = (struct mbuf *)mp_list;
2326 mp_list = mp_list->obj_next;
2327
2328 cl = clp_list;
2329 clp_list = clp_list->obj_next;
2330 ((mcache_obj_t *)cl)->obj_next = NULL;
2331
2332 rfa = (struct ext_ref *)ref_list;
2333 ref_list = ref_list->obj_next;
2334 ((mcache_obj_t *)rfa)->obj_next = NULL;
2335
2336 /*
2337 * If auditing is enabled, construct the shadow mbuf
2338 * in the audit structure instead of in the actual one.
2339 * mbuf_cslab_audit() will take care of restoring the
2340 * contents after the integrity check.
2341 */
2342 if (mclaudit != NULL) {
2343 mcache_audit_t *mca, *cl_mca;
2344
2345 lck_mtx_lock(mbuf_mlock);
2346 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2347 ms = ((struct mbuf *)mca->mca_contents);
2348 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2349
2350 /*
2351 * Pair them up. Note that this is done at the time
2352 * the mbuf+cluster objects are constructed. This
2353 * information should be treated as a "best effort"
2354 * debugging hint since more than one mbuf can refer
2355 * to a cluster. In that case, the cluster might not
2356 * be freed along with the mbuf it was paired with.
2357 */
2358 mca->mca_uptr = cl_mca;
2359 cl_mca->mca_uptr = mca;
2360
2361 ASSERT(mca->mca_uflags & MB_SCVALID);
2362 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2363 lck_mtx_unlock(mbuf_mlock);
2364
2365 /* Technically, they are in the freelist */
2366 if (mclverify) {
2367 size_t size;
2368
2369 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2370 m_maxsize(MC_MBUF));
2371
2372 if (class == MC_MBUF_CL)
2373 size = m_maxsize(MC_CL);
2374 else if (class == MC_MBUF_BIGCL)
2375 size = m_maxsize(MC_BIGCL);
2376 else
2377 size = m_maxsize(MC_16KCL);
2378
2379 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2380 size);
2381 }
2382 }
2383
2384 MBUF_INIT(ms, 0, MT_FREE);
2385 if (class == MC_MBUF_16KCL) {
2386 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2387 } else if (class == MC_MBUF_BIGCL) {
2388 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2389 } else {
2390 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2391 }
2392 VERIFY(ms->m_flags == M_EXT);
2393 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2394
2395 *list = (mcache_obj_t *)m;
2396 (*list)->obj_next = NULL;
2397 list = *plist = &(*list)->obj_next;
2398 }
2399
2400 fail:
2401 /*
2402 * Free up what's left of the above.
2403 */
2404 if (mp_list != NULL)
2405 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2406 if (clp_list != NULL)
2407 mcache_free_ext(m_cache(cl_class), clp_list);
2408 if (ref_list != NULL)
2409 mcache_free_ext(ref_cache, ref_list);
2410
2411 lck_mtx_lock(mbuf_mlock);
2412 if (num > 0 || cnum > 0) {
2413 m_total(class) += cnum;
2414 VERIFY(m_total(class) <= m_maxlimit(class));
2415 m_alloc_cnt(class) += num + cnum;
2416 }
2417 if ((num + cnum) < want)
2418 m_fail_cnt(class) += (want - (num + cnum));
2419 lck_mtx_unlock(mbuf_mlock);
2420
2421 return (num + cnum);
2422 }
2423
2424 /*
2425 * Common de-allocator for composite objects called by the CPU cache
2426 * layer when one or more elements need to be returned to the appropriate
2427 * global freelist.
2428 */
2429 static void
2430 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2431 {
2432 mbuf_class_t class = (mbuf_class_t)arg;
2433 unsigned int num;
2434 int w;
2435
2436 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2437
2438 lck_mtx_lock(mbuf_mlock);
2439
2440 num = cslab_free(class, list, purged);
2441 m_free_cnt(class) += num;
2442
2443 if ((w = mb_waiters) > 0)
2444 mb_waiters = 0;
2445
2446 lck_mtx_unlock(mbuf_mlock);
2447
2448 if (w != 0)
2449 wakeup(mb_waitchan);
2450 }
2451
2452 /*
2453 * Common auditor for composite objects called by the CPU cache layer
2454 * during an allocation or free request. For the former, this is called
2455 * after the objects are obtained from either the bucket or slab layer
2456 * and before they are returned to the caller. For the latter, this is
2457 * called immediately during free and before placing the objects into
2458 * the bucket or slab layer.
2459 */
2460 static void
2461 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2462 {
2463 mbuf_class_t class = (mbuf_class_t)arg;
2464 mcache_audit_t *mca;
2465 struct mbuf *m, *ms;
2466 mcl_slab_t *clsp, *nsp;
2467 size_t size;
2468 void *cl;
2469
2470 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2471
2472 while ((m = ms = (struct mbuf *)list) != NULL) {
2473 lck_mtx_lock(mbuf_mlock);
2474 /* Do the mbuf sanity checks and record its transaction */
2475 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2476 mcl_audit_mbuf(mca, m, TRUE, alloc);
2477 if (mcltrace)
2478 mcache_buffer_log(mca, m, m_cache(class));
2479
2480 if (alloc)
2481 mca->mca_uflags |= MB_COMP_INUSE;
2482 else
2483 mca->mca_uflags &= ~MB_COMP_INUSE;
2484
2485 /*
2486 * Use the shadow mbuf in the audit structure if we are
2487 * freeing, since the contents of the actual mbuf have been
2488 * pattern-filled by the above call to mcl_audit_mbuf().
2489 */
2490 if (!alloc && mclverify)
2491 ms = (struct mbuf *)mca->mca_contents;
2492
2493 /* Do the cluster sanity checks and record its transaction */
2494 cl = ms->m_ext.ext_buf;
2495 clsp = slab_get(cl);
2496 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2497 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2498 if (class == MC_MBUF_CL)
2499 VERIFY(clsp->sl_refcnt >= 1 &&
2500 clsp->sl_refcnt <= NCLPBG);
2501 else
2502 VERIFY(clsp->sl_refcnt == 1);
2503
2504 if (class == MC_MBUF_16KCL) {
2505 int k;
2506 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2507 nsp = nsp->sl_next;
2508 /* Next slab must already be present */
2509 VERIFY(nsp != NULL);
2510 VERIFY(nsp->sl_refcnt == 1);
2511 }
2512 }
2513
2514 mca = mcl_audit_buf2mca(MC_CL, cl);
2515 if (class == MC_MBUF_CL)
2516 size = m_maxsize(MC_CL);
2517 else if (class == MC_MBUF_BIGCL)
2518 size = m_maxsize(MC_BIGCL);
2519 else
2520 size = m_maxsize(MC_16KCL);
2521 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2522 if (mcltrace)
2523 mcache_buffer_log(mca, cl, m_cache(class));
2524
2525 if (alloc)
2526 mca->mca_uflags |= MB_COMP_INUSE;
2527 else
2528 mca->mca_uflags &= ~MB_COMP_INUSE;
2529 lck_mtx_unlock(mbuf_mlock);
2530
2531 list = list->obj_next;
2532 }
2533 }
2534
2535 /*
2536 * Allocate some number of mbuf clusters and place on cluster freelist.
2537 */
2538 static int
2539 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2540 {
2541 int i;
2542 vm_size_t size = 0;
2543 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2544 vm_offset_t page = 0;
2545 mcache_audit_t *mca_list = NULL;
2546 mcache_obj_t *con_list = NULL;
2547 mcl_slab_t *sp;
2548
2549 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2550 bufsize == m_maxsize(MC_16KCL));
2551
2552 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2553
2554 /*
2555 * Multiple threads may attempt to populate the cluster map one
2556 * after another. Since we drop the lock below prior to acquiring
2557 * the physical page(s), our view of the cluster map may no longer
2558 * be accurate, and we could end up over-committing the pages beyond
2559 * the maximum allowed for each class. To prevent it, this entire
2560 * operation (including the page mapping) is serialized.
2561 */
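/*
 * In outline, the code below uses the classic busy-flag handshake
 * (a paraphrase of what follows, not additional logic):
 *
 *	while (mb_clalloc_busy)
 *		msleep(mb_clalloc_waitchan, mbuf_mlock, ...);
 *	mb_clalloc_busy = TRUE;
 *	... drop mbuf_mlock, grab and map the pages, retake the lock ...
 *	mb_clalloc_busy = FALSE;
 *	wakeup(mb_clalloc_waitchan);
 */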
2562 while (mb_clalloc_busy) {
2563 mb_clalloc_waiters++;
2564 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2565 (PZERO-1), "m_clalloc", NULL);
2566 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2567 }
2568
2569 /* We are busy now; tell everyone else to go away */
2570 mb_clalloc_busy = TRUE;
2571
2572 /*
2573 * Honor the caller's wish to block or not block. We have a way
2574 * to grow the pool asynchronously using the mbuf worker thread.
2575 */
2576 i = m_howmany(num, bufsize);
2577 if (i == 0 || (wait & M_DONTWAIT))
2578 goto out;
2579
2580 lck_mtx_unlock(mbuf_mlock);
2581
2582 size = round_page(i * bufsize);
2583 page = kmem_mb_alloc(mb_map, size, large_buffer);
2584
2585 /*
2586 * If we asked for "n" 16KB physically contiguous chunks
2587 * and didn't get them, try again without this
2588 * restriction.
2589 */
2590 if (large_buffer && page == 0)
2591 page = kmem_mb_alloc(mb_map, size, 0);
2592
2593 if (page == 0) {
2594 if (bufsize == m_maxsize(MC_BIGCL)) {
2595 /* If that failed, retry with a single page (4KB request only) */
2596 size = NBPG;
2597 page = kmem_mb_alloc(mb_map, size, 0);
2598 }
2599
2600 if (page == 0) {
2601 lck_mtx_lock(mbuf_mlock);
2602 goto out;
2603 }
2604 }
2605
2606 VERIFY(IS_P2ALIGNED(page, NBPG));
2607 numpages = size / NBPG;
2608
2609 /* If auditing is enabled, allocate the audit structures now */
2610 if (mclaudit != NULL) {
2611 int needed;
2612
2613 /*
2614 * Yes, I realize this is a waste of memory for clusters
2615 * that never get transformed into mbufs, as we may end
2616 * up with NMBPBG-1 unused audit structures per cluster.
2617 * But doing so tremendously simplifies the allocation
2618 * strategy, since at this point we are not holding the
2619 * mbuf lock and the caller is okay to be blocked.
2620 */
2621 if (bufsize == m_maxsize(MC_BIGCL)) {
2622 needed = numpages * NMBPBG;
2623
2624 i = mcache_alloc_ext(mcl_audit_con_cache,
2625 &con_list, needed, MCR_SLEEP);
2626
2627 VERIFY(con_list != NULL && i == needed);
2628 } else {
2629 needed = numpages / NSLABSP16KB;
2630 }
2631
2632 i = mcache_alloc_ext(mcache_audit_cache,
2633 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2634
2635 VERIFY(mca_list != NULL && i == needed);
2636 }
2637
2638 lck_mtx_lock(mbuf_mlock);
2639
2640 for (i = 0; i < numpages; i++, page += NBPG) {
2641 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2642 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2643 (vm_offset_t)page);
2644
2645 /*
2646 * If no mapper is available, the following code is a no-op
2647 * and returns the input page; if there is a mapper, the
2648 * appropriate I/O page is returned.
2649 */
2650 VERIFY(offset < mcl_pages);
2651 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2652 mcl_paddr[offset] = new_page << PGSHIFT;
2653
2654 /* Pattern-fill this fresh page */
2655 if (mclverify) {
2656 mcache_set_pattern(MCACHE_FREE_PATTERN,
2657 (caddr_t)page, NBPG);
2658 }
2659 if (bufsize == m_maxsize(MC_BIGCL)) {
2660 union mbigcluster *mbc = (union mbigcluster *)page;
2661
2662 /* One for the entire page */
2663 sp = slab_get(mbc);
2664 if (mclaudit != NULL) {
2665 mcl_audit_init(mbc, &mca_list, &con_list,
2666 AUDIT_CONTENTS_SIZE, NMBPBG);
2667 }
2668 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2669 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2670 mbc, mbc, bufsize, 0, 1);
2671
2672 /* Insert this slab */
2673 slab_insert(sp, MC_BIGCL);
2674
2675 /* Update stats now since slab_get() drops the lock */
2676 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2677 m_infree(MC_MBUF_BIGCL);
2678 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2679 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2680 } else if ((i % NSLABSP16KB) == 0) {
2681 union m16kcluster *m16kcl = (union m16kcluster *)page;
2682 mcl_slab_t *nsp;
2683 int k;
2684
2685 VERIFY(njcl > 0);
2686 /* One for the entire 16KB */
2687 sp = slab_get(m16kcl);
2688 if (mclaudit != NULL)
2689 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2690
2691 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2692 slab_init(sp, MC_16KCL, SLF_MAPPED,
2693 m16kcl, m16kcl, bufsize, 0, 1);
2694
2695 /*
2696 * 2nd-Nth page's slab is part of the first one,
2697 * where N is NSLABSP16KB.
2698 */
2699 for (k = 1; k < NSLABSP16KB; k++) {
2700 nsp = slab_get(((union mbigcluster *)page) + k);
2701 VERIFY(nsp->sl_refcnt == 0 &&
2702 nsp->sl_flags == 0);
2703 slab_init(nsp, MC_16KCL,
2704 SLF_MAPPED | SLF_PARTIAL,
2705 m16kcl, NULL, 0, 0, 0);
2706 }
2707
2708 /* Insert this slab */
2709 slab_insert(sp, MC_16KCL);
2710
2711 /* Update stats now since slab_get() drops the lock */
2712 m_infree(MC_16KCL)++;
2713 m_total(MC_16KCL)++;
2714 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2715 }
2716 }
2717 VERIFY(mca_list == NULL && con_list == NULL);
2718
2719 /* We're done; let others enter */
2720 mb_clalloc_busy = FALSE;
2721 if (mb_clalloc_waiters > 0) {
2722 mb_clalloc_waiters = 0;
2723 wakeup(mb_clalloc_waitchan);
2724 }
2725
2726 if (bufsize == m_maxsize(MC_BIGCL))
2727 return (numpages);
2728
2729 VERIFY(bufsize == m_maxsize(MC_16KCL));
2730 return (numpages / NSLABSP16KB);
2731
2732 out:
2733 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2734
2735 /* We're done; let others enter */
2736 mb_clalloc_busy = FALSE;
2737 if (mb_clalloc_waiters > 0) {
2738 mb_clalloc_waiters = 0;
2739 wakeup(mb_clalloc_waitchan);
2740 }
2741
2742 /*
2743 * When non-blocking, we kick the worker thread if we have to grow the
2744 * pool or if the number of free clusters is less than requested.
2745 */
2746 if (bufsize == m_maxsize(MC_BIGCL)) {
2747 if (i > 0) {
2748 /*
2749 * Remember total number of 4KB clusters needed
2750 * at this time.
2751 */
2752 i += m_total(MC_BIGCL);
2753 if (i > mbuf_expand_big) {
2754 mbuf_expand_big = i;
2755 if (mbuf_worker_ready)
2756 wakeup((caddr_t)&mbuf_worker_run);
2757 }
2758 }
2759
2760 if (m_infree(MC_BIGCL) >= num)
2761 return (1);
2762 } else {
2763 if (i > 0) {
2764 /*
2765 * Remember total number of 16KB clusters needed
2766 * at this time.
2767 */
2768 i += m_total(MC_16KCL);
2769 if (i > mbuf_expand_16k) {
2770 mbuf_expand_16k = i;
2771 if (mbuf_worker_ready)
2772 wakeup((caddr_t)&mbuf_worker_run);
2773 }
2774 }
2775
2776 if (m_infree(MC_16KCL) >= num)
2777 return (1);
2778 }
2779 return (0);
2780 }
2781
2782 /*
2783 * Populate the global freelist of the corresponding buffer class.
2784 */
2785 static int
2786 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2787 {
2788 mcache_obj_t *o = NULL;
2789 int i, numpages = 0, count;
2790
2791 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2792 class == MC_16KCL);
2793
2794 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2795
2796 switch (class) {
2797 case MC_MBUF:
2798 case MC_CL:
2799 case MC_BIGCL:
2800 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2801 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2802
2803 /* Respect the 4KB clusters minimum limit */
2804 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2805 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2806 if (class != MC_BIGCL || (wait & MCR_COMP))
2807 return (0);
2808 }
2809 if (class == MC_BIGCL)
2810 return (i != 0);
2811 break;
2812
2813 case MC_16KCL:
2814 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2815 /* NOTREACHED */
2816
2817 default:
2818 VERIFY(0);
2819 /* NOTREACHED */
2820 }
2821
2822 VERIFY(class == MC_MBUF || class == MC_CL);
2823
2824 /* how many objects will we cut the page into? */
2825 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2826
2827 for (count = 0; count < numpages; count++) {
2828
2829 /* respect totals, minlimit, maxlimit */
2830 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2831 m_total(class) >= m_maxlimit(class))
2832 break;
2833
2834 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2835 break;
2836
2837 struct mbuf *m = (struct mbuf *)o;
2838 union mcluster *c = (union mcluster *)o;
2839 mcl_slab_t *sp = slab_get(o);
2840 mcache_audit_t *mca = NULL;
2841
2842 VERIFY(slab_is_detached(sp) &&
2843 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2844
2845 /*
2846 * Make sure that the cluster is unmolested
2847 * while it is in the freelist
2848 */
2849 if (mclverify) {
2850 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2851 mcache_audit_free_verify(mca, o, 0,
2852 m_maxsize(MC_BIGCL));
2853 }
2854
2855 /* Reinitialize it as an mbuf or 2K slab */
2856 slab_init(sp, class, sp->sl_flags,
2857 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2858
2859 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2860 VERIFY(sp->sl_head == NULL);
2861
2862 VERIFY(m_total(MC_BIGCL) > 0);
2863 m_total(MC_BIGCL)--;
2864 mbstat.m_bigclusters = m_total(MC_BIGCL);
2865
2866 m_total(class) += numobj;
2867 m_infree(class) += numobj;
2868
2869 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2870 VERIFY(m_total(class) <= m_maxlimit(class));
2871
2872 i = numobj;
2873 if (class == MC_MBUF) {
2874 mbstat.m_mbufs = m_total(MC_MBUF);
2875 mtype_stat_add(MT_FREE, NMBPBG);
2876 while (i--) {
2877 /*
2878 * If auditing is enabled, construct the
2879 * shadow mbuf in the audit structure
2880 * instead of the actual one.
2881 * mbuf_slab_audit() will take care of
2882 * restoring the contents after the
2883 * integrity check.
2884 */
2885 if (mclaudit != NULL) {
2886 struct mbuf *ms;
2887 mca = mcl_audit_buf2mca(MC_MBUF,
2888 (mcache_obj_t *)m);
2889 ms = ((struct mbuf *)
2890 mca->mca_contents);
2891 ms->m_type = MT_FREE;
2892 } else {
2893 m->m_type = MT_FREE;
2894 }
2895 m->m_next = sp->sl_head;
2896 sp->sl_head = (void *)m++;
2897 }
2898 } else { /* MC_CL */
2899 mbstat.m_clfree =
2900 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2901 mbstat.m_clusters = m_total(MC_CL);
2902 while (i--) {
2903 c->mcl_next = sp->sl_head;
2904 sp->sl_head = (void *)c++;
2905 }
2906 }
2907
2908 /* Insert into the mbuf or 2k slab list */
2909 slab_insert(sp, class);
2910
2911 if ((i = mb_waiters) > 0)
2912 mb_waiters = 0;
2913 if (i != 0)
2914 wakeup(mb_waitchan);
2915 }
2916 return (count != 0);
2917 }
2918
2919 /*
2920 * For each class, initialize the freelist to hold m_minlimit() objects.
2921 */
2922 static void
2923 freelist_init(mbuf_class_t class)
2924 {
2925 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2926
2927 VERIFY(class == MC_CL || class == MC_BIGCL);
2928 VERIFY(m_total(class) == 0);
2929 VERIFY(m_minlimit(class) > 0);
2930
2931 while (m_total(class) < m_minlimit(class))
2932 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
2933
2934 VERIFY(m_total(class) >= m_minlimit(class));
2935 }
2936
2937 /*
2938 * (Inaccurately) check if it might be worth a trip back to the
2939 * mcache layer due to the availability of objects there. We'll
2940 * end up back here if there's nothing up there.
2941 */
2942 static boolean_t
2943 mbuf_cached_above(mbuf_class_t class, int wait)
2944 {
2945 switch (class) {
2946 case MC_MBUF:
2947 if (wait & MCR_COMP)
2948 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2949 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2950 break;
2951
2952 case MC_CL:
2953 if (wait & MCR_COMP)
2954 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
2955 break;
2956
2957 case MC_BIGCL:
2958 if (wait & MCR_COMP)
2959 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2960 break;
2961
2962 case MC_16KCL:
2963 if (wait & MCR_COMP)
2964 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
2965 break;
2966
2967 case MC_MBUF_CL:
2968 case MC_MBUF_BIGCL:
2969 case MC_MBUF_16KCL:
2970 break;
2971
2972 default:
2973 VERIFY(0);
2974 /* NOTREACHED */
2975 }
2976
2977 return (!mcache_bkt_isempty(m_cache(class)));
2978 }
2979
2980 /*
2981 * If possible, convert constructed objects to raw ones.
2982 */
2983 static boolean_t
2984 mbuf_steal(mbuf_class_t class, unsigned int num)
2985 {
2986 mcache_obj_t *top = NULL;
2987 mcache_obj_t **list = &top;
2988 unsigned int tot = 0;
2989
2990 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2991
2992 switch (class) {
2993 case MC_MBUF:
2994 case MC_CL:
2995 case MC_BIGCL:
2996 case MC_16KCL:
2997 return (FALSE);
2998
2999 case MC_MBUF_CL:
3000 case MC_MBUF_BIGCL:
3001 case MC_MBUF_16KCL:
3002 /* Get the required number of constructed objects if possible */
3003 if (m_infree(class) > m_minlimit(class)) {
3004 tot = cslab_alloc(class, &list,
3005 MIN(num, m_infree(class)));
3006 }
3007
3008 /* And destroy them to get back the raw objects */
3009 if (top != NULL)
3010 (void) cslab_free(class, top, 1);
3011 break;
3012
3013 default:
3014 VERIFY(0);
3015 /* NOTREACHED */
3016 }
3017
3018 return (tot == num);
3019 }
3020
3021 static void
3022 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3023 {
3024 int m, bmap = 0;
3025
3026 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3027
3028 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3029 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3030 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3031
3032 /*
3033 * This logic can be made smarter; for now, simply mark
3034 * all other related classes as potential victims.
3035 */
3036 switch (class) {
3037 case MC_MBUF:
3038 m_wantpurge(MC_CL)++;
3039 m_wantpurge(MC_BIGCL)++;
3040 m_wantpurge(MC_MBUF_CL)++;
3041 m_wantpurge(MC_MBUF_BIGCL)++;
3042 break;
3043
3044 case MC_CL:
3045 m_wantpurge(MC_MBUF)++;
3046 m_wantpurge(MC_BIGCL)++;
3047 m_wantpurge(MC_MBUF_BIGCL)++;
3048 if (!comp)
3049 m_wantpurge(MC_MBUF_CL)++;
3050 break;
3051
3052 case MC_BIGCL:
3053 m_wantpurge(MC_MBUF)++;
3054 m_wantpurge(MC_CL)++;
3055 m_wantpurge(MC_MBUF_CL)++;
3056 if (!comp)
3057 m_wantpurge(MC_MBUF_BIGCL)++;
3058 break;
3059
3060 case MC_16KCL:
3061 if (!comp)
3062 m_wantpurge(MC_MBUF_16KCL)++;
3063 break;
3064
3065 default:
3066 VERIFY(0);
3067 /* NOTREACHED */
3068 }
3069
3070 /*
3071 * Run through each marked class and check if we really need to
3072 * purge (and therefore temporarily disable) the per-CPU caches
3073 * layer used by the class. If so, remember the classes since
3074 * we are going to drop the lock below prior to purging.
3075 */
3076 for (m = 0; m < NELEM(mbuf_table); m++) {
3077 if (m_wantpurge(m) > 0) {
3078 m_wantpurge(m) = 0;
3079 /*
3080 * Try hard to steal the required number of objects
3081 * from the freelist of other mbuf classes. Only
3082 * purge and disable the per-CPU caches layer when
3083 * we don't have enough; it's the last resort.
3084 */
3085 if (!mbuf_steal(m, num))
3086 bmap |= (1 << m);
3087 }
3088 }
3089
3090 lck_mtx_unlock(mbuf_mlock);
3091
3092 if (bmap != 0) {
3093 /* drain is performed in pfslowtimo(), to avoid deadlocks */
3094 do_reclaim = 1;
3095
3096 /* Sigh; we have no other choices but to ask mcache to purge */
3097 for (m = 0; m < NELEM(mbuf_table); m++) {
3098 if ((bmap & (1 << m)) &&
3099 mcache_purge_cache(m_cache(m))) {
3100 lck_mtx_lock(mbuf_mlock);
3101 m_purge_cnt(m)++;
3102 mbstat.m_drain++;
3103 lck_mtx_unlock(mbuf_mlock);
3104 }
3105 }
3106 } else {
3107 /*
3108 * Request mcache to reap extra elements from all of its caches;
3109 * note that all reaps are serialized and happen only at a fixed
3110 * interval.
3111 */
3112 mcache_reap();
3113 }
3114 lck_mtx_lock(mbuf_mlock);
3115 }
3116
3117 static inline struct mbuf *
3118 m_get_common(int wait, short type, int hdr)
3119 {
3120 struct mbuf *m;
3121 int mcflags = MSLEEPF(wait);
3122
3123 /* Is this due to a non-blocking retry? If so, then try harder */
3124 if (mcflags & MCR_NOSLEEP)
3125 mcflags |= MCR_TRYHARD;
3126
3127 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3128 if (m != NULL) {
3129 MBUF_INIT(m, hdr, type);
3130 mtype_stat_inc(type);
3131 mtype_stat_dec(MT_FREE);
3132 #if CONFIG_MACF_NET
3133 if (hdr && mac_init_mbuf(m, wait) != 0) {
3134 m_free(m);
3135 return (NULL);
3136 }
3137 #endif /* MAC_NET */
3138 }
3139 return (m);
3140 }
3141
3142 /*
3143 * Space allocation routines; these are also available as macros
3144 * for critical paths.
3145 */
3146 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3147 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3148 #define _M_RETRY(wait, type) _M_GET(wait, type)
3149 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3150 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3151 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
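/*
 * Typical usage from an in-kernel caller (a hypothetical sketch, not
 * code from this file):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		... copy data in via MTOD(m, caddr_t) and set m_len ...
 *		m_freem(m);	(or hand the chain off to the stack)
 *	}
 *
 * A non-blocking (M_DONTWAIT) allocation may fail and must be checked;
 * M_WAIT callers may block instead.
 */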
3152
3153 struct mbuf *
3154 m_get(int wait, int type)
3155 {
3156 return (_M_GET(wait, type));
3157 }
3158
3159 struct mbuf *
3160 m_gethdr(int wait, int type)
3161 {
3162 return (_M_GETHDR(wait, type));
3163 }
3164
3165 struct mbuf *
3166 m_retry(int wait, int type)
3167 {
3168 return (_M_RETRY(wait, type));
3169 }
3170
3171 struct mbuf *
3172 m_retryhdr(int wait, int type)
3173 {
3174 return (_M_RETRYHDR(wait, type));
3175 }
3176
3177 struct mbuf *
3178 m_getclr(int wait, int type)
3179 {
3180 struct mbuf *m;
3181
3182 _MGET(m, wait, type);
3183 if (m != NULL)
3184 bzero(MTOD(m, caddr_t), MLEN);
3185 return (m);
3186 }
3187
3188 struct mbuf *
3189 m_free(struct mbuf *m)
3190 {
3191 struct mbuf *n = m->m_next;
3192
3193 if (m->m_type == MT_FREE)
3194 panic("m_free: freeing an already freed mbuf");
3195
3196 /* Free the aux data and tags if there is any */
3197 if (m->m_flags & M_PKTHDR) {
3198 m_tag_delete_chain(m, NULL);
3199 }
3200
3201 if (m->m_flags & M_EXT) {
3202 u_int32_t refcnt;
3203 u_int32_t composite;
3204
3205 refcnt = m_decref(m);
3206 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3207 if (refcnt == 0 && !composite) {
3208 if (m->m_ext.ext_free == NULL) {
3209 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3210 } else if (m->m_ext.ext_free == m_bigfree) {
3211 mcache_free(m_cache(MC_BIGCL),
3212 m->m_ext.ext_buf);
3213 } else if (m->m_ext.ext_free == m_16kfree) {
3214 mcache_free(m_cache(MC_16KCL),
3215 m->m_ext.ext_buf);
3216 } else {
3217 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3218 m->m_ext.ext_size, m->m_ext.ext_arg);
3219 }
3220 mcache_free(ref_cache, MEXT_RFA(m));
3221 MEXT_RFA(m) = NULL;
3222 } else if (refcnt == 0 && composite) {
3223 VERIFY(m->m_type != MT_FREE);
3224
3225 mtype_stat_dec(m->m_type);
3226 mtype_stat_inc(MT_FREE);
3227
3228 m->m_type = MT_FREE;
3229 m->m_flags = M_EXT;
3230 m->m_len = 0;
3231 m->m_next = m->m_nextpkt = NULL;
3232
3233 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3234
3235 /* "Free" into the intermediate cache */
3236 if (m->m_ext.ext_free == NULL) {
3237 mcache_free(m_cache(MC_MBUF_CL), m);
3238 } else if (m->m_ext.ext_free == m_bigfree) {
3239 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3240 } else {
3241 VERIFY(m->m_ext.ext_free == m_16kfree);
3242 mcache_free(m_cache(MC_MBUF_16KCL), m);
3243 }
3244 return (n);
3245 }
3246 }
3247
3248 if (m->m_type != MT_FREE) {
3249 mtype_stat_dec(m->m_type);
3250 mtype_stat_inc(MT_FREE);
3251 }
3252
3253 m->m_type = MT_FREE;
3254 m->m_flags = m->m_len = 0;
3255 m->m_next = m->m_nextpkt = NULL;
3256
3257 mcache_free(m_cache(MC_MBUF), m);
3258
3259 return (n);
3260 }
3261
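/*
 * m_clattach() attaches a caller-supplied external buffer to an mbuf,
 * allocating the mbuf and/or the ext_ref structure as needed.  A
 * hypothetical usage sketch (driver_buf, driver_buf_free and
 * DRIVER_BUFSZ are illustrative names, not part of this file):
 *
 *	static void
 *	driver_buf_free(caddr_t buf, u_int size, caddr_t arg)
 *	{
 *		... return buf to the driver's private pool ...
 *	}
 *
 *	m = m_clattach(NULL, MT_DATA, driver_buf, driver_buf_free,
 *	    DRIVER_BUFSZ, NULL, M_DONTWAIT);
 *
 * When handed an mbuf that already carries a cluster, the code below
 * first drops that reference (reusing the ext_ref structure when it
 * can) before wiring up the new buffer via MEXT_INIT().
 */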
3262 __private_extern__ struct mbuf *
3263 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3264 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3265 int wait)
3266 {
3267 struct ext_ref *rfa = NULL;
3268
3269 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3270 return (NULL);
3271
3272 if (m->m_flags & M_EXT) {
3273 u_int32_t refcnt;
3274 u_int32_t composite;
3275
3276 refcnt = m_decref(m);
3277 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3278 if (refcnt == 0 && !composite) {
3279 if (m->m_ext.ext_free == NULL) {
3280 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3281 } else if (m->m_ext.ext_free == m_bigfree) {
3282 mcache_free(m_cache(MC_BIGCL),
3283 m->m_ext.ext_buf);
3284 } else if (m->m_ext.ext_free == m_16kfree) {
3285 mcache_free(m_cache(MC_16KCL),
3286 m->m_ext.ext_buf);
3287 } else {
3288 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3289 m->m_ext.ext_size, m->m_ext.ext_arg);
3290 }
3291 /* Re-use the reference structure */
3292 rfa = MEXT_RFA(m);
3293 } else if (refcnt == 0 && composite) {
3294 VERIFY(m->m_type != MT_FREE);
3295
3296 mtype_stat_dec(m->m_type);
3297 mtype_stat_inc(MT_FREE);
3298
3299 m->m_type = MT_FREE;
3300 m->m_flags = M_EXT;
3301 m->m_len = 0;
3302 m->m_next = m->m_nextpkt = NULL;
3303
3304 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3305
3306 /* "Free" into the intermediate cache */
3307 if (m->m_ext.ext_free == NULL) {
3308 mcache_free(m_cache(MC_MBUF_CL), m);
3309 } else if (m->m_ext.ext_free == m_bigfree) {
3310 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3311 } else {
3312 VERIFY(m->m_ext.ext_free == m_16kfree);
3313 mcache_free(m_cache(MC_MBUF_16KCL), m);
3314 }
3315 /*
3316 * Allocate a new mbuf, since we didn't divorce
3317 * the composite mbuf + cluster pair above.
3318 */
3319 if ((m = _M_GETHDR(wait, type)) == NULL)
3320 return (NULL);
3321 }
3322 }
3323
3324 if (rfa == NULL &&
3325 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3326 m_free(m);
3327 return (NULL);
3328 }
3329
3330 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3331
3332 return (m);
3333 }
3334
3335 /*
3336 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3337 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3338 */
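/*
 * A minimal usage sketch (hypothetical caller, not code from this file):
 *
 *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m != NULL) {
 *		... up to MCLBYTES of data fit at MTOD(m, caddr_t) ...
 *		m_freem(m);
 *	}
 *
 * Because the mbuf and cluster come from the composite MC_MBUF_CL cache
 * as a pre-constructed pair, this is cheaper than calling m_gethdr()
 * followed by m_mclget().
 */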
3339 struct mbuf *
3340 m_getcl(int wait, int type, int flags)
3341 {
3342 struct mbuf *m;
3343 int mcflags = MSLEEPF(wait);
3344 int hdr = (flags & M_PKTHDR);
3345
3346 /* Is this due to a non-blocking retry? If so, then try harder */
3347 if (mcflags & MCR_NOSLEEP)
3348 mcflags |= MCR_TRYHARD;
3349
3350 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3351 if (m != NULL) {
3352 u_int32_t flag;
3353 struct ext_ref *rfa;
3354 void *cl;
3355
3356 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3357 cl = m->m_ext.ext_buf;
3358 rfa = MEXT_RFA(m);
3359
3360 ASSERT(cl != NULL && rfa != NULL);
3361 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3362
3363 flag = MEXT_FLAGS(m);
3364
3365 MBUF_INIT(m, hdr, type);
3366 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3367
3368 mtype_stat_inc(type);
3369 mtype_stat_dec(MT_FREE);
3370 #if CONFIG_MACF_NET
3371 if (hdr && mac_init_mbuf(m, wait) != 0) {
3372 m_freem(m);
3373 return (NULL);
3374 }
3375 #endif /* MAC_NET */
3376 }
3377 return (m);
3378 }
3379
3380 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3381 struct mbuf *
3382 m_mclget(struct mbuf *m, int wait)
3383 {
3384 struct ext_ref *rfa;
3385
3386 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3387 return (m);
3388
3389 m->m_ext.ext_buf = m_mclalloc(wait);
3390 if (m->m_ext.ext_buf != NULL) {
3391 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3392 } else {
3393 mcache_free(ref_cache, rfa);
3394 }
3395 return (m);
3396 }
3397
3398 /* Allocate an mbuf cluster */
3399 caddr_t
3400 m_mclalloc(int wait)
3401 {
3402 int mcflags = MSLEEPF(wait);
3403
3404 /* Is this due to a non-blocking retry? If so, then try harder */
3405 if (mcflags & MCR_NOSLEEP)
3406 mcflags |= MCR_TRYHARD;
3407
3408 return (mcache_alloc(m_cache(MC_CL), mcflags));
3409 }
3410
3411 /* Free an mbuf cluster */
3412 void
3413 m_mclfree(caddr_t p)
3414 {
3415 mcache_free(m_cache(MC_CL), p);
3416 }
3417
3418 /*
3419 * m_mclhasreference() checks if a cluster of an mbuf is referenced by
3420 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3421 */
3422 int
3423 m_mclhasreference(struct mbuf *m)
3424 {
3425 if (!(m->m_flags & M_EXT))
3426 return (0);
3427
3428 ASSERT(MEXT_RFA(m) != NULL);
3429
3430 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3431 }
3432
3433 __private_extern__ caddr_t
3434 m_bigalloc(int wait)
3435 {
3436 int mcflags = MSLEEPF(wait);
3437
3438 /* Is this due to a non-blocking retry? If so, then try harder */
3439 if (mcflags & MCR_NOSLEEP)
3440 mcflags |= MCR_TRYHARD;
3441
3442 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3443 }
3444
3445 __private_extern__ void
3446 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3447 {
3448 mcache_free(m_cache(MC_BIGCL), p);
3449 }
3450
3451 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3452 __private_extern__ struct mbuf *
3453 m_mbigget(struct mbuf *m, int wait)
3454 {
3455 struct ext_ref *rfa;
3456
3457 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3458 return (m);
3459
3460 m->m_ext.ext_buf = m_bigalloc(wait);
3461 if (m->m_ext.ext_buf != NULL) {
3462 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3463 } else {
3464 mcache_free(ref_cache, rfa);
3465 }
3466 return (m);
3467 }
3468
3469 __private_extern__ caddr_t
3470 m_16kalloc(int wait)
3471 {
3472 int mcflags = MSLEEPF(wait);
3473
3474 /* Is this due to a non-blocking retry? If so, then try harder */
3475 if (mcflags & MCR_NOSLEEP)
3476 mcflags |= MCR_TRYHARD;
3477
3478 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3479 }
3480
3481 __private_extern__ void
3482 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3483 {
3484 mcache_free(m_cache(MC_16KCL), p);
3485 }
3486
3487 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3488 __private_extern__ struct mbuf *
3489 m_m16kget(struct mbuf *m, int wait)
3490 {
3491 struct ext_ref *rfa;
3492
3493 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3494 return (m);
3495
3496 m->m_ext.ext_buf = m_16kalloc(wait);
3497 if (m->m_ext.ext_buf != NULL) {
3498 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3499 } else {
3500 mcache_free(ref_cache, rfa);
3501 }
3502 return (m);
3503 }
3504
3505 /*
3506 * "Move" mbuf pkthdr from "from" to "to".
3507 * "from" must have M_PKTHDR set, and "to" must be empty.
3508 */
3509 void
3510 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3511 {
3512 /* We will be taking over the tags of 'to' */
3513 if (to->m_flags & M_PKTHDR)
3514 m_tag_delete_chain(to, NULL);
3515 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3516 m_tag_init(from); /* purge tags from src */
3517 m_prio_init(from); /* reset priority from src */
3518 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3519 if ((to->m_flags & M_EXT) == 0)
3520 to->m_data = to->m_pktdat;
3521 }
3522
3523 /*
3524 * Duplicate "from"'s mbuf pkthdr in "to".
3525 * "from" must have M_PKTHDR set, and "to" must be empty.
3526 * In particular, this does a deep copy of the packet tags.
3527 */
3528 static int
3529 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3530 {
3531 if (to->m_flags & M_PKTHDR)
3532 m_tag_delete_chain(to, NULL);
3533 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3534 if ((to->m_flags & M_EXT) == 0)
3535 to->m_data = to->m_pktdat;
3536 to->m_pkthdr = from->m_pkthdr;
3537 m_tag_init(to);
3538 return (m_tag_copy_chain(to, from, how));
3539 }
3540
3541 /*
3542 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3543 * if wantall is not set, return whatever number was available. Set up the
3544 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3545 * are chained on the m_nextpkt field. Any packets requested beyond this
3546 * are chained onto the last packet header's m_next field. The size of
3547 * the cluster is controlled by the parameter bufsize.
3548 */
3549 __private_extern__ struct mbuf *
3550 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3551 int wait, int wantall, size_t bufsize)
3552 {
3553 struct mbuf *m;
3554 struct mbuf **np, *top;
3555 unsigned int pnum, needed = *num_needed;
3556 mcache_obj_t *mp_list = NULL;
3557 int mcflags = MSLEEPF(wait);
3558 u_int32_t flag;
3559 struct ext_ref *rfa;
3560 mcache_t *cp;
3561 void *cl;
3562
3563 ASSERT(bufsize == m_maxsize(MC_CL) ||
3564 bufsize == m_maxsize(MC_BIGCL) ||
3565 bufsize == m_maxsize(MC_16KCL));
3566
3567 /*
3568 * Caller must first check for njcl because this
3569 * routine is internal and not exposed/used via KPI.
3570 */
3571 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3572
3573 top = NULL;
3574 np = &top;
3575 pnum = 0;
3576
3577 /*
3578 * If the caller doesn't insist on getting all the requested buffers,
3579 * or doesn't want to block, try hard to get what we can without
3580 * sleeping. This effectively overrides MCR_SLEEP, since this thread
3581 * will not go to sleep if we can't get all the buffers.
3582 */
3583 if (!wantall || (mcflags & MCR_NOSLEEP))
3584 mcflags |= MCR_TRYHARD;
3585
3586 /* Allocate the composite mbuf + cluster elements from the cache */
3587 if (bufsize == m_maxsize(MC_CL))
3588 cp = m_cache(MC_MBUF_CL);
3589 else if (bufsize == m_maxsize(MC_BIGCL))
3590 cp = m_cache(MC_MBUF_BIGCL);
3591 else
3592 cp = m_cache(MC_MBUF_16KCL);
3593 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3594
3595 for (pnum = 0; pnum < needed; pnum++) {
3596 m = (struct mbuf *)mp_list;
3597 mp_list = mp_list->obj_next;
3598
3599 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3600 cl = m->m_ext.ext_buf;
3601 rfa = MEXT_RFA(m);
3602
3603 ASSERT(cl != NULL && rfa != NULL);
3604 VERIFY(MBUF_IS_COMPOSITE(m));
3605
3606 flag = MEXT_FLAGS(m);
3607
3608 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3609 if (bufsize == m_maxsize(MC_16KCL)) {
3610 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3611 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3612 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3613 } else {
3614 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3615 }
3616
3617 if (num_with_pkthdrs > 0) {
3618 --num_with_pkthdrs;
3619 #if CONFIG_MACF_NET
3620 if (mac_mbuf_label_init(m, wait) != 0) {
3621 m_freem(m);
3622 break;
3623 }
3624 #endif /* CONFIG_MACF_NET */
3625 }
3626
3627 *np = m;
3628 if (num_with_pkthdrs > 0)
3629 np = &m->m_nextpkt;
3630 else
3631 np = &m->m_next;
3632 }
3633 ASSERT(pnum != *num_needed || mp_list == NULL);
3634 if (mp_list != NULL)
3635 mcache_free_ext(cp, mp_list);
3636
3637 if (pnum > 0) {
3638 mtype_stat_add(MT_DATA, pnum);
3639 mtype_stat_sub(MT_FREE, pnum);
3640 }
3641
3642 if (wantall && (pnum != *num_needed)) {
3643 if (top != NULL)
3644 m_freem_list(top);
3645 return (NULL);
3646 }
3647
3648 *num_needed = pnum;
3649 return (top);
3650 }
3651
3652 /*
3653 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3654 * wantall is not set, return whatever number were available. The size of
3655 * each mbuf in the list is controlled by the parameter packetlen. Each
3656 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3657 * in the chain is called a segment. If maxsegments is not null and the
3658 * value pointed to is not zero, it specifies the maximum number of
3659 * segments for a chain of mbufs. If maxsegments is null or the value
3660 * pointed to is zero, the caller has no restriction on the number of
3661 * segments. The actual number of segments of an mbuf chain is returned
3662 * in the value pointed to by maxsegments.
3663 */
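/*
 * Illustrative sketch (hypothetical values, not from the original source):
 * a caller might ask m_allocpacket_internal() for 16 packets of 3000 bytes
 * each, capped at 4 segments per chain. Both npkt and maxseg are in/out:
 * on return they hold the number of packets built and the segments used
 * per chain, respectively.
 *
 *	unsigned int npkt = 16;
 *	unsigned int maxseg = 4;
 *	struct mbuf *list;
 *
 *	list = m_allocpacket_internal(&npkt, 3000, &maxseg,
 *	    M_DONTWAIT, 0, 0);
 *	if (list == NULL)
 *		return (ENOBUFS);
 */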
3664 __private_extern__ struct mbuf *
3665 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3666 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3667 {
3668 struct mbuf **np, *top, *first = NULL;
3669 size_t bufsize, r_bufsize;
3670 unsigned int num = 0;
3671 unsigned int nsegs = 0;
3672 unsigned int needed, resid;
3673 int mcflags = MSLEEPF(wait);
3674 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3675 mcache_t *cp = NULL, *rcp = NULL;
3676
3677 if (*numlist == 0)
3678 return (NULL);
3679
3680 top = NULL;
3681 np = &top;
3682
3683 if (wantsize == 0) {
3684 if (packetlen <= MINCLSIZE) {
3685 bufsize = packetlen;
3686 } else if (packetlen > m_maxsize(MC_CL)) {
3687 /* Use 4KB if jumbo cluster pool isn't available */
3688 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3689 bufsize = m_maxsize(MC_BIGCL);
3690 else
3691 bufsize = m_maxsize(MC_16KCL);
3692 } else {
3693 bufsize = m_maxsize(MC_CL);
3694 }
3695 } else if (wantsize == m_maxsize(MC_CL) ||
3696 wantsize == m_maxsize(MC_BIGCL) ||
3697 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3698 bufsize = wantsize;
3699 } else {
3700 return (NULL);
3701 }
3702
3703 if (bufsize <= MHLEN) {
3704 nsegs = 1;
3705 } else if (bufsize <= MINCLSIZE) {
3706 if (maxsegments != NULL && *maxsegments == 1) {
3707 bufsize = m_maxsize(MC_CL);
3708 nsegs = 1;
3709 } else {
3710 nsegs = 2;
3711 }
3712 } else if (bufsize == m_maxsize(MC_16KCL)) {
3713 VERIFY(njcl > 0);
3714 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3715 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3716 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3717 } else {
3718 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3719 }
3720 if (maxsegments != NULL) {
3721 if (*maxsegments && nsegs > *maxsegments) {
3722 *maxsegments = nsegs;
3723 return (NULL);
3724 }
3725 *maxsegments = nsegs;
3726 }
3727
3728 /*
3729 * If the caller doesn't insist on getting all the requested buffers,
3730 * or doesn't want to block, try hard to get what we can without
3731 * sleeping. This effectively overrides MCR_SLEEP, since this thread
3732 * will not go to sleep if we can't get all the buffers.
3733 */
3734 if (!wantall || (mcflags & MCR_NOSLEEP))
3735 mcflags |= MCR_TRYHARD;
3736
3737 /*
3738 * Simple case where all elements in the lists/chains are mbufs.
3739 * Unless bufsize is greater than MHLEN, each segment chain is made
3740 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3741 * of 2 mbufs; the second one is used for the residual data, i.e.
3742 * the remaining data that cannot fit into the first mbuf.
3743 */
3744 if (bufsize <= MINCLSIZE) {
3745 /* Allocate the elements in one shot from the mbuf cache */
3746 ASSERT(bufsize <= MHLEN || nsegs == 2);
3747 cp = m_cache(MC_MBUF);
3748 needed = mcache_alloc_ext(cp, &mp_list,
3749 (*numlist) * nsegs, mcflags);
3750
3751 /*
3752 * The number of elements must be even if we are to use an
3753 * mbuf (instead of a cluster) to store the residual data.
3754 * If we couldn't allocate the requested number of mbufs,
3755 * trim the number down (if it's odd) in order to avoid
3756 * creating a partial segment chain.
3757 */
3758 if (bufsize > MHLEN && (needed & 0x1))
3759 needed--;
3760
3761 while (num < needed) {
3762 struct mbuf *m;
3763
3764 m = (struct mbuf *)mp_list;
3765 mp_list = mp_list->obj_next;
3766 ASSERT(m != NULL);
3767
3768 MBUF_INIT(m, 1, MT_DATA);
3769 #if CONFIG_MACF_NET
3770 if (mac_init_mbuf(m, wait) != 0) {
3771 m_free(m);
3772 break;
3773 }
3774 #endif /* CONFIG_MACF_NET */
3775 num++;
3776 if (bufsize > MHLEN) {
3777 /* A second mbuf for this segment chain */
3778 m->m_next = (struct mbuf *)mp_list;
3779 mp_list = mp_list->obj_next;
3780 ASSERT(m->m_next != NULL);
3781
3782 MBUF_INIT(m->m_next, 0, MT_DATA);
3783 num++;
3784 }
3785 *np = m;
3786 np = &m->m_nextpkt;
3787 }
3788 ASSERT(num != *numlist || mp_list == NULL);
3789
3790 if (num > 0) {
3791 mtype_stat_add(MT_DATA, num);
3792 mtype_stat_sub(MT_FREE, num);
3793 }
3794 num /= nsegs;
3795
3796 /* We've got them all; return to caller */
3797 if (num == *numlist)
3798 return (top);
3799
3800 goto fail;
3801 }
3802
3803 /*
3804 * Complex cases where elements are made up of one or more composite
3805 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3806 * be illustrated as follows:
3807 *
3808 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3809 *
3810 * Every composite mbuf + cluster element comes from the intermediate
3811 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3812 * the last composite element will come from the MC_MBUF_CL cache,
3813 * unless the residual data is larger than 2KB, in which case we use the
3814 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3815 * data is defined as extra data beyond the first element that cannot
3816 * fit into the previous element, i.e. there is no residual data if
3817 * the chain only has 1 segment.
3818 */
3819 r_bufsize = bufsize;
3820 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3821 if (resid > 0) {
3822 /* There is residual data; figure out the cluster size */
3823 if (wantsize == 0 && packetlen > MINCLSIZE) {
3824 /*
3825 * Caller didn't request that all of the segments
3826 * in the chain use the same cluster size; pick the
3827 * smallest cluster size that can hold the residual data.
3828 */
3829 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3830 r_bufsize = m_maxsize(MC_16KCL);
3831 else if (resid > m_maxsize(MC_CL))
3832 r_bufsize = m_maxsize(MC_BIGCL);
3833 else
3834 r_bufsize = m_maxsize(MC_CL);
3835 } else {
3836 /* Use the same cluster size as the other segments */
3837 resid = 0;
3838 }
3839 }
3840
3841 needed = *numlist;
3842 if (resid > 0) {
3843 /*
3844 * Attempt to allocate composite mbuf + cluster elements for
3845 * the residual data in each chain; record the number of such
3846 * elements that can be allocated so that we know how many
3847 * segment chains we can afford to create.
3848 */
3849 if (r_bufsize <= m_maxsize(MC_CL))
3850 rcp = m_cache(MC_MBUF_CL);
3851 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3852 rcp = m_cache(MC_MBUF_BIGCL);
3853 else
3854 rcp = m_cache(MC_MBUF_16KCL);
3855 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3856
3857 if (needed == 0)
3858 goto fail;
3859
3860 /* This is temporarily reduced for calculation */
3861 ASSERT(nsegs > 1);
3862 nsegs--;
3863 }
3864
3865 /*
3866 * Attempt to allocate the rest of the composite mbuf + cluster
3867 * elements for the number of segment chains that we need.
3868 */
3869 if (bufsize <= m_maxsize(MC_CL))
3870 cp = m_cache(MC_MBUF_CL);
3871 else if (bufsize <= m_maxsize(MC_BIGCL))
3872 cp = m_cache(MC_MBUF_BIGCL);
3873 else
3874 cp = m_cache(MC_MBUF_16KCL);
3875 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3876
3877 /* Round it down to avoid creating a partial segment chain */
3878 needed = (needed / nsegs) * nsegs;
3879 if (needed == 0)
3880 goto fail;
3881
3882 if (resid > 0) {
3883 /*
3884 * We're about to construct the chain(s); take into account
3885 * the number of segments we have created above to hold the
3886 * residual data for each chain, as well as restore the
3887 * original count of segments per chain.
3888 */
3889 ASSERT(nsegs > 0);
3890 needed += needed / nsegs;
3891 nsegs++;
3892 }
3893
3894 for (;;) {
3895 struct mbuf *m;
3896 u_int32_t flag;
3897 struct ext_ref *rfa;
3898 void *cl;
3899 int pkthdr;
3900
3901 ++num;
3902 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3903 m = (struct mbuf *)mp_list;
3904 mp_list = mp_list->obj_next;
3905 } else {
3906 m = (struct mbuf *)rmp_list;
3907 rmp_list = rmp_list->obj_next;
3908 }
3909 ASSERT(m != NULL);
3910 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3911 VERIFY(m->m_ext.ext_free == NULL ||
3912 m->m_ext.ext_free == m_bigfree ||
3913 m->m_ext.ext_free == m_16kfree);
3914
3915 cl = m->m_ext.ext_buf;
3916 rfa = MEXT_RFA(m);
3917
3918 ASSERT(cl != NULL && rfa != NULL);
3919 VERIFY(MBUF_IS_COMPOSITE(m));
3920
3921 flag = MEXT_FLAGS(m);
3922
3923 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3924 if (pkthdr)
3925 first = m;
3926 MBUF_INIT(m, pkthdr, MT_DATA);
3927 if (m->m_ext.ext_free == m_16kfree) {
3928 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3929 } else if (m->m_ext.ext_free == m_bigfree) {
3930 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3931 } else {
3932 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3933 }
3934 #if CONFIG_MACF_NET
3935 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
3936 --num;
3937 m_freem(m);
3938 break;
3939 }
3940 #endif /* CONFIG_MACF_NET */
3941
3942 *np = m;
3943 if ((num % nsegs) == 0)
3944 np = &first->m_nextpkt;
3945 else
3946 np = &m->m_next;
3947
3948 if (num == needed)
3949 break;
3950 }
3951
3952 if (num > 0) {
3953 mtype_stat_add(MT_DATA, num);
3954 mtype_stat_sub(MT_FREE, num);
3955 }
3956
3957 num /= nsegs;
3958
3959 /* We've got them all; return to caller */
3960 if (num == *numlist) {
3961 ASSERT(mp_list == NULL && rmp_list == NULL);
3962 return (top);
3963 }
3964
3965 fail:
3966 /* Free up what's left of the above */
3967 if (mp_list != NULL)
3968 mcache_free_ext(cp, mp_list);
3969 if (rmp_list != NULL)
3970 mcache_free_ext(rcp, rmp_list);
3971 if (wantall && top != NULL) {
3972 m_freem(top);
3973 return (NULL);
3974 }
3975 *numlist = num;
3976 return (top);
3977 }
3978
3979 /*
3980 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3981 * packets on the receive ring.
3982 */
3983 __private_extern__ struct mbuf *
3984 m_getpacket_how(int wait)
3985 {
3986 unsigned int num_needed = 1;
3987
3988 return (m_getpackets_internal(&num_needed, 1, wait, 1,
3989 m_maxsize(MC_CL)));
3990 }
3991
3992 /*
3993 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3994 * packets on the receive ring.
3995 */
3996 struct mbuf *
3997 m_getpacket(void)
3998 {
3999 unsigned int num_needed = 1;
4000
4001 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4002 m_maxsize(MC_CL)));
4003 }
4004
4005 /*
4006 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4007 * if this can't be met, return whatever number were available. Set up the
4008 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4009 * are chained on the m_nextpkt field. Any packets requested beyond this are
4010 * chained onto the last packet header's m_next field.
4011 */
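/*
 * Illustrative sketch (not taken from any particular driver): refill a
 * receive ring with a batch of 2KB-cluster packets in one call; the count
 * of 32 and the descriptor-attach step are hypothetical.
 *
 *	int nwanted = 32;
 *	struct mbuf *m, *next;
 *
 *	m = m_getpackets(nwanted, nwanted, M_DONTWAIT);
 *	while (m != NULL) {
 *		next = m->m_nextpkt;
 *		m->m_nextpkt = NULL;
 *		(attach m to a receive descriptor here)
 *		m = next;
 *	}
 */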
4012 struct mbuf *
4013 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4014 {
4015 unsigned int n = num_needed;
4016
4017 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4018 m_maxsize(MC_CL)));
4019 }
4020
4021 /*
4022 * Return a list of mbuf hdrs set up as packet hdrs chained together
4023 * on the m_nextpkt field.
4024 */
4025 struct mbuf *
4026 m_getpackethdrs(int num_needed, int how)
4027 {
4028 struct mbuf *m;
4029 struct mbuf **np, *top;
4030
4031 top = NULL;
4032 np = &top;
4033
4034 while (num_needed--) {
4035 m = _M_RETRYHDR(how, MT_DATA);
4036 if (m == NULL)
4037 break;
4038
4039 *np = m;
4040 np = &m->m_nextpkt;
4041 }
4042
4043 return (top);
4044 }
4045
4046 /*
4047 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4048 * of packets freed. Used by the drivers.
4049 */
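/*
 * Illustrative sketch (hypothetical names): a driver reclaiming completed
 * transmit buffers can link them through m_nextpkt and release the whole
 * batch, including each packet's m_next chain, with a single call.
 *
 *	int ntxdone;
 *
 *	ntxdone = m_freem_list(done_head);
 *	done_head = NULL;
 */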
4050 int
4051 m_freem_list(struct mbuf *m)
4052 {
4053 struct mbuf *nextpkt;
4054 mcache_obj_t *mp_list = NULL;
4055 mcache_obj_t *mcl_list = NULL;
4056 mcache_obj_t *mbc_list = NULL;
4057 mcache_obj_t *m16k_list = NULL;
4058 mcache_obj_t *m_mcl_list = NULL;
4059 mcache_obj_t *m_mbc_list = NULL;
4060 mcache_obj_t *m_m16k_list = NULL;
4061 mcache_obj_t *ref_list = NULL;
4062 int pktcount = 0;
4063 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4064
4065 while (m != NULL) {
4066 pktcount++;
4067
4068 nextpkt = m->m_nextpkt;
4069 m->m_nextpkt = NULL;
4070
4071 while (m != NULL) {
4072 struct mbuf *next = m->m_next;
4073 mcache_obj_t *o, *rfa;
4074 u_int32_t refcnt, composite;
4075
4076 if (m->m_type == MT_FREE)
4077 panic("m_free: freeing an already freed mbuf");
4078
4079 if (m->m_type != MT_FREE)
4080 mt_free++;
4081
4082 if (m->m_flags & M_PKTHDR) {
4083 m_tag_delete_chain(m, NULL);
4084 }
4085
4086 if (!(m->m_flags & M_EXT))
4087 goto simple_free;
4088
4089 o = (mcache_obj_t *)m->m_ext.ext_buf;
4090 refcnt = m_decref(m);
4091 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4092 if (refcnt == 0 && !composite) {
4093 if (m->m_ext.ext_free == NULL) {
4094 o->obj_next = mcl_list;
4095 mcl_list = o;
4096 } else if (m->m_ext.ext_free == m_bigfree) {
4097 o->obj_next = mbc_list;
4098 mbc_list = o;
4099 } else if (m->m_ext.ext_free == m_16kfree) {
4100 o->obj_next = m16k_list;
4101 m16k_list = o;
4102 } else {
4103 (*(m->m_ext.ext_free))((caddr_t)o,
4104 m->m_ext.ext_size,
4105 m->m_ext.ext_arg);
4106 }
4107 rfa = (mcache_obj_t *)MEXT_RFA(m);
4108 rfa->obj_next = ref_list;
4109 ref_list = rfa;
4110 MEXT_RFA(m) = NULL;
4111 } else if (refcnt == 0 && composite) {
4112 VERIFY(m->m_type != MT_FREE);
4113 /*
4114 * Amortize the costs of atomic operations
4115 * by doing them at the end, if possible.
4116 */
4117 if (m->m_type == MT_DATA)
4118 mt_data++;
4119 else if (m->m_type == MT_HEADER)
4120 mt_header++;
4121 else if (m->m_type == MT_SONAME)
4122 mt_soname++;
4123 else if (m->m_type == MT_TAG)
4124 mt_tag++;
4125 else
4126 mtype_stat_dec(m->m_type);
4127
4128 m->m_type = MT_FREE;
4129 m->m_flags = M_EXT;
4130 m->m_len = 0;
4131 m->m_next = m->m_nextpkt = NULL;
4132
4133 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4134
4135 /* "Free" into the intermediate cache */
4136 o = (mcache_obj_t *)m;
4137 if (m->m_ext.ext_free == NULL) {
4138 o->obj_next = m_mcl_list;
4139 m_mcl_list = o;
4140 } else if (m->m_ext.ext_free == m_bigfree) {
4141 o->obj_next = m_mbc_list;
4142 m_mbc_list = o;
4143 } else {
4144 VERIFY(m->m_ext.ext_free == m_16kfree);
4145 o->obj_next = m_m16k_list;
4146 m_m16k_list = o;
4147 }
4148 m = next;
4149 continue;
4150 }
4151 simple_free:
4152 /*
4153 * Amortize the costs of atomic operations
4154 * by doing them at the end, if possible.
4155 */
4156 if (m->m_type == MT_DATA)
4157 mt_data++;
4158 else if (m->m_type == MT_HEADER)
4159 mt_header++;
4160 else if (m->m_type == MT_SONAME)
4161 mt_soname++;
4162 else if (m->m_type == MT_TAG)
4163 mt_tag++;
4164 else if (m->m_type != MT_FREE)
4165 mtype_stat_dec(m->m_type);
4166
4167 m->m_type = MT_FREE;
4168 m->m_flags = m->m_len = 0;
4169 m->m_next = m->m_nextpkt = NULL;
4170
4171 ((mcache_obj_t *)m)->obj_next = mp_list;
4172 mp_list = (mcache_obj_t *)m;
4173
4174 m = next;
4175 }
4176
4177 m = nextpkt;
4178 }
4179
4180 if (mt_free > 0)
4181 mtype_stat_add(MT_FREE, mt_free);
4182 if (mt_data > 0)
4183 mtype_stat_sub(MT_DATA, mt_data);
4184 if (mt_header > 0)
4185 mtype_stat_sub(MT_HEADER, mt_header);
4186 if (mt_soname > 0)
4187 mtype_stat_sub(MT_SONAME, mt_soname);
4188 if (mt_tag > 0)
4189 mtype_stat_sub(MT_TAG, mt_tag);
4190
4191 if (mp_list != NULL)
4192 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4193 if (mcl_list != NULL)
4194 mcache_free_ext(m_cache(MC_CL), mcl_list);
4195 if (mbc_list != NULL)
4196 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4197 if (m16k_list != NULL)
4198 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4199 if (m_mcl_list != NULL)
4200 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4201 if (m_mbc_list != NULL)
4202 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4203 if (m_m16k_list != NULL)
4204 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4205 if (ref_list != NULL)
4206 mcache_free_ext(ref_cache, ref_list);
4207
4208 return (pktcount);
4209 }
4210
4211 void
4212 m_freem(struct mbuf *m)
4213 {
4214 while (m != NULL)
4215 m = m_free(m);
4216 }
4217
4218 /*
4219 * Mbuffer utility routines.
4220 */
4221
4222 /*
4223 * Compute the amount of space available before the current start
4224 * of data in an mbuf.
4225 */
4226 int
4227 m_leadingspace(struct mbuf *m)
4228 {
4229 if (m->m_flags & M_EXT) {
4230 if (MCLHASREFERENCE(m))
4231 return (0);
4232 return (m->m_data - m->m_ext.ext_buf);
4233 }
4234 if (m->m_flags & M_PKTHDR)
4235 return (m->m_data - m->m_pktdat);
4236 return (m->m_data - m->m_dat);
4237 }
4238
4239 /*
4240 * Compute the amount of space available after the end of data in an mbuf.
4241 */
4242 int
4243 m_trailingspace(struct mbuf *m)
4244 {
4245 if (m->m_flags & M_EXT) {
4246 if (MCLHASREFERENCE(m))
4247 return (0);
4248 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4249 (m->m_data + m->m_len));
4250 }
4251 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4252 }
4253
4254 /*
4255 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4256 * copy junk along. Does not adjust packet header length.
4257 */
4258 struct mbuf *
4259 m_prepend(struct mbuf *m, int len, int how)
4260 {
4261 struct mbuf *mn;
4262
4263 _MGET(mn, how, m->m_type);
4264 if (mn == NULL) {
4265 m_freem(m);
4266 return (NULL);
4267 }
4268 if (m->m_flags & M_PKTHDR) {
4269 M_COPY_PKTHDR(mn, m);
4270 m->m_flags &= ~M_PKTHDR;
4271 }
4272 mn->m_next = m;
4273 m = mn;
4274 if (len < MHLEN)
4275 MH_ALIGN(m, len);
4276 m->m_len = len;
4277 return (m);
4278 }
4279
4280 /*
4281 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4282 * chain, copy junk along, and adjust length.
4283 */
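/*
 * Illustrative sketch: the usual prepend pattern, here making room for a
 * hypothetical 14-byte link-layer header. If the prepend fails the chain
 * has already been freed, so the caller must not touch it again.
 *
 *	m = m_prepend_2(m, 14, M_DONTWAIT);
 *	if (m == NULL)
 *		return;
 */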
4284 struct mbuf *
4285 m_prepend_2(struct mbuf *m, int len, int how)
4286 {
4287 if (M_LEADINGSPACE(m) >= len) {
4288 m->m_data -= len;
4289 m->m_len += len;
4290 } else {
4291 m = m_prepend(m, len, how);
4292 }
4293 if ((m) && (m->m_flags & M_PKTHDR))
4294 m->m_pkthdr.len += len;
4295 return (m);
4296 }
4297
4298 /*
4299 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4300 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4301 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4302 */
4303 int MCFail;
4304
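/*
 * Illustrative sketch: take a shallow, reference-counted copy of a whole
 * packet before handing the original down a path that consumes it.
 * Cluster-backed data is shared via m_incref(), not duplicated.
 *
 *	struct mbuf *copy;
 *
 *	copy = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
 *	if (copy == NULL)
 *		return (ENOBUFS);
 */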
4305 struct mbuf *
4306 m_copym(struct mbuf *m, int off0, int len, int wait)
4307 {
4308 struct mbuf *n, *mhdr = NULL, **np;
4309 int off = off0;
4310 struct mbuf *top;
4311 int copyhdr = 0;
4312
4313 if (off < 0 || len < 0)
4314 panic("m_copym: invalid offset %d or len %d", off, len);
4315
4316 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4317 mhdr = m;
4318 copyhdr = 1;
4319 }
4320
4321 while (off >= m->m_len) {
4322 if (m->m_next == NULL)
4323 panic("m_copym: invalid mbuf chain");
4324 off -= m->m_len;
4325 m = m->m_next;
4326 }
4327 np = &top;
4328 top = NULL;
4329
4330 while (len > 0) {
4331 if (m == NULL) {
4332 if (len != M_COPYALL)
4333 panic("m_copym: len != M_COPYALL");
4334 break;
4335 }
4336
4337 n = _M_RETRY(wait, m->m_type);
4338 *np = n;
4339
4340 if (n == NULL)
4341 goto nospace;
4342
4343 if (copyhdr != 0) {
4344 M_COPY_PKTHDR(n, mhdr);
4345 if (len == M_COPYALL)
4346 n->m_pkthdr.len -= off0;
4347 else
4348 n->m_pkthdr.len = len;
4349 copyhdr = 0;
4350 }
4351 if (len == M_COPYALL) {
4352 if (MIN(len, (m->m_len - off)) == len) {
4353 printf("m->m_len %d - off %d = %d, %d\n",
4354 m->m_len, off, m->m_len - off,
4355 MIN(len, (m->m_len - off)));
4356 }
4357 }
4358 n->m_len = MIN(len, (m->m_len - off));
4359 if (n->m_len == M_COPYALL) {
4360 printf("n->m_len == M_COPYALL, fixing\n");
4361 n->m_len = MHLEN;
4362 }
4363 if (m->m_flags & M_EXT) {
4364 n->m_ext = m->m_ext;
4365 m_incref(m);
4366 n->m_data = m->m_data + off;
4367 n->m_flags |= M_EXT;
4368 } else {
4369 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4370 (unsigned)n->m_len);
4371 }
4372 if (len != M_COPYALL)
4373 len -= n->m_len;
4374 off = 0;
4375 m = m->m_next;
4376 np = &n->m_next;
4377 }
4378
4379 if (top == NULL)
4380 MCFail++;
4381
4382 return (top);
4383 nospace:
4384
4385 m_freem(top);
4386 MCFail++;
4387 return (NULL);
4388 }
4389
4390 /*
4391 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4392 * within this routine. Also, the last mbuf and offset accessed are passed
4393 * out and can be passed back in to avoid having to rescan the entire mbuf
4394 * list (normally hung off of the socket).
4395 */
4396 struct mbuf *
4397 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4398 struct mbuf **m_lastm, int *m_off)
4399 {
4400 struct mbuf *n, **np = NULL;
4401 int off = off0, len = len0;
4402 struct mbuf *top = NULL;
4403 int mcflags = MSLEEPF(wait);
4404 int copyhdr = 0;
4405 int type = 0;
4406 mcache_obj_t *list = NULL;
4407 int needed = 0;
4408
4409 if (off == 0 && (m->m_flags & M_PKTHDR))
4410 copyhdr = 1;
4411
4412 if (*m_lastm != NULL) {
4413 m = *m_lastm;
4414 off = *m_off;
4415 } else {
4416 while (off >= m->m_len) {
4417 off -= m->m_len;
4418 m = m->m_next;
4419 }
4420 }
4421
4422 n = m;
4423 while (len > 0) {
4424 needed++;
4425 ASSERT(n != NULL);
4426 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4427 n = n->m_next;
4428 }
4429 needed++;
4430 len = len0;
4431
4432 /*
4433 * If the caller doesn't want to be put to sleep, mark it with
4434 * MCR_TRYHARD so that we may reclaim buffers from other places
4435 * before giving up.
4436 */
4437 if (mcflags & MCR_NOSLEEP)
4438 mcflags |= MCR_TRYHARD;
4439
4440 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4441 mcflags) != needed)
4442 goto nospace;
4443
4444 needed = 0;
4445 while (len > 0) {
4446 n = (struct mbuf *)list;
4447 list = list->obj_next;
4448 ASSERT(n != NULL && m != NULL);
4449
4450 type = (top == NULL) ? MT_HEADER : m->m_type;
4451 MBUF_INIT(n, (top == NULL), type);
4452 #if CONFIG_MACF_NET
4453 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4454 mtype_stat_inc(MT_HEADER);
4455 mtype_stat_dec(MT_FREE);
4456 m_free(n);
4457 goto nospace;
4458 }
4459 #endif /* CONFIG_MACF_NET */
4460
4461 if (top == NULL) {
4462 top = n;
4463 np = &top->m_next;
4464 continue;
4465 } else {
4466 needed++;
4467 *np = n;
4468 }
4469
4470 if (copyhdr) {
4471 M_COPY_PKTHDR(n, m);
4472 n->m_pkthdr.len = len;
4473 copyhdr = 0;
4474 }
4475 n->m_len = MIN(len, (m->m_len - off));
4476
4477 if (m->m_flags & M_EXT) {
4478 n->m_ext = m->m_ext;
4479 m_incref(m);
4480 n->m_data = m->m_data + off;
4481 n->m_flags |= M_EXT;
4482 } else {
4483 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4484 (unsigned)n->m_len);
4485 }
4486 len -= n->m_len;
4487
4488 if (len == 0) {
4489 if ((off + n->m_len) == m->m_len) {
4490 *m_lastm = m->m_next;
4491 *m_off = 0;
4492 } else {
4493 *m_lastm = m;
4494 *m_off = off + n->m_len;
4495 }
4496 break;
4497 }
4498 off = 0;
4499 m = m->m_next;
4500 np = &n->m_next;
4501 }
4502
4503 mtype_stat_inc(MT_HEADER);
4504 mtype_stat_add(type, needed);
4505 mtype_stat_sub(MT_FREE, needed + 1);
4506
4507 ASSERT(list == NULL);
4508 return (top);
4509
4510 nospace:
4511 if (list != NULL)
4512 mcache_free_ext(m_cache(MC_MBUF), list);
4513 if (top != NULL)
4514 m_freem(top);
4515 MCFail++;
4516 return (NULL);
4517 }
4518
4519 /*
4520 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4521 * continuing for "len" bytes, into the indicated buffer.
4522 */
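/*
 * Illustrative sketch (hypothetical 20-byte header): copy a fixed-size
 * header out of a chain into a local buffer without modifying the chain.
 * The length check matters because m_copydata() panics on a short chain.
 *
 *	char hdr[20];
 *
 *	if (m_length(m) >= sizeof (hdr))
 *		m_copydata(m, 0, sizeof (hdr), hdr);
 */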
4523 void
4524 m_copydata(struct mbuf *m, int off, int len, void *vp)
4525 {
4526 unsigned count;
4527 char *cp = vp;
4528
4529 if (off < 0 || len < 0)
4530 panic("m_copydata: invalid offset %d or len %d", off, len);
4531
4532 while (off > 0) {
4533 if (m == NULL)
4534 panic("m_copydata: invalid mbuf chain");
4535 if (off < m->m_len)
4536 break;
4537 off -= m->m_len;
4538 m = m->m_next;
4539 }
4540 while (len > 0) {
4541 if (m == NULL)
4542 panic("m_copydata: invalid mbuf chain");
4543 count = MIN(m->m_len - off, len);
4544 bcopy(MTOD(m, caddr_t) + off, cp, count);
4545 len -= count;
4546 cp += count;
4547 off = 0;
4548 m = m->m_next;
4549 }
4550 }
4551
4552 /*
4553 * Concatenate mbuf chain n to m. Both chains must be of the same type
4554 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4555 */
4556 void
4557 m_cat(struct mbuf *m, struct mbuf *n)
4558 {
4559 while (m->m_next)
4560 m = m->m_next;
4561 while (n) {
4562 if ((m->m_flags & M_EXT) ||
4563 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4564 /* just join the two chains */
4565 m->m_next = n;
4566 return;
4567 }
4568 /* splat the data from one into the other */
4569 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4570 (u_int)n->m_len);
4571 m->m_len += n->m_len;
4572 n = m_free(n);
4573 }
4574 }
4575
4576 void
4577 m_adj(struct mbuf *mp, int req_len)
4578 {
4579 int len = req_len;
4580 struct mbuf *m;
4581 int count;
4582
4583 if ((m = mp) == NULL)
4584 return;
4585 if (len >= 0) {
4586 /*
4587 * Trim from head.
4588 */
4589 while (m != NULL && len > 0) {
4590 if (m->m_len <= len) {
4591 len -= m->m_len;
4592 m->m_len = 0;
4593 m = m->m_next;
4594 } else {
4595 m->m_len -= len;
4596 m->m_data += len;
4597 len = 0;
4598 }
4599 }
4600 m = mp;
4601 if (m->m_flags & M_PKTHDR)
4602 m->m_pkthdr.len -= (req_len - len);
4603 } else {
4604 /*
4605 * Trim from tail. Scan the mbuf chain,
4606 * calculating its length and finding the last mbuf.
4607 * If the adjustment only affects this mbuf, then just
4608 * adjust and return. Otherwise, rescan and truncate
4609 * after the remaining size.
4610 */
4611 len = -len;
4612 count = 0;
4613 for (;;) {
4614 count += m->m_len;
4615 if (m->m_next == (struct mbuf *)0)
4616 break;
4617 m = m->m_next;
4618 }
4619 if (m->m_len >= len) {
4620 m->m_len -= len;
4621 m = mp;
4622 if (m->m_flags & M_PKTHDR)
4623 m->m_pkthdr.len -= len;
4624 return;
4625 }
4626 count -= len;
4627 if (count < 0)
4628 count = 0;
4629 /*
4630 * Correct length for chain is "count".
4631 * Find the mbuf with last data, adjust its length,
4632 * and toss data from remaining mbufs on chain.
4633 */
4634 m = mp;
4635 if (m->m_flags & M_PKTHDR)
4636 m->m_pkthdr.len = count;
4637 for (; m; m = m->m_next) {
4638 if (m->m_len >= count) {
4639 m->m_len = count;
4640 break;
4641 }
4642 count -= m->m_len;
4643 }
4644 while ((m = m->m_next))
4645 m->m_len = 0;
4646 }
4647 }
4648
4649 /*
4650 * Rearrange an mbuf chain so that len bytes are contiguous
4651 * and in the data area of an mbuf (so that mtod and dtom
4652 * will work for a structure of size len). Returns the resulting
4653 * mbuf chain on success, frees it and returns null on failure.
4654 * If there is room, it will add up to max_protohdr-len extra bytes to the
4655 * contiguous region in an attempt to avoid being called next time.
4656 */
4657 int MPFail;
4658
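/*
 * Illustrative sketch of the classic pullup idiom, using a hypothetical
 * 20-byte fixed-size header: make the header contiguous before casting
 * m_data to a structure pointer. On failure the chain has been freed.
 *
 *	if (m->m_len < 20 && (m = m_pullup(m, 20)) == NULL)
 *		return;
 */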
4659 struct mbuf *
4660 m_pullup(struct mbuf *n, int len)
4661 {
4662 struct mbuf *m;
4663 int count;
4664 int space;
4665
4666 /*
4667 * If first mbuf has no cluster, and has room for len bytes
4668 * without shifting current data, pullup into it,
4669 * otherwise allocate a new mbuf to prepend to the chain.
4670 */
4671 if ((n->m_flags & M_EXT) == 0 &&
4672 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4673 if (n->m_len >= len)
4674 return (n);
4675 m = n;
4676 n = n->m_next;
4677 len -= m->m_len;
4678 } else {
4679 if (len > MHLEN)
4680 goto bad;
4681 _MGET(m, M_DONTWAIT, n->m_type);
4682 if (m == 0)
4683 goto bad;
4684 m->m_len = 0;
4685 if (n->m_flags & M_PKTHDR) {
4686 M_COPY_PKTHDR(m, n);
4687 n->m_flags &= ~M_PKTHDR;
4688 }
4689 }
4690 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4691 do {
4692 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4693 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4694 (unsigned)count);
4695 len -= count;
4696 m->m_len += count;
4697 n->m_len -= count;
4698 space -= count;
4699 if (n->m_len)
4700 n->m_data += count;
4701 else
4702 n = m_free(n);
4703 } while (len > 0 && n);
4704 if (len > 0) {
4705 (void) m_free(m);
4706 goto bad;
4707 }
4708 m->m_next = n;
4709 return (m);
4710 bad:
4711 m_freem(n);
4712 MPFail++;
4713 return (0);
4714 }
4715
4716 /*
4717 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4718 * the amount of empty space before the data in the new mbuf to be specified
4719 * (in the event that the caller expects to prepend later).
4720 */
4721 __private_extern__ int MSFail = 0;
4722
4723 __private_extern__ struct mbuf *
4724 m_copyup(struct mbuf *n, int len, int dstoff)
4725 {
4726 struct mbuf *m;
4727 int count, space;
4728
4729 if (len > (MHLEN - dstoff))
4730 goto bad;
4731 MGET(m, M_DONTWAIT, n->m_type);
4732 if (m == NULL)
4733 goto bad;
4734 m->m_len = 0;
4735 if (n->m_flags & M_PKTHDR) {
4736 m_copy_pkthdr(m, n);
4737 n->m_flags &= ~M_PKTHDR;
4738 }
4739 m->m_data += dstoff;
4740 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4741 do {
4742 count = min(min(max(len, max_protohdr), space), n->m_len);
4743 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4744 (unsigned)count);
4745 len -= count;
4746 m->m_len += count;
4747 n->m_len -= count;
4748 space -= count;
4749 if (n->m_len)
4750 n->m_data += count;
4751 else
4752 n = m_free(n);
4753 } while (len > 0 && n);
4754 if (len > 0) {
4755 (void) m_free(m);
4756 goto bad;
4757 }
4758 m->m_next = n;
4759 return (m);
4760 bad:
4761 m_freem(n);
4762 MSFail++;
4763 return (NULL);
4764 }
4765
4766 /*
4767 * Partition an mbuf chain in two pieces, returning the tail --
4768 * all but the first len0 bytes. In case of failure, it returns NULL and
4769 * attempts to restore the chain to its original state.
4770 */
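/*
 * Illustrative sketch: detach everything after a hypothetical 40-byte
 * header so the header and the payload can be processed as separate chains.
 *
 *	struct mbuf *payload;
 *
 *	payload = m_split(m, 40, M_DONTWAIT);
 *	if (payload == NULL)
 *		return (ENOBUFS);
 */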
4771 struct mbuf *
4772 m_split(struct mbuf *m0, int len0, int wait)
4773 {
4774 return (m_split0(m0, len0, wait, 1));
4775 }
4776
4777 static struct mbuf *
4778 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4779 {
4780 struct mbuf *m, *n;
4781 unsigned len = len0, remain;
4782
4783 for (m = m0; m && len > m->m_len; m = m->m_next)
4784 len -= m->m_len;
4785 if (m == NULL)
4786 return (NULL);
4787 remain = m->m_len - len;
4788 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4789 _MGETHDR(n, wait, m0->m_type);
4790 if (n == NULL)
4791 return (NULL);
4792 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4793 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4794 m0->m_pkthdr.len = len0;
4795 if (m->m_flags & M_EXT)
4796 goto extpacket;
4797 if (remain > MHLEN) {
4798 /* m can't be the lead packet */
4799 MH_ALIGN(n, 0);
4800 n->m_next = m_split(m, len, wait);
4801 if (n->m_next == NULL) {
4802 (void) m_free(n);
4803 return (NULL);
4804 } else
4805 return (n);
4806 } else
4807 MH_ALIGN(n, remain);
4808 } else if (remain == 0) {
4809 n = m->m_next;
4810 m->m_next = NULL;
4811 return (n);
4812 } else {
4813 _MGET(n, wait, m->m_type);
4814 if (n == NULL)
4815 return (NULL);
4816 M_ALIGN(n, remain);
4817 }
4818 extpacket:
4819 if (m->m_flags & M_EXT) {
4820 n->m_flags |= M_EXT;
4821 n->m_ext = m->m_ext;
4822 m_incref(m);
4823 n->m_data = m->m_data + len;
4824 } else {
4825 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4826 }
4827 n->m_len = remain;
4828 m->m_len = len;
4829 n->m_next = m->m_next;
4830 m->m_next = NULL;
4831 return (n);
4832 }
4833
4834 /*
4835 * Routine to copy from device local memory into mbufs.
4836 */
4837 struct mbuf *
4838 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4839 void (*copy)(const void *, void *, size_t))
4840 {
4841 struct mbuf *m;
4842 struct mbuf *top = NULL, **mp = &top;
4843 int off = off0, len;
4844 char *cp;
4845 char *epkt;
4846
4847 cp = buf;
4848 epkt = cp + totlen;
4849 if (off) {
4850 /*
4851 * If 'off' is non-zero, the packet is trailer-encapsulated,
4852 * so we have to skip the type and length fields.
4853 */
4854 cp += off + 2 * sizeof (u_int16_t);
4855 totlen -= 2 * sizeof (u_int16_t);
4856 }
4857 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4858 if (m == NULL)
4859 return (NULL);
4860 m->m_pkthdr.rcvif = ifp;
4861 m->m_pkthdr.len = totlen;
4862 m->m_len = MHLEN;
4863
4864 while (totlen > 0) {
4865 if (top != NULL) {
4866 _MGET(m, M_DONTWAIT, MT_DATA);
4867 if (m == NULL) {
4868 m_freem(top);
4869 return (NULL);
4870 }
4871 m->m_len = MLEN;
4872 }
4873 len = MIN(totlen, epkt - cp);
4874 if (len >= MINCLSIZE) {
4875 MCLGET(m, M_DONTWAIT);
4876 if (m->m_flags & M_EXT) {
4877 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4878 } else {
4879 /* give up if no cluster mbufs are available */
4880 if (top != NULL)
4881 m_freem(top);
4882 m_freem(m);
4883 return (NULL);
4884 }
4885 } else {
4886 /*
4887 * Place initial small packet/header at end of mbuf.
4888 */
4889 if (len < m->m_len) {
4890 if (top == NULL &&
4891 len + max_linkhdr <= m->m_len)
4892 m->m_data += max_linkhdr;
4893 m->m_len = len;
4894 } else {
4895 len = m->m_len;
4896 }
4897 }
4898 if (copy)
4899 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4900 else
4901 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4902 cp += len;
4903 *mp = m;
4904 mp = &m->m_next;
4905 totlen -= len;
4906 if (cp == epkt)
4907 cp = buf;
4908 }
4909 return (top);
4910 }
4911
4912 #ifndef MBUF_GROWTH_NORMAL_THRESH
4913 #define MBUF_GROWTH_NORMAL_THRESH 25
4914 #endif
4915
4916 /*
4917 * Cluster freelist allocation check.
4918 */
4919 static int
4920 m_howmany(int num, size_t bufsize)
4921 {
4922 int i = 0, j = 0;
4923 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
4924 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
4925 u_int32_t sumclusters, freeclusters;
4926 u_int32_t percent_pool, percent_kmem;
4927 u_int32_t mb_growth, mb_growth_thresh;
4928
4929 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
4930 bufsize == m_maxsize(MC_16KCL));
4931
4932 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4933
4934 /* Numbers in 2K cluster units */
4935 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
4936 m_clusters = m_total(MC_CL);
4937 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
4938 m_16kclusters = m_total(MC_16KCL);
4939 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
4940
4941 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
4942 m_clfree = m_infree(MC_CL);
4943 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
4944 m_16kclfree = m_infree(MC_16KCL);
4945 freeclusters = m_mbfree + m_clfree + m_bigclfree;
4946
4947 /* Bail if we've maxed out the mbuf memory map */
4948 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
4949 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
4950 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
4951 return (0);
4952 }
4953
4954 if (bufsize == m_maxsize(MC_BIGCL)) {
4955 /* Under minimum */
4956 if (m_bigclusters < m_minlimit(MC_BIGCL))
4957 return (m_minlimit(MC_BIGCL) - m_bigclusters);
4958
4959 percent_pool =
4960 ((sumclusters - freeclusters) * 100) / sumclusters;
4961 percent_kmem = (sumclusters * 100) / nclusters;
4962
4963 /*
4964 * If a light/normal user, grow conservatively (75%)
4965 * If a heavy user, grow aggressively (50%)
4966 */
4967 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
4968 mb_growth = MB_GROWTH_NORMAL;
4969 else
4970 mb_growth = MB_GROWTH_AGGRESSIVE;
4971
4972 if (percent_kmem < 5) {
4973 /* For initial allocations */
4974 i = num;
4975 } else {
4976 /* Return if >= MBIGCL_LOWAT clusters available */
4977 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
4978 m_total(MC_BIGCL) >=
4979 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
4980 return (0);
4981
4982 /* Ensure at least num clusters are accessible */
4983 if (num >= m_infree(MC_BIGCL))
4984 i = num - m_infree(MC_BIGCL);
4985 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
4986 j = num - (m_total(MC_BIGCL) -
4987 m_minlimit(MC_BIGCL));
4988
4989 i = MAX(i, j);
4990
4991 /*
4992 * Grow pool if percent_pool > 75 (normal growth)
4993 * or percent_pool > 50 (aggressive growth).
4994 */
4995 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
4996 if (percent_pool > mb_growth_thresh)
4997 j = ((sumclusters + num) >> mb_growth) -
4998 freeclusters;
4999 i = MAX(i, j);
5000 }
5001
5002 /* Check to ensure we didn't go over limits */
5003 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5004 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5005 if ((i << 1) + sumclusters >= nclusters)
5006 i = (nclusters - sumclusters) >> 1;
5007 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5008 VERIFY(sumclusters + (i << 1) <= nclusters);
5009
5010 } else { /* 16K CL */
5011 VERIFY(njcl > 0);
5012 /* Under minimum */
5013 if (m_16kclusters < MIN16KCL)
5014 return (MIN16KCL - m_16kclusters);
5015 if (m_16kclfree >= M16KCL_LOWAT)
5016 return (0);
5017
5018 /* Ensure at least num clusters are available */
5019 if (num >= m_16kclfree)
5020 i = num - m_16kclfree;
5021
5022 /* Always grow 16KCL pool aggressively */
5023 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5024 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5025 i = MAX(i, j);
5026
5027 /* Check to ensure we don't go over limit */
5028 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5029 i = m_maxlimit(MC_16KCL) - m_16kclusters;
5030 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5031 }
5032 return (i);
5033 }
5034 /*
5035 * Return the number of bytes in the mbuf chain, m.
5036 */
5037 unsigned int
5038 m_length(struct mbuf *m)
5039 {
5040 struct mbuf *m0;
5041 unsigned int pktlen;
5042
5043 if (m->m_flags & M_PKTHDR)
5044 return (m->m_pkthdr.len);
5045
5046 pktlen = 0;
5047 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5048 pktlen += m0->m_len;
5049 return (pktlen);
5050 }
5051
5052 /*
5053 * Copy data from a buffer back into the indicated mbuf chain,
5054 * starting "off" bytes from the beginning, extending the mbuf
5055 * chain if necessary.
5056 */
5057 void
5058 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5059 {
5060 #if DEBUG
5061 struct mbuf *origm = m0;
5062 int error;
5063 #endif /* DEBUG */
5064
5065 if (m0 == NULL)
5066 return;
5067
5068 #if DEBUG
5069 error =
5070 #endif /* DEBUG */
5071 m_copyback0(&m0, off, len, cp,
5072 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5073
5074 #if DEBUG
5075 if (error != 0 || (m0 != NULL && origm != m0))
5076 panic("m_copyback");
5077 #endif /* DEBUG */
5078 }
5079
5080 struct mbuf *
5081 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5082 {
5083 int error;
5084
5085 /* don't support chain expansion */
5086 VERIFY(off + len <= m_length(m0));
5087
5088 error = m_copyback0(&m0, off, len, cp,
5089 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5090 if (error) {
5091 /*
5092 * no way to recover from partial success.
5093 * just free the chain.
5094 */
5095 m_freem(m0);
5096 return (NULL);
5097 }
5098 return (m0);
5099 }
5100
5101 /*
5102 * m_makewritable: ensure the specified range is writable.
5103 */
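/*
 * Illustrative sketch (hypothetical offset and length): before storing
 * into a possibly shared, read-only chain, make just the affected byte
 * range writable; on success bytes 16..19 may be modified in place.
 *
 *	if (m_makewritable(&m, 16, 4, M_DONTWAIT) != 0)
 *		return (ENOBUFS);
 */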
5104 int
5105 m_makewritable(struct mbuf **mp, int off, int len, int how)
5106 {
5107 int error;
5108 #if DEBUG
5109 struct mbuf *n;
5110 int origlen, reslen;
5111
5112 origlen = m_length(*mp);
5113 #endif /* DEBUG */
5114
5115 #if 0 /* M_COPYALL is large enough */
5116 if (len == M_COPYALL)
5117 len = m_length(*mp) - off; /* XXX */
5118 #endif
5119
5120 error = m_copyback0(mp, off, len, NULL,
5121 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5122
5123 #if DEBUG
5124 reslen = 0;
5125 for (n = *mp; n; n = n->m_next)
5126 reslen += n->m_len;
5127 if (origlen != reslen)
5128 panic("m_makewritable: length changed");
5129 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5130 panic("m_makewritable: inconsist");
5131 #endif /* DEBUG */
5132
5133 return (error);
5134 }
5135
5136 static int
5137 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5138 int how)
5139 {
5140 int mlen;
5141 struct mbuf *m, *n;
5142 struct mbuf **mp;
5143 int totlen = 0;
5144 const char *cp = vp;
5145
5146 VERIFY(mp0 != NULL);
5147 VERIFY(*mp0 != NULL);
5148 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5149 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5150
5151 /*
5152 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5153 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5154 */
5155
5156 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5157
5158 mp = mp0;
5159 m = *mp;
5160 while (off > (mlen = m->m_len)) {
5161 off -= mlen;
5162 totlen += mlen;
5163 if (m->m_next == NULL) {
5164 int tspace;
5165 extend:
5166 if (!(flags & M_COPYBACK0_EXTEND))
5167 goto out;
5168
5169 /*
5170 * try to make some space at the end of "m".
5171 */
5172
5173 mlen = m->m_len;
5174 if (off + len >= MINCLSIZE &&
5175 !(m->m_flags & M_EXT) && m->m_len == 0) {
5176 MCLGET(m, how);
5177 }
5178 tspace = M_TRAILINGSPACE(m);
5179 if (tspace > 0) {
5180 tspace = MIN(tspace, off + len);
5181 VERIFY(tspace > 0);
5182 bzero(mtod(m, char *) + m->m_len,
5183 MIN(off, tspace));
5184 m->m_len += tspace;
5185 off += mlen;
5186 totlen -= mlen;
5187 continue;
5188 }
5189
5190 /*
5191 * need to allocate an mbuf.
5192 */
5193
5194 if (off + len >= MINCLSIZE) {
5195 n = m_getcl(how, m->m_type, 0);
5196 } else {
5197 n = _M_GET(how, m->m_type);
5198 }
5199 if (n == NULL) {
5200 goto out;
5201 }
5202 n->m_len = 0;
5203 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5204 bzero(mtod(n, char *), MIN(n->m_len, off));
5205 m->m_next = n;
5206 }
5207 mp = &m->m_next;
5208 m = m->m_next;
5209 }
5210 while (len > 0) {
5211 mlen = m->m_len - off;
5212 if (mlen != 0 && m_mclhasreference(m)) {
5213 char *datap;
5214 int eatlen;
5215
5216 /*
5217 * this mbuf is read-only.
5218 * allocate a new writable mbuf and try again.
5219 */
5220
5221 #if defined(DIAGNOSTIC)
5222 if (!(flags & M_COPYBACK0_COW))
5223 panic("m_copyback0: read-only");
5224 #endif /* defined(DIAGNOSTIC) */
5225
5226 /*
5227 * if we're going to write into the middle of
5228 * a mbuf, split it first.
5229 */
5230 if (off > 0 && len < mlen) {
5231 n = m_split0(m, off, how, 0);
5232 if (n == NULL)
5233 goto enobufs;
5234 m->m_next = n;
5235 mp = &m->m_next;
5236 m = n;
5237 off = 0;
5238 continue;
5239 }
5240
5241 /*
5242 * XXX TODO coalesce into the trailingspace of
5243 * the previous mbuf when possible.
5244 */
5245
5246 /*
5247 * allocate a new mbuf. copy packet header if needed.
5248 */
5249 n = _M_GET(how, m->m_type);
5250 if (n == NULL)
5251 goto enobufs;
5252 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5253 M_COPY_PKTHDR(n, m);
5254 n->m_len = MHLEN;
5255 } else {
5256 if (len >= MINCLSIZE)
5257 MCLGET(n, M_DONTWAIT);
5258 n->m_len =
5259 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5260 }
5261 if (n->m_len > len)
5262 n->m_len = len;
5263
5264 /*
5265 * free the region which has been overwritten,
5266 * copying data from the old mbufs if requested.
5267 */
5268 if (flags & M_COPYBACK0_PRESERVE)
5269 datap = mtod(n, char *);
5270 else
5271 datap = NULL;
5272 eatlen = n->m_len;
5273 VERIFY(off == 0 || eatlen >= mlen);
5274 if (off > 0) {
5275 VERIFY(len >= mlen);
5276 m->m_len = off;
5277 m->m_next = n;
5278 if (datap) {
5279 m_copydata(m, off, mlen, datap);
5280 datap += mlen;
5281 }
5282 eatlen -= mlen;
5283 mp = &m->m_next;
5284 m = m->m_next;
5285 }
5286 while (m != NULL && m_mclhasreference(m) &&
5287 n->m_type == m->m_type && eatlen > 0) {
5288 mlen = MIN(eatlen, m->m_len);
5289 if (datap) {
5290 m_copydata(m, 0, mlen, datap);
5291 datap += mlen;
5292 }
5293 m->m_data += mlen;
5294 m->m_len -= mlen;
5295 eatlen -= mlen;
5296 if (m->m_len == 0)
5297 *mp = m = m_free(m);
5298 }
5299 if (eatlen > 0)
5300 n->m_len -= eatlen;
5301 n->m_next = m;
5302 *mp = m = n;
5303 continue;
5304 }
5305 mlen = MIN(mlen, len);
5306 if (flags & M_COPYBACK0_COPYBACK) {
5307 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5308 cp += mlen;
5309 }
5310 len -= mlen;
5311 mlen += off;
5312 off = 0;
5313 totlen += mlen;
5314 if (len == 0)
5315 break;
5316 if (m->m_next == NULL) {
5317 goto extend;
5318 }
5319 mp = &m->m_next;
5320 m = m->m_next;
5321 }
5322 out:
5323 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5324 VERIFY(flags & M_COPYBACK0_EXTEND);
5325 m->m_pkthdr.len = totlen;
5326 }
5327
5328 return (0);
5329
5330 enobufs:
5331 return (ENOBUFS);
5332 }
5333
5334 char *
5335 mcl_to_paddr(char *addr)
5336 {
5337 vm_offset_t base_phys;
5338
5339 if (!MBUF_IN_MAP(addr))
5340 return (NULL);
5341 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
5342
5343 if (base_phys == 0)
5344 return (NULL);
5345 return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
5346 }
5347
5348 /*
5349 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5350 * And really copy the thing. That way, we don't "precompute" checksums
5351 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5352 * small packets, don't dup into a cluster. That way received packets
5353 * don't take up too much room in the sockbuf (cf. sbspace()).
5354 */
5355 int MDFail;
5356
5357 struct mbuf *
5358 m_dup(struct mbuf *m, int how)
5359 {
5360 struct mbuf *n, **np;
5361 struct mbuf *top;
5362 int copyhdr = 0;
5363
5364 np = &top;
5365 top = NULL;
5366 if (m->m_flags & M_PKTHDR)
5367 copyhdr = 1;
5368
5369 /*
5370 * Quick check: if we have one mbuf and its data fits in an
5371 * mbuf with packet header, just copy and go.
5372 */
5373 if (m->m_next == NULL) {
5374 /* Then just move the data into an mbuf and be done... */
5375 if (copyhdr) {
5376 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5377 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5378 return (NULL);
5379 n->m_len = m->m_len;
5380 m_dup_pkthdr(n, m, how);
5381 bcopy(m->m_data, n->m_data, m->m_len);
5382 return (n);
5383 }
5384 } else if (m->m_len <= MLEN) {
5385 if ((n = _M_GET(how, m->m_type)) == NULL)
5386 return (NULL);
5387 bcopy(m->m_data, n->m_data, m->m_len);
5388 n->m_len = m->m_len;
5389 return (n);
5390 }
5391 }
5392 while (m != NULL) {
5393 #if BLUE_DEBUG
5394 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5395 m->m_data);
5396 #endif
5397 if (copyhdr)
5398 n = _M_GETHDR(how, m->m_type);
5399 else
5400 n = _M_GET(how, m->m_type);
5401 if (n == NULL)
5402 goto nospace;
5403 if (m->m_flags & M_EXT) {
5404 if (m->m_len <= m_maxsize(MC_CL))
5405 MCLGET(n, how);
5406 else if (m->m_len <= m_maxsize(MC_BIGCL))
5407 n = m_mbigget(n, how);
5408 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5409 n = m_m16kget(n, how);
5410 if (!(n->m_flags & M_EXT)) {
5411 (void) m_free(n);
5412 goto nospace;
5413 }
5414 }
5415 *np = n;
5416 if (copyhdr) {
5417 /* Don't use M_COPY_PKTHDR: preserve m_data */
5418 m_dup_pkthdr(n, m, how);
5419 copyhdr = 0;
5420 if (!(n->m_flags & M_EXT))
5421 n->m_data = n->m_pktdat;
5422 }
5423 n->m_len = m->m_len;
5424 /*
5425 * Get the dup on the same boundary as the original.
5426 * Assume that the two mbufs have the same offset to the data area
5427 * (up to word boundaries).
5428 */
5429 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5430 m = m->m_next;
5431 np = &n->m_next;
5432 #if BLUE_DEBUG
5433 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5434 n->m_data);
5435 #endif
5436 }
5437
5438 if (top == NULL)
5439 MDFail++;
5440 return (top);
5441
5442 nospace:
5443 m_freem(top);
5444 MDFail++;
5445 return (NULL);
5446 }
5447
5448 #define MBUF_MULTIPAGES(m) \
5449 (((m)->m_flags & M_EXT) && \
5450 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5451 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5452 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5453
5454 static struct mbuf *
5455 m_expand(struct mbuf *m, struct mbuf **last)
5456 {
5457 struct mbuf *top = NULL;
5458 struct mbuf **nm = &top;
5459 uintptr_t data0, data;
5460 unsigned int len0, len;
5461
5462 VERIFY(MBUF_MULTIPAGES(m));
5463 VERIFY(m->m_next == NULL);
5464 data0 = (uintptr_t)m->m_data;
5465 len0 = m->m_len;
5466 *last = top;
5467
5468 for (;;) {
5469 struct mbuf *n;
5470
5471 data = data0;
5472 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5473 len = NBPG;
5474 else if (!IS_P2ALIGNED(data, NBPG) &&
5475 P2ROUNDUP(data, NBPG) < (data + len0))
5476 len = P2ROUNDUP(data, NBPG) - data;
5477 else
5478 len = len0;
5479
5480 VERIFY(len > 0);
5481 VERIFY(m->m_flags & M_EXT);
5482 m->m_data = (void *)data;
5483 m->m_len = len;
5484
5485 *nm = *last = m;
5486 nm = &m->m_next;
5487 m->m_next = NULL;
5488
5489 data0 += len;
5490 len0 -= len;
5491 if (len0 == 0)
5492 break;
5493
5494 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5495 if (n == NULL) {
5496 m_freem(top);
5497 top = *last = NULL;
5498 break;
5499 }
5500
5501 n->m_ext = m->m_ext;
5502 m_incref(m);
5503 n->m_flags |= M_EXT;
5504 m = n;
5505 }
5506 return (top);
5507 }
5508
5509 struct mbuf *
5510 m_normalize(struct mbuf *m)
5511 {
5512 struct mbuf *top = NULL;
5513 struct mbuf **nm = &top;
5514 boolean_t expanded = FALSE;
5515
5516 while (m != NULL) {
5517 struct mbuf *n;
5518
5519 n = m->m_next;
5520 m->m_next = NULL;
5521
5522 /* Does the data cross one or more page boundaries? */
5523 if (MBUF_MULTIPAGES(m)) {
5524 struct mbuf *last;
5525 if ((m = m_expand(m, &last)) == NULL) {
5526 m_freem(n);
5527 m_freem(top);
5528 top = NULL;
5529 break;
5530 }
5531 *nm = m;
5532 nm = &last->m_next;
5533 expanded = TRUE;
5534 } else {
5535 *nm = m;
5536 nm = &m->m_next;
5537 }
5538 m = n;
5539 }
5540 if (expanded)
5541 atomic_add_32(&mb_normalized, 1);
5542 return (top);
5543 }
5544
5545 /*
5546 * Append the specified data to the indicated mbuf chain,
5547 * extending the mbuf chain if the new data does not fit in
5548 * the existing space.
5549 *
5550 * Return 1 if able to complete the job; otherwise 0.
5551 */
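/*
 * Illustrative sketch (hypothetical trailer): append a small trailer to a
 * packet; m_append() updates m_pkthdr.len itself when the chain has a
 * packet header.
 *
 *	char trailer[8];
 *
 *	if (m_append(m, sizeof (trailer), (caddr_t)trailer) == 0)
 *		return (ENOBUFS);
 */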
5552 int
5553 m_append(struct mbuf *m0, int len, caddr_t cp)
5554 {
5555 struct mbuf *m, *n;
5556 int remainder, space;
5557
5558 for (m = m0; m->m_next != NULL; m = m->m_next)
5559 ;
5560 remainder = len;
5561 space = M_TRAILINGSPACE(m);
5562 if (space > 0) {
5563 /*
5564 * Copy into available space.
5565 */
5566 if (space > remainder)
5567 space = remainder;
5568 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5569 m->m_len += space;
5570 cp += space, remainder -= space;
5571 }
5572 while (remainder > 0) {
5573 /*
5574 * Allocate a new mbuf; could check space
5575 * and allocate a cluster instead.
5576 */
5577 n = m_get(M_WAITOK, m->m_type);
5578 if (n == NULL)
5579 break;
5580 n->m_len = min(MLEN, remainder);
5581 bcopy(cp, mtod(n, caddr_t), n->m_len);
5582 cp += n->m_len;
5583 remainder -= n->m_len;
5584 m->m_next = n;
5585 m = n;
5586 }
5587 if (m0->m_flags & M_PKTHDR)
5588 m0->m_pkthdr.len += len - remainder;
5589 return (remainder == 0);
5590 }
5591
5592 struct mbuf *
5593 m_last(struct mbuf *m)
5594 {
5595 while (m->m_next != NULL)
5596 m = m->m_next;
5597 return (m);
5598 }
5599
5600 void
5601 m_mchtype(struct mbuf *m, int t)
5602 {
5603 mtype_stat_inc(t);
5604 mtype_stat_dec(m->m_type);
5605 (m)->m_type = t;
5606 }
5607
5608 void *
5609 m_mtod(struct mbuf *m)
5610 {
5611 return (MTOD(m, void *));
5612 }
5613
5614 struct mbuf *
5615 m_dtom(void *x)
5616 {
5617 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5618 }
5619
5620 void
5621 m_mcheck(struct mbuf *m)
5622 {
5623 _MCHECK(m);
5624 }
5625
5626 /*
5627 * Return a pointer to mbuf/offset of location in mbuf chain.
5628 */
5629 struct mbuf *
5630 m_getptr(struct mbuf *m, int loc, int *off)
5631 {
5632
5633 while (loc >= 0) {
5634 /* Normal end of search. */
5635 if (m->m_len > loc) {
5636 *off = loc;
5637 return (m);
5638 } else {
5639 loc -= m->m_len;
5640 if (m->m_next == NULL) {
5641 if (loc == 0) {
5642 /* Point at the end of valid data. */
5643 *off = m->m_len;
5644 return (m);
5645 }
5646 return (NULL);
5647 }
5648 m = m->m_next;
5649 }
5650 }
5651 return (NULL);
5652 }
5653
5654 /*
5655 * Inform the corresponding mcache(s) that there's a waiter below.
5656 */
5657 static void
5658 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5659 {
5660 mcache_waiter_inc(m_cache(class));
5661 if (comp) {
5662 if (class == MC_CL) {
5663 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5664 } else if (class == MC_BIGCL) {
5665 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5666 } else if (class == MC_16KCL) {
5667 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5668 } else {
5669 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5670 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5671 }
5672 }
5673 }
5674
5675 /*
5676 * Inform the corresponding mcache(s) that there's no more waiter below.
5677 */
5678 static void
5679 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5680 {
5681 mcache_waiter_dec(m_cache(class));
5682 if (comp) {
5683 if (class == MC_CL) {
5684 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5685 } else if (class == MC_BIGCL) {
5686 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5687 } else if (class == MC_16KCL) {
5688 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5689 } else {
5690 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5691 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5692 }
5693 }
5694 }
5695
5696 /*
5697 * Called during slab (blocking and non-blocking) allocation. If there
5698 * is at least one waiter, and the time since the first waiter was blocked
5699 * exceeds the watchdog timeout, panic the system.
5700 */
5701 static void
5702 mbuf_watchdog(void)
5703 {
5704 struct timeval now;
5705 unsigned int since;
5706
5707 if (mb_waiters == 0 || !mb_watchdog)
5708 return;
5709
5710 microuptime(&now);
5711 since = now.tv_sec - mb_wdtstart.tv_sec;
5712 if (since >= MB_WDT_MAXTIME) {
5713 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
5714 mb_waiters, since, mbuf_dump());
5715 /* NOTREACHED */
5716 }
5717 }
5718
5719 /*
5720 * Called during blocking allocation. Returns TRUE if one or more objects
5721 * are available at the per-CPU cache layer and that the allocation should be
5722 * retried at that level.
5723 */
5724 static boolean_t
5725 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5726 {
5727 boolean_t mcache_retry = FALSE;
5728
5729 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5730
5731 /* Check if there's anything at the cache layer */
5732 if (mbuf_cached_above(class, wait)) {
5733 mcache_retry = TRUE;
5734 goto done;
5735 }
5736
5737 /* Nothing? Then try hard to get it from somewhere */
5738 m_reclaim(class, num, (wait & MCR_COMP));
5739
5740 /* We tried hard and got something? */
5741 if (m_infree(class) > 0) {
5742 mbstat.m_wait++;
5743 goto done;
5744 } else if (mbuf_cached_above(class, wait)) {
5745 mbstat.m_wait++;
5746 mcache_retry = TRUE;
5747 goto done;
5748 } else if (wait & MCR_TRYHARD) {
5749 mcache_retry = TRUE;
5750 goto done;
5751 }
5752
5753 /*
5754 * There's really nothing for us right now; inform the
5755 * cache(s) that there is a waiter below and go to sleep.
5756 */
5757 mbuf_waiter_inc(class, (wait & MCR_COMP));
5758
5759 VERIFY(!(wait & MCR_NOSLEEP));
5760
5761 /*
5762 * If this is the first waiter, arm the watchdog timer. Otherwise
5763 * check if we need to panic the system due to watchdog timeout.
5764 */
5765 if (mb_waiters == 0)
5766 microuptime(&mb_wdtstart);
5767 else
5768 mbuf_watchdog();
5769
5770 mb_waiters++;
5771 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5772
5773 /* We are now up; stop getting notified until next round */
5774 mbuf_waiter_dec(class, (wait & MCR_COMP));
5775
5776 /* We waited and got something */
5777 if (m_infree(class) > 0) {
5778 mbstat.m_wait++;
5779 goto done;
5780 } else if (mbuf_cached_above(class, wait)) {
5781 mbstat.m_wait++;
5782 mcache_retry = TRUE;
5783 }
5784 done:
5785 return (mcache_retry);
5786 }
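/*
 * Illustrative sketch of a caller (hypothetical, not compiled): a blocking
 * class allocator would typically keep trying its own freelist and fall
 * back to mbuf_sleep(); a TRUE return means one or more objects surfaced
 * at the mcache layer above, so the allocation should be retried there.
 * `class_freelist_alloc' below is a made-up helper for illustration only.
 */
#if 0
while (need > 0 && !(wait & MCR_NOSLEEP)) {
	if (class_freelist_alloc(class, &need) > 0)
		continue;
	if (mbuf_sleep(class, need, wait))
		break;	/* retry at the per-CPU cache layer */
}
#endif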
5787
5788 static void
5789 mbuf_worker_thread(void)
5790 {
5791 int mbuf_expand;
5792
5793 while (1) {
5794 lck_mtx_lock(mbuf_mlock);
5795
5796 mbuf_expand = 0;
5797 if (mbuf_expand_mcl) {
5798 int n;
5799
5800 /* Adjust to the current number of 2 KB clusters in use */
5801 n = mbuf_expand_mcl -
5802 (m_total(MC_CL) - m_infree(MC_CL));
5803 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
5804 n = m_maxlimit(MC_CL) - m_total(MC_CL);
5805 mbuf_expand_mcl = 0;
5806
5807 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
5808 mbuf_expand++;
5809 }
5810 if (mbuf_expand_big) {
5811 int n;
5812
5813 /* Adjust to the current number of 4 KB clusters in use */
5814 n = mbuf_expand_big -
5815 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
5816 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
5817 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
5818 mbuf_expand_big = 0;
5819
5820 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
5821 mbuf_expand++;
5822 }
5823 if (mbuf_expand_16k) {
5824 int n;
5825
5826 /* Adjust to the current number of 16 KB clusters in use */
5827 n = mbuf_expand_16k -
5828 (m_total(MC_16KCL) - m_infree(MC_16KCL));
5829 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
5830 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5831 mbuf_expand_16k = 0;
5832
5833 if (n > 0)
5834 (void) freelist_populate(MC_16KCL, n, M_WAIT);
5835 }
5836
5837 /*
5838 * Because we can run out of memory before filling the mbuf
5839 * map, we should not allocate more clusters than there are
5840 * mbufs -- otherwise we could have a large number of useless
5841 * clusters allocated.
5842 */
5843 if (mbuf_expand) {
5844 while (m_total(MC_MBUF) <
5845 (m_total(MC_BIGCL) + m_total(MC_CL))) {
5846 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
5847 break;
5848 }
5849 }
5850
5851 lck_mtx_unlock(mbuf_mlock);
5852
5853 assert_wait(&mbuf_worker_run, THREAD_UNINT);
5854 (void) thread_block((thread_continue_t)mbuf_worker_thread);
5855 }
5856 }
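/*
 * Worked example of the check above (illustrative figures only): with
 * 2048 2 KB clusters and 1024 4 KB clusters in the pool, the loop keeps
 * populating the mbuf freelist until at least 3072 mbufs exist, so every
 * cluster could in principle be paired with an mbuf header.
 */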
5857
5858 static void
5859 mbuf_worker_thread_init(void)
5860 {
5861 mbuf_worker_ready++;
5862 mbuf_worker_thread();
5863 }
5864
5865 static mcl_slab_t *
5866 slab_get(void *buf)
5867 {
5868 mcl_slabg_t *slg;
5869 unsigned int ix, k;
5870
5871 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5872
5873 VERIFY(MBUF_IN_MAP(buf));
5874 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
5875 VERIFY(ix < maxslabgrp);
5876
5877 if ((slg = slabstbl[ix]) == NULL) {
5878 /*
5879 * In the current implementation, we never shrink the memory
5880 * pool (hence the cluster map); if we attempt to reallocate
5881 * a cluster group when it's already allocated, panic since
5882 * this is a sign of memory corruption (slabstbl[ix] got
5883 * nullified). This also means that there shouldn't be any
5884 * hole in the kernel sub-map for the mbuf pool.
5885 */
5886 ++slabgrp;
5887 VERIFY(ix < slabgrp);
5888 /*
5889 * Slab expansion can only be done single-threaded; when
5890 * we get here, it must be as a result of m_clalloc() which
5891 * is serialized and therefore mb_clalloc_busy must be set.
5892 */
5893 VERIFY(mb_clalloc_busy);
5894 lck_mtx_unlock(mbuf_mlock);
5895
5896 /* This is a new buffer; create the slabs group for it */
5897 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
5898 M_WAITOK | M_ZERO);
5899 VERIFY(slg != NULL);
5900
5901 lck_mtx_lock(mbuf_mlock);
5902 /*
5903 * No other thread could have gone into m_clalloc() after
5904 * we dropped the lock above, so verify that it's true.
5905 */
5906 VERIFY(mb_clalloc_busy);
5907
5908 slabstbl[ix] = slg;
5909
5910 /* Chain each slab in the group to its forward neighbor */
5911 for (k = 1; k < NSLABSPMB; k++)
5912 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
5913 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
5914
5915 /* And chain the last slab in the previous group to this */
5916 if (ix > 0) {
5917 VERIFY(slabstbl[ix - 1]->
5918 slg_slab[NSLABSPMB - 1].sl_next == NULL);
5919 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
5920 &slg->slg_slab[0];
5921 }
5922 }
5923
5924 ix = MTOBG(buf) % NSLABSPMB;
5925 VERIFY(ix < NSLABSPMB);
5926
5927 return (&slg->slg_slab[ix]);
5928 }
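/*
 * Illustrative arithmetic (assuming MBSHIFT is 20, i.e. 1 MB worth of
 * clusters per slab group): a buffer located 5 MB + 8 KB past mbutl has
 * ((char *)buf - (char *)mbutl) == 0x502000, so ix = 0x502000 >> 20 = 5
 * selects slabstbl[5]; MTOBG(buf) % NSLABSPMB then picks the individual
 * slab within that group.
 */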
5929
5930 static void
5931 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
5932 void *base, void *head, unsigned int len, int refcnt, int chunks)
5933 {
5934 sp->sl_class = class;
5935 sp->sl_flags = flags;
5936 sp->sl_base = base;
5937 sp->sl_head = head;
5938 sp->sl_len = len;
5939 sp->sl_refcnt = refcnt;
5940 sp->sl_chunks = chunks;
5941 slab_detach(sp);
5942 }
5943
5944 static void
5945 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
5946 {
5947 VERIFY(slab_is_detached(sp));
5948 m_slab_cnt(class)++;
5949 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
5950 sp->sl_flags &= ~SLF_DETACHED;
5951 if (class == MC_16KCL) {
5952 int k;
5953 for (k = 1; k < NSLABSP16KB; k++) {
5954 sp = sp->sl_next;
5955 /* Next slab must already be present */
5956 VERIFY(sp != NULL);
5957 VERIFY(slab_is_detached(sp));
5958 sp->sl_flags &= ~SLF_DETACHED;
5959 }
5960 }
5961 }
5962
5963 static void
5964 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
5965 {
5966 VERIFY(!slab_is_detached(sp));
5967 VERIFY(m_slab_cnt(class) > 0);
5968 m_slab_cnt(class)--;
5969 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
5970 slab_detach(sp);
5971 if (class == MC_16KCL) {
5972 int k;
5973 for (k = 1; k < NSLABSP16KB; k++) {
5974 sp = sp->sl_next;
5975 /* Next slab must already be present */
5976 VERIFY(sp != NULL);
5977 VERIFY(!slab_is_detached(sp));
5978 slab_detach(sp);
5979 }
5980 }
5981 }
5982
5983 static boolean_t
5984 slab_inrange(mcl_slab_t *sp, void *buf)
5985 {
5986 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
5987 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
5988 }
5989
5990 #undef panic
5991
5992 static void
5993 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
5994 {
5995 int i;
5996 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
5997 uintptr_t buf = (uintptr_t)sp->sl_base;
5998
5999 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6000 void *next = ((mcache_obj_t *)buf)->obj_next;
6001 if (next != addr)
6002 continue;
6003 if (!mclverify) {
6004 if (next != NULL && !MBUF_IN_MAP(next)) {
6005 mcache_t *cp = m_cache(sp->sl_class);
6006 panic("%s: %s buffer %p in slab %p modified "
6007 "after free at offset 0: %p out of range "
6008 "[%p-%p)\n", __func__, cp->mc_name,
6009 (void *)buf, sp, next, mbutl, embutl);
6010 /* NOTREACHED */
6011 }
6012 } else {
6013 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6014 (mcache_obj_t *)buf);
6015 mcl_audit_verify_nextptr(next, mca);
6016 }
6017 }
6018 }
6019
6020 static void
6021 slab_detach(mcl_slab_t *sp)
6022 {
6023 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6024 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6025 sp->sl_flags |= SLF_DETACHED;
6026 }
6027
6028 static boolean_t
6029 slab_is_detached(mcl_slab_t *sp)
6030 {
6031 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6032 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6033 (sp->sl_flags & SLF_DETACHED));
6034 }
6035
6036 static void
6037 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6038 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6039 {
6040 mcache_audit_t *mca, *mca_tail;
6041 mcache_obj_t *con = NULL;
6042 boolean_t save_contents = (con_list != NULL);
6043 unsigned int i, ix;
6044
6045 ASSERT(num <= NMBPBG);
6046 ASSERT(con_list == NULL || con_size != 0);
6047
6048 ix = MTOBG(buf);
6049 VERIFY(ix < maxclaudit);
6050
6051 /* Make sure we haven't been here before */
6052 for (i = 0; i < NMBPBG; i++)
6053 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6054
6055 mca = mca_tail = *mca_list;
6056 if (save_contents)
6057 con = *con_list;
6058
6059 for (i = 0; i < num; i++) {
6060 mcache_audit_t *next;
6061
6062 next = mca->mca_next;
6063 bzero(mca, sizeof (*mca));
6064 mca->mca_next = next;
6065 mclaudit[ix].cl_audit[i] = mca;
6066
6067 /* Attach the contents buffer if requested */
6068 if (save_contents) {
6069 VERIFY(con != NULL);
6070 mca->mca_contents_size = con_size;
6071 mca->mca_contents = con;
6072 con = con->obj_next;
6073 bzero(mca->mca_contents, mca->mca_contents_size);
6074 }
6075
6076 mca_tail = mca;
6077 mca = mca->mca_next;
6078 }
6079
6080 if (save_contents)
6081 *con_list = con;
6082
6083 *mca_list = mca_tail->mca_next;
6084 mca_tail->mca_next = NULL;
6085 }
6086
6087 /*
6088 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6089 * the corresponding audit structure for that buffer.
6090 */
6091 static mcache_audit_t *
6092 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6093 {
6094 mcache_audit_t *mca = NULL;
6095 int ix = MTOBG(o);
6096
6097 VERIFY(ix < maxclaudit);
6098 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6099
6100 switch (class) {
6101 case MC_MBUF:
6102 /*
6103 * For the mbuf case, find the index of the page
6104 * used by the mbuf and use that index to locate the
6105 * base address of the page. Then find out the
6106 * mbuf index relative to the page base and use
6107 * it to locate the audit structure.
6108 */
6109 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6110 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6111 break;
6112
6113 case MC_CL:
6114 /*
6115 * Same thing as above, but for 2KB clusters in a page.
6116 */
6117 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6118 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6119 break;
6120
6121 case MC_BIGCL:
6122 case MC_16KCL:
6123 /*
6124 * Same as above, but only return the first element.
6125 */
6126 mca = mclaudit[ix].cl_audit[0];
6127 break;
6128
6129 default:
6130 VERIFY(0);
6131 /* NOTREACHED */
6132 }
6133
6134 return (mca);
6135 }
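/*
 * Illustrative example (assuming MSIZE is 256 and 4 KB pages): an mbuf at
 * offset 0x300 within its page maps to in-page index 0x300 / 256 = 3, so
 * its audit record is mclaudit[ix].cl_audit[3]; a 2 KB cluster at offset
 * 0x800 maps to index 1; 4 KB and 16 KB buffers always use slot 0.
 */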
6136
6137 static void
6138 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6139 boolean_t alloc)
6140 {
6141 struct mbuf *m = addr;
6142 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6143
6144 VERIFY(mca->mca_contents != NULL &&
6145 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6146
6147 if (mclverify)
6148 mcl_audit_verify_nextptr(next, mca);
6149
6150 if (!alloc) {
6151 /* Save constructed mbuf fields */
6152 mcl_audit_save_mbuf(m, mca);
6153 if (mclverify) {
6154 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6155 m_maxsize(MC_MBUF));
6156 }
6157 ((mcache_obj_t *)m)->obj_next = next;
6158 return;
6159 }
6160
6161 /* Check if the buffer has been corrupted while in freelist */
6162 if (mclverify) {
6163 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6164 }
6165 /* Restore constructed mbuf fields */
6166 mcl_audit_restore_mbuf(m, mca, composite);
6167 }
6168
6169 static void
6170 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6171 {
6172 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
6173
6174 if (composite) {
6175 struct mbuf *next = m->m_next;
6176 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6177 MBUF_IS_COMPOSITE(ms));
6178 /*
6179 * We could have hand-picked the mbuf fields and restored
6180 * them individually, but that would be a maintenance
6181 * headache. Instead, restore everything that was saved;
6182 * the mbuf layer will recheck and reinitialize anyway.
6183 */
6184 bcopy(ms, m, mca->mca_contents_size);
6185 m->m_next = next;
6186 } else {
6187 /*
6188 * For a regular mbuf (no cluster attached) there's nothing
6189 * to restore other than the type field, which is expected
6190 * to be MT_FREE.
6191 */
6192 m->m_type = ms->m_type;
6193 }
6194 _MCHECK(m);
6195 }
6196
6197 static void
6198 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6199 {
6200 _MCHECK(m);
6201 bcopy(m, mca->mca_contents, mca->mca_contents_size);
6202 }
6203
6204 static void
6205 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6206 boolean_t save_next)
6207 {
6208 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6209
6210 if (!alloc) {
6211 if (mclverify) {
6212 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6213 }
6214 if (save_next) {
6215 mcl_audit_verify_nextptr(next, mca);
6216 ((mcache_obj_t *)addr)->obj_next = next;
6217 }
6218 } else if (mclverify) {
6219 /* Check if the buffer has been corrupted while in freelist */
6220 mcl_audit_verify_nextptr(next, mca);
6221 mcache_audit_free_verify_set(mca, addr, 0, size);
6222 }
6223 }
6224
6225 static void
6226 mcl_audit_mcheck_panic(struct mbuf *m)
6227 {
6228 mcache_audit_t *mca;
6229
6230 MRANGE(m);
6231 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6232
6233 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6234 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6235 /* NOTREACHED */
6236 }
6237
6238 static void
6239 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6240 {
6241 if (next != NULL && !MBUF_IN_MAP(next) &&
6242 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6243 panic("mcl_audit: buffer %p modified after free at offset 0: "
6244 "%p out of range [%p-%p)\n%s\n",
6245 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6246 /* NOTREACHED */
6247 }
6248 }
6249
6250 /* This function turns on mbuf leak detection */
6251 static void
6252 mleak_activate(void)
6253 {
6254 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6255 PE_parse_boot_argn("mleak_sample_factor",
6256 &mleak_table.mleak_sample_factor,
6257 sizeof (mleak_table.mleak_sample_factor));
6258
6259 if (mleak_table.mleak_sample_factor == 0)
6260 mclfindleak = 0;
6261
6262 if (mclfindleak == 0)
6263 return;
6264
6265 vm_size_t alloc_size =
6266 mleak_alloc_buckets * sizeof (struct mallocation);
6267 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6268
6269 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6270 M_TEMP, M_WAITOK | M_ZERO);
6271 VERIFY(mleak_allocations != NULL);
6272
6273 MALLOC(mleak_traces, struct mtrace *, trace_size,
6274 M_TEMP, M_WAITOK | M_ZERO);
6275 VERIFY(mleak_traces != NULL);
6276
6277 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6278 M_TEMP, M_WAITOK | M_ZERO);
6279 VERIFY(mleak_stat != NULL);
6280 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6281 #ifdef __LP64__
6282 mleak_stat->ml_isaddr64 = 1;
6283 #endif /* __LP64__ */
6284 }
6285
6286 static void
6287 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6288 {
6289 int temp;
6290
6291 if (mclfindleak == 0)
6292 return;
6293
6294 if (!alloc)
6295 return (mleak_free(addr));
6296
6297 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6298
6299 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6300 uintptr_t bt[MLEAK_STACK_DEPTH];
6301 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6302 mleak_log(bt, addr, logged, num);
6303 }
6304 }
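/*
 * Illustrative note: only every mleak_sample_factor-th allocation is
 * traced.  With a sample factor of, say, 500, roughly 1 in 500
 * allocations has its backtrace captured and handed to mleak_log(),
 * keeping the cost of leak detection low.
 */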
6305
6306 /*
6307 * This function records the allocation in the mleak_allocations table
6308 * and the backtrace in the mleak_traces table.  If the allocation slot is
6309 * in use, replace the old allocation with the new one; if the trace slot is
6310 * in use, return (incrementing the refcount if it records the same trace).
6311 */
6312 static boolean_t
6313 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6314 {
6315 struct mallocation *allocation;
6316 struct mtrace *trace;
6317 uint32_t trace_index;
6318 int i;
6319
6320 /* Quit if someone else is modifying the tables */
6321 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6322 mleak_table.total_conflicts++;
6323 return (FALSE);
6324 }
6325
6326 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6327 mleak_alloc_buckets)];
6328 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6329 trace = &mleak_traces[trace_index];
6330
6331 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6332 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6333
6334 allocation->hitcount++;
6335 trace->hitcount++;
6336
6337 /*
6338 * If the allocation bucket we want is occupied
6339 * and the occupier has the same trace, just bail.
6340 */
6341 if (allocation->element != NULL &&
6342 trace_index == allocation->trace_index) {
6343 mleak_table.alloc_collisions++;
6344 lck_mtx_unlock(mleak_lock);
6345 return (TRUE);
6346 }
6347
6348 /*
6349 * Store the backtrace in the traces array; an allocation
6350 * count (allocs) of zero means the trace bucket is free.
6351 */
6352 if (trace->allocs > 0 &&
6353 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6354 /* Different, unique trace, but the same hash! Bail out. */
6355 trace->collisions++;
6356 mleak_table.trace_collisions++;
6357 lck_mtx_unlock(mleak_lock);
6358 return (TRUE);
6359 } else if (trace->allocs > 0) {
6360 /* Same trace, already added, so increment refcount */
6361 trace->allocs++;
6362 } else {
6363 /* Found an unused trace bucket, so record the trace here */
6364 if (trace->depth != 0) {
6365 /* this slot was previously used but is now free */
6366 mleak_table.trace_overwrites++;
6367 }
6368 mleak_table.trace_recorded++;
6369 trace->allocs = 1;
6370 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6371 trace->depth = depth;
6372 trace->collisions = 0;
6373 }
6374
6375 /* Step 2: Store the allocation record in the allocations array */
6376 if (allocation->element != NULL) {
6377 /*
6378 * Replace an existing allocation.  No need to preserve the old one,
6379 * because only a subset of the allocations are being
6380 * recorded anyway.
6381 */
6382 mleak_table.alloc_collisions++;
6383 } else if (allocation->trace_index != 0) {
6384 mleak_table.alloc_overwrites++;
6385 }
6386 allocation->element = addr;
6387 allocation->trace_index = trace_index;
6388 allocation->count = num;
6389 mleak_table.alloc_recorded++;
6390 mleak_table.outstanding_allocs++;
6391
6392 /* keep the top MLEAK_NUM_TRACES traces, ordered by allocation count */
6393 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6394 if (mleak_top_trace[i] == NULL ||
6395 mleak_top_trace[i]->allocs <= trace->allocs) {
6396 if (mleak_top_trace[i] != trace) {
6397 int j = MLEAK_NUM_TRACES;
6398 while (--j > i) {
6399 mleak_top_trace[j] =
6400 mleak_top_trace[j - 1];
6401 }
6402 mleak_top_trace[i] = trace;
6403 }
6404 break;
6405 }
6406 }
6407
6408 lck_mtx_unlock(mleak_lock);
6409 return (TRUE);
6410 }
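/*
 * Illustrative example of the top-trace update above: if a trace's
 * allocation count now matches or exceeds that of mleak_top_trace[2],
 * entries 2 through MLEAK_NUM_TRACES-2 shift down one slot and the trace
 * takes slot 2, keeping the array ordered by allocation count.
 */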
6411
6412 static void
6413 mleak_free(mcache_obj_t *addr)
6414 {
6415 while (addr != NULL) {
6416 struct mallocation *allocation = &mleak_allocations
6417 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6418
6419 if (allocation->element == addr &&
6420 allocation->trace_index < mleak_trace_buckets) {
6421 lck_mtx_lock_spin(mleak_lock);
6422 if (allocation->element == addr &&
6423 allocation->trace_index < mleak_trace_buckets) {
6424 struct mtrace *trace;
6425 trace = &mleak_traces[allocation->trace_index];
6426 /* allocs = 0 means trace bucket is unused */
6427 if (trace->allocs > 0)
6428 trace->allocs--;
6429 if (trace->allocs == 0)
6430 trace->depth = 0;
6431 /* NULL element means alloc bucket is unused */
6432 allocation->element = NULL;
6433 mleak_table.outstanding_allocs--;
6434 }
6435 lck_mtx_unlock(mleak_lock);
6436 }
6437 addr = addr->obj_next;
6438 }
6439 }
6440
6441 static struct mbtypes {
6442 int mt_type;
6443 const char *mt_name;
6444 } mbtypes[] = {
6445 { MT_DATA, "data" },
6446 { MT_OOBDATA, "oob data" },
6447 { MT_CONTROL, "ancillary data" },
6448 { MT_HEADER, "packet headers" },
6449 { MT_SOCKET, "socket structures" },
6450 { MT_PCB, "protocol control blocks" },
6451 { MT_RTABLE, "routing table entries" },
6452 { MT_HTABLE, "IMP host table entries" },
6453 { MT_ATABLE, "address resolution tables" },
6454 { MT_FTABLE, "fragment reassembly queue headers" },
6455 { MT_SONAME, "socket names and addresses" },
6456 { MT_SOOPTS, "socket options" },
6457 { MT_RIGHTS, "access rights" },
6458 { MT_IFADDR, "interface addresses" },
6459 { MT_TAG, "packet tags" },
6460 { 0, NULL }
6461 };
6462
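/*
 * Helper for mbuf_dump() below: subtracts the number of bytes just written
 * (`k') from the remaining space (`clen'), bails out to the `done' label
 * if the buffer is exhausted, and otherwise advances the output cursor
 * `c'.  It relies on those locals and that label being in scope at each
 * call site.
 */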
6463 #define MBUF_DUMP_BUF_CHK() { \
6464 clen -= k; \
6465 if (clen < 1) \
6466 goto done; \
6467 c += k; \
6468 }
6469
6470 static char *
6471 mbuf_dump(void)
6472 {
6473 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6474 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6475 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6476 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6477 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6478 uint8_t seen[256];
6479 struct mbtypes *mp;
6480 mb_class_stat_t *sp;
6481 char *c = mbuf_dump_buf;
6482 int i, k, clen = sizeof (mbuf_dump_buf);
6483
6484 mbuf_dump_buf[0] = '\0';
6485
6486 /* synchronize all statistics in the mbuf table */
6487 mbuf_stat_sync();
6488 mbuf_mtypes_sync(TRUE);
6489
6490 sp = &mb_stat->mbs_class[0];
6491 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6492 u_int32_t mem;
6493
6494 if (m_class(i) == MC_MBUF) {
6495 m_mbufs = sp->mbcl_active;
6496 } else if (m_class(i) == MC_CL) {
6497 m_clfree = sp->mbcl_total - sp->mbcl_active;
6498 } else if (m_class(i) == MC_BIGCL) {
6499 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6500 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6501 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6502 m_16kclusters = sp->mbcl_total;
6503 } else if (m_class(i) == MC_MBUF_CL) {
6504 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6505 } else if (m_class(i) == MC_MBUF_BIGCL) {
6506 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6507 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6508 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6509 }
6510
6511 mem = sp->mbcl_ctotal * sp->mbcl_size;
6512 totmem += mem;
6513 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6514 sp->mbcl_size;
6515
6516 }
6517
6518 /* adjust free counts to include composite caches */
6519 m_clfree += m_mbufclfree;
6520 m_bigclfree += m_mbufbigclfree;
6521 m_16kclfree += m_mbuf16kclfree;
6522
6523 totmbufs = 0;
6524 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6525 totmbufs += mbstat.m_mtypes[mp->mt_type];
6526 if (totmbufs > m_mbufs)
6527 totmbufs = m_mbufs;
6528 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6529 MBUF_DUMP_BUF_CHK();
6530
6531 bzero(&seen, sizeof (seen));
6532 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6533 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6534 seen[mp->mt_type] = 1;
6535 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6536 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6537 MBUF_DUMP_BUF_CHK();
6538 }
6539 }
6540 seen[MT_FREE] = 1;
6541 for (i = 0; i < nmbtypes; i++)
6542 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6543 k = snprintf(c, clen, "\t%u mbufs allocated to "
6544 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6545 MBUF_DUMP_BUF_CHK();
6546 }
6547 if ((m_mbufs - totmbufs) > 0) {
6548 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6549 m_mbufs - totmbufs);
6550 MBUF_DUMP_BUF_CHK();
6551 }
6552 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6553 "%u/%u mbuf 4KB clusters in use\n",
6554 (unsigned int)(mbstat.m_clusters - m_clfree),
6555 (unsigned int)mbstat.m_clusters,
6556 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6557 (unsigned int)mbstat.m_bigclusters);
6558 MBUF_DUMP_BUF_CHK();
6559
6560 if (njcl > 0) {
6561 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6562 m_16kclusters - m_16kclfree, m_16kclusters,
6563 njclbytes / 1024);
6564 MBUF_DUMP_BUF_CHK();
6565 }
6566 totused = totmem - totfree;
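/*
 * Compute the in-use percentage without overflowing: if totused * 100
 * might exceed ULONG_MAX, scale both numerator and denominator down by
 * 100 first.
 */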
6567 if (totmem == 0) {
6568 totpct = 0;
6569 } else if (totused < (ULONG_MAX / 100)) {
6570 totpct = (totused * 100) / totmem;
6571 } else {
6572 u_long totmem1 = totmem / 100;
6573 u_long totused1 = totused / 100;
6574 totpct = (totused1 * 100) / totmem1;
6575 }
6576 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
6577 "in use)\n", totmem / 1024, totpct);
6578 MBUF_DUMP_BUF_CHK();
6579
6580 done:
6581 return (mbuf_dump_buf);
6582 }
6583
6584 #undef MBUF_DUMP_BUF_CHK
6585
6586 SYSCTL_DECL(_kern_ipc);
6587 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
6588 CTLFLAG_RD | CTLFLAG_LOCKED,
6589 0, 0, mbstat_sysctl, "S,mbstat", "");
6590 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
6591 CTLFLAG_RD | CTLFLAG_LOCKED,
6592 0, 0, mb_stat_sysctl, "S,mb_stat", "");
6593 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
6594 CTLFLAG_RD | CTLFLAG_LOCKED,
6595 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
6596 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
6597 CTLFLAG_RD | CTLFLAG_LOCKED,
6598 0, 0, mleak_table_sysctl, "S,mleak_table", "");
6599 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
6600 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
6601 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
6602 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
6603 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
6604 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");