bsd/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/malloc.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/kernel.h>
  75 #include <sys/sysctl.h>
  76 #include <sys/syslog.h>
  77 #include <sys/protosw.h>
  78 #include <sys/domain.h>
  79 #include <sys/queue.h>
  80 #include <sys/proc.h>
  81
  82 #include <dev/random/randomdev.h>
  83
  84 #include <kern/kern_types.h>
  85 #include <kern/simple_lock.h>
  86 #include <kern/queue.h>
  87 #include <kern/sched_prim.h>
  88 #include <kern/backtrace.h>
  89 #include <kern/cpu_number.h>
  90 #include <kern/zalloc.h>
  91
  92 #include <libkern/OSAtomic.h>
  93 #include <libkern/OSDebug.h>
  94 #include <libkern/libkern.h>
  95
  96 #include <os/log.h>
  97
  98 #include <IOKit/IOMapper.h>
  99
 100 #include <machine/limits.h>
 101 #include <machine/machine_routines.h>
 102
 103 #if CONFIG_MACF_NET
 104 #include <security/mac_framework.h>
 105 #endif /* CONFIG_MACF_NET */
 106
 107 #include <sys/mcache.h>
 108 #include <net/ntstat.h>
 109
 110 /*
 111  * MBUF IMPLEMENTATION NOTES.
 112  *
 113  * There is a total of 5 per-CPU caches:
 114  *
 115  * MC_MBUF:
 116  *      This is a cache of rudimentary objects of MSIZE in size; each
 117  *      object represents an mbuf structure.  This cache preserves only
 118  *      the m_type field of the mbuf during its transactions.
 119  *
 120  * MC_CL:
 121  *      This is a cache of rudimentary objects of MCLBYTES in size; each
 122  *      object represents a mcluster structure.  This cache does not
 123  *      preserve the contents of the objects during its transactions.
 124  *
 125  * MC_BIGCL:
 126  *      This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 127  *      object represents a mbigcluster structure.  This cache does not
 128  *      preserve the contents of the objects during its transactions.
 129  *
 130  * MC_MBUF_CL:
 131  *      This is a cache of mbufs each having a cluster attached to it.
 132  *      It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 133  *      fields of the mbuf related to the external cluster are preserved
 134  *      during transactions.
 135  *
 136  * MC_MBUF_BIGCL:
 137  *      This is a cache of mbufs each having a big cluster attached to it.
 138  *      It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 139  *      fields of the mbuf related to the external cluster are preserved
 140  *      during transactions.
 141  *
 142  * OBJECT ALLOCATION:
 143  *
 144  * Allocation requests are handled first at the per-CPU (mcache) layer
 145  * before falling back to the slab layer.  Performance is optimal when
 146  * the request is satisfied at the CPU layer because global data/lock
 147  * never gets accessed.  When the slab layer is entered for allocation,
 148  * the slab freelist will be checked first for available objects before
 149  * the VM backing store is invoked.  Slab layer operations are serialized
 150  * for all of the caches as the mbuf global lock is held most of the time.
 151  * Allocation paths are different depending on the class of objects:
 152  *
 153  * a. Rudimentary object:
 154  *
 155  *      { m_get_common(), m_clattach(), m_mclget(),
 156  *        m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 157  *        composite object allocation }
 158  *                      |       ^
 159  *                      |       |
 160  *                      |       +-----------------------+
 161  *                      v                               |
 162  *         mcache_alloc/mcache_alloc_ext()      mbuf_slab_audit()
 163  *                      |                               ^
 164  *                      v                               |
 165  *                 [CPU cache] -------> (found?) -------+
 166  *                      |                               |
 167  *                      v                               |
 168  *               mbuf_slab_alloc()                      |
 169  *                      |                               |
 170  *                      v                               |
 171  *      +---------> [freelist] -------> (found?) -------+
 172  *      |               |
 173  *      |               v
 174  *      |           m_clalloc()
 175  *      |               |
 176  *      |               v
 177  *      +---<<---- kmem_mb_alloc()
 178  *
 179  * b. Composite object:
 180  *
 181  *      { m_getpackets_internal(), m_allocpacket_internal() }
 182  *                      |       ^
 183  *                      |       |
 184  *                      |       +------ (done) ---------+
 185  *                      v                               |
 186  *         mcache_alloc/mcache_alloc_ext()      mbuf_cslab_audit()
 187  *                      |                               ^
 188  *                      v                               |
 189  *                 [CPU cache] -------> (found?) -------+
 190  *                      |                               |
 191  *                      v                               |
 192  *               mbuf_cslab_alloc()                     |
 193  *                      |                               |
 194  *                      v                               |
 195  *                  [freelist] -------> (found?) -------+
 196  *                      |                               |
 197  *                      v                               |
 198  *              (rudimentary object)                    |
 199  *         mcache_alloc/mcache_alloc_ext() ------>>-----+
 200  *
 201  * Auditing notes: If auditing is enabled, buffers will be subjected to
 202  * integrity checks by the audit routine.  This is done by verifying their
 203  * contents against DEADBEEF (free) pattern before returning them to caller.
 204  * As part of this step, the routine will also record the transaction and
 205  * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 206  * also restore any constructed data structure fields if necessary.
 207  *
 208  * OBJECT DEALLOCATION:
 209  *
 210  * Freeing an object simply involves placing it into the CPU cache; this
 211  * pollutes the cache to benefit subsequent allocations.  The slab layer
 212  * will only be entered if the object is to be purged out of the cache.
 213  * During normal operations, this happens only when the CPU layer resizes
 214  * its bucket while it's adjusting to the allocation load.  Deallocation
 215  * paths are different depending on the class of objects:
 216  *
 217  * a. Rudimentary object:
 218  *
 219  *      { m_free(), m_freem_list(), composite object deallocation }
 220  *                      |       ^
 221  *                      |       |
 222  *                      |       +------ (done) ---------+
 223  *                      v                               |
 224  *         mcache_free/mcache_free_ext()                |
 225  *                      |                               |
 226  *                      v                               |
 227  *              mbuf_slab_audit()                       |
 228  *                      |                               |
 229  *                      v                               |
 230  *                 [CPU cache] ---> (not purging?) -----+
 231  *                      |                               |
 232  *                      v                               |
 233  *               mbuf_slab_free()                       |
 234  *                      |                               |
 235  *                      v                               |
 236  *                  [freelist] ----------->>------------+
 237  *       (objects get purged to VM only on demand)
 238  *
 239  * b. Composite object:
 240  *
 241  *      { m_free(), m_freem_list() }
 242  *                      |       ^
 243  *                      |       |
 244  *                      |       +------ (done) ---------+
 245  *                      v                               |
 246  *         mcache_free/mcache_free_ext()                |
 247  *                      |                               |
 248  *                      v                               |
 249  *              mbuf_cslab_audit()                      |
 250  *                      |                               |
 251  *                      v                               |
 252  *                 [CPU cache] ---> (not purging?) -----+
 253  *                      |                               |
 254  *                      v                               |
 255  *               mbuf_cslab_free()                      |
 256  *                      |                               |
 257  *                      v                               |
 258  *                  [freelist] ---> (not purging?) -----+
 259  *                      |                               |
 260  *                      v                               |
 261  *              (rudimentary object)                    |
 262  *         mcache_free/mcache_free_ext() ------->>------+
 263  *
 264  * Auditing notes: If auditing is enabled, the audit routine will save
 265  * any constructed data structure fields (if necessary) before filling the
 266  * contents of the buffers with DEADBEEF (free) pattern and recording the
 267  * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 268  * expected to contain the free pattern.
 269  *
 270  * DEBUGGING:
 271  *
 272  * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 273  * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 274  * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 275  * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 276  * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 277  * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 278  *
 279  * Each object is associated with exactly one mcache_audit_t structure that
 280  * contains the information related to its last buffer transaction.  Given
 281  * an address of an object, the audit structure can be retrieved by finding
 282  * the position of the object relative to the base address of the cluster:
 283  *
 284  *      +------------+                  +=============+
 285  *      | mbuf addr  |                  | mclaudit[i] |
 286  *      +------------+                  +=============+
 287  *            |                         | cl_audit[0] |
 288  *      i = MTOBG(addr)                 +-------------+
 289  *            |                 +-----> | cl_audit[1] | -----> mcache_audit_t
 290  *      b = BGTOM(i)            |       +-------------+
 291  *            |                 |       |     ...     |
 292  *      x = MCLIDX(b, addr)     |       +-------------+
 293  *            |                 |       | cl_audit[7] |
 294  *            +-----------------+       +-------------+
 295  *               (e.g. x == 1)
 296  *
 297  * The mclaudit[] array is allocated at initialization time, but its contents
 298  * get populated when the corresponding cluster is created.  Because a page
 299  * can be turned into NMBPG number of mbufs, we preserve enough space for the
 300  * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 301  * gets (or has not yet been) turned into mbufs will use only cl_audit[0],
 302  * with the remaining entries unused.  For a 16KB cluster, only one entry
 303  * from the first page is allocated and used for the entire object.
 304  */
 305
 306 /* TODO: should be in header file */
     /* External symbols declared here by hand rather than via a header. */
 307 /* kernel translator */
 308 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
 309 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
 310 extern vm_map_t mb_map;         /* special map */
 311
     /*
      * Bookkeeping for failed kmem allocations.
      * NOTE(review): judging by the names, the three counters track failed
      * contiguous, regular and single allocations -- confirm at the sites
      * that update them.
      */
 312 static uint32_t mb_kmem_contig_failed;
 313 static uint32_t mb_kmem_failed;
 314 static uint32_t mb_kmem_one_failed;
 315 /* Timestamp of allocation failures. */
 316 static uint64_t mb_kmem_contig_failed_ts;
 317 static uint64_t mb_kmem_failed_ts;
 318 static uint64_t mb_kmem_one_failed_ts;
     /* NOTE(review): presumably the sizes of the failed requests -- confirm. */
 319 static uint64_t mb_kmem_contig_failed_size;
 320 static uint64_t mb_kmem_failed_size;
     /*
      * Six counters and six labels.  Presumably mb_kmem_stats[i] is reported
      * under mb_kmem_stats_labels[i]; keep both arrays the same length.
      */
 321 static uint32_t mb_kmem_stats[6];
 322 static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
 323                                               "INVALID_ADDRESS",
 324                                               "RESOURCE_SHORTAGE",
 325                                               "NO_SPACE",
 326                                               "KERN_FAILURE",
 327                                               "OTHERS" };
 328
 329 /* Global lock; mbuf_mlock points at the statically declared mbuf_mlock_data */
 330 decl_lck_mtx_data(static, mbuf_mlock_data);
 331 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
     /* Lock attribute, group and group attribute (by name: for the global lock) */
 332 static lck_attr_t *mbuf_mlock_attr;
 333 static lck_grp_t *mbuf_mlock_grp;
 334 static lck_grp_attr_t *mbuf_mlock_grp_attr;
 335
 336 /* Back-end (common) layer */
     /*
      * NOTE(review): the mb_expand_* counters presumably track expansion
      * events overall and per cluster class (cl, bigcl, 16kcl) -- confirm
      * at the sites that update them.
      */
 337 static uint64_t mb_expand_cnt;
 338 static uint64_t mb_expand_cl_cnt;
 339 static uint64_t mb_expand_cl_total;
 340 static uint64_t mb_expand_bigcl_cnt;
 341 static uint64_t mb_expand_bigcl_total;
 342 static uint64_t mb_expand_16kcl_cnt;
 343 static uint64_t mb_expand_16kcl_total;
 344 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
     /* NOTE(review): presumably worker run count and last run time -- confirm. */
 345 static uint32_t mbuf_worker_run_cnt;
 346 static uint64_t mbuf_worker_last_runtime;
 347 static int mbuf_worker_ready;   /* worker thread is runnable */
 348 static int ncpu;                /* number of CPUs */
 349 static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
 350 static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
 351 static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
 352 static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
 353 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
 354 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
 355 static unsigned int mb_normalized; /* number of packets "normalized" */
 356
     /* Growth modes; the trailing comments give each mode's threshold. */
 357 #define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
 358 #define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */
 359
     /*
      * Buffer classes.  The values double as indices into mbuf_table[], whose
      * entries are listed in the same order.  Classes up to MC_16KCL are the
      * rudimentary ones; the MC_MBUF_* classes after it are composite
      * (see MBUF_CLASS_COMPOSITE).
      */
 360 typedef enum {
 361         MC_MBUF = 0,    /* Regular mbuf */
 362         MC_CL,          /* Cluster */
 363         MC_BIGCL,       /* Large (4KB) cluster */
 364         MC_16KCL,       /* Jumbo (16KB) cluster */
 365         MC_MBUF_CL,     /* mbuf + cluster */
 366         MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
 367         MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
 368 } mbuf_class_t;
 369
     /*
      * Class range helpers: MBUF_CLASS_MIN/MAX bound the valid classes,
      * MBUF_CLASS_LAST is the last rudimentary class, and any class ordered
      * after it is treated as composite.  Both predicates compare after
      * casting their argument to int.
      */
 370 #define MBUF_CLASS_MIN          MC_MBUF
 371 #define MBUF_CLASS_MAX          MC_MBUF_16KCL
 372 #define MBUF_CLASS_LAST         MC_16KCL
 373 #define MBUF_CLASS_VALID(c) \
 374         ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
 375 #define MBUF_CLASS_COMPOSITE(c) \
 376         ((int)(c) > MBUF_CLASS_LAST)
 377
 378
 379 /*
 380  * mbuf specific mcache allocation request flags.
      *
      * MCR_COMP is defined as the mcache flag MCR_USR1.
 381  */
 382 #define MCR_COMP        MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
 383
 384 /*
 385  * Per-cluster slab structure.
 386  *
 387  * A slab is a cluster control structure that contains one or more object
 388  * chunks; the available chunks are chained in the slab's freelist (sl_head).
 389  * Each time a chunk is taken out of the slab, the slab's reference count
 390  * gets incremented.  When all chunks have been taken out, the empty slab
 391  * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 392  * returned to a slab causes the slab's reference count to be decremented;
 393  * it also causes the slab to be reinserted into the class's slab list, if
 394  * it's not already there.
 395  *
 396  * Compartmentalizing of the object chunks into slabs allows us to easily
 397  * merge one or more slabs together when the adjacent slabs are idle, as
 398  * well as to convert or move a slab from one class to another; e.g. the
 399  * mbuf cluster slab can be converted to a regular cluster slab when all
 400  * mbufs in the slab have been freed.
 401  *
 402  * A slab may also span across multiple clusters for chunks larger than
 403  * a cluster's size.  In this case, only the slab of the first cluster is
 404  * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 405  * that they are part of the larger slab.
 406  *
 407  * Each slab controls a page of memory.
 408  */
 409 typedef struct mcl_slab {
 410         struct mcl_slab *sl_next;       /* neighboring slab */
 411         u_int8_t        sl_class;       /* controlling mbuf class */
             /* sl_refcnt and sl_chunks are signed 8-bit counters */
 412         int8_t          sl_refcnt;      /* outstanding allocations */
 413         int8_t          sl_chunks;      /* chunks (bufs) in this slab */
             /* sl_flags holds the SLF_* bits */
 414         u_int16_t       sl_flags;       /* slab flags (see below) */
 415         u_int16_t       sl_len;         /* slab length */
 416         void            *sl_base;       /* base of allocated memory */
 417         void            *sl_head;       /* first free buffer */
             /* sl_link ties the slab into a TAILQ such as mtbl_slablist */
 418         TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
 419 } mcl_slab_t;
 420
     /* Bit values for mcl_slab_t.sl_flags. */
 421 #define SLF_MAPPED      0x0001          /* backed by a mapped page */
 422 #define SLF_PARTIAL     0x0002          /* part of another slab */
 423 #define SLF_DETACHED    0x0004          /* not in slab freelist */
 424
 425 /*
 426  * The array of slabs is broken into groups of arrays per 1MB of kernel
 427  * memory to reduce the footprint.  Each group is allocated on demand
 428  * whenever a new piece of memory mapped in from the VM crosses the 1MB
 429  * boundary.
      *
      * NSLABSPMB is (1 << MBSHIFT) bytes expressed in pages.
 430  */
 431 #define NSLABSPMB       ((1 << MBSHIFT) >> PAGE_SHIFT)
 432
     /*
      * A slab group; slg_slab points to the group's slabs.
      * NOTE(review): presumably NSLABSPMB slabs per group -- confirm at the
      * allocation site.
      */
 433 typedef struct mcl_slabg {
 434         mcl_slab_t      *slg_slab;      /* group of slabs */
 435 } mcl_slabg_t;
 436
 437 /*
 438  * Number of slabs needed to control a 16KB cluster object, i.e.
      * M16KCLBYTES expressed in pages.
 439  */
 440 #define NSLABSP16KB     (M16KCLBYTES >> PAGE_SHIFT)
 441
 442 /*
 443  * Per-cluster audit structure.  cl_audit points to an array of pointers
      * to mcache_audit_t.
 444  */
 445 typedef struct {
 446         mcache_audit_t  **cl_audit;     /* array of audits */
 447 } mcl_audit_t;
 448
     /*
      * Scratch audit record: describes the current and the previous
      * transaction (thread, timestamp, pc stack).  Embedded in
      * mcl_saved_contents_t as sc_scratch.
      */
 449 typedef struct {
 450         struct thread   *msa_thread;    /* thread doing transaction */
 451         struct thread   *msa_pthread;   /* previous transaction thread */
 452         uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
 453         uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
 454         uint16_t        msa_depth;      /* pc stack depth */
 455         uint16_t        msa_pdepth;     /* previous transaction pc stack */
             /* pc stacks, each with room for MCACHE_STACK_DEPTH entries */
 456         void            *msa_stack[MCACHE_STACK_DEPTH];
 457         void            *msa_pstack[MCACHE_STACK_DEPTH];
 458 } mcl_scratch_audit_t;
 459
     /*
      * Saved contents kept for an audited object: the shadow mbuf bytes
      * (sc_mbuf) followed by the scratch audit record (sc_scratch).
      */
 460 typedef struct {
 461         /*
 462          * Size of data from the beginning of an mbuf that covers m_hdr,
 463          * pkthdr and m_ext structures.  If auditing is enabled, we allocate
 464          * a shadow mbuf structure of this size inside each audit structure,
 465          * and the contents of the real mbuf get copied into it when the mbuf
 466          * is freed.  This allows us to pattern-fill the mbuf for integrity
 467          * check, and to preserve any constructed mbuf fields (e.g. mbuf +
 468          * cluster cache case).  Note that we don't save the contents of
 469          * clusters when they are freed; we simply pattern-fill them.
 470          */
 471         u_int8_t                sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
             /* the scratch record is forced to 8-byte alignment */
 472         mcl_scratch_audit_t     sc_scratch __attribute__((aligned(8)));
 473 } mcl_saved_contents_t;
 474
     /* Size in bytes of one mcl_saved_contents_t. */
 475 #define AUDIT_CONTENTS_SIZE     (sizeof (mcl_saved_contents_t))
 476
     /*
      * Accessors that treat (_mca)->mca_contents as an mcl_saved_contents_t:
      *   MCA_SAVED_MBUF_PTR    - sc_mbuf viewed as a struct mbuf pointer
      *   MCA_SAVED_MBUF_SIZE   - size in bytes of sc_mbuf (sizeof only, the
      *                           null pointer is never dereferenced)
      *   MCA_SAVED_SCRATCH_PTR - address of sc_scratch
      */
 477 #define MCA_SAVED_MBUF_PTR(_mca)                                        \
 478         ((struct mbuf *)(void *)((mcl_saved_contents_t *)               \
 479         (_mca)->mca_contents)->sc_mbuf)
 480 #define MCA_SAVED_MBUF_SIZE                                             \
 481         (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
 482 #define MCA_SAVED_SCRATCH_PTR(_mca)                                     \
 483         (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
 484
 485 /*
 486  * mbuf specific mcache audit flags.  Each value is a distinct bit, so
      * the flags can be combined.
 487  */
 488 #define MB_INUSE        0x01    /* object has not been returned to slab */
 489 #define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
 490 #define MB_SCVALID      0x04    /* object has valid saved contents */
 491
 492 /*
 493  * Each of the following two arrays holds up to nmbclusters elements.
 494  */
 495 static mcl_audit_t *mclaudit;   /* array of cluster audit information */
 496 static unsigned int maxclaudit; /* max # of entries in audit table */
 497 static mcl_slabg_t **slabstbl;  /* cluster slabs table */
 498 static unsigned int maxslabgrp; /* max # of entries in slabs table */
 499 static unsigned int slabgrp;    /* # of entries in slabs table */
 500
 501 /* Globals (declared without 'static', so they have external linkage) */
 502 int nclusters;                  /* # of clusters for non-jumbo (legacy) sizes */
 503 int njcl;                       /* # of clusters for jumbo sizes */
 504 int njclbytes;                  /* size of a jumbo cluster */
 505 unsigned char *mbutl;           /* first mapped cluster address */
 506 unsigned char *embutl;          /* ending virtual address of mclusters */
 507 int _max_linkhdr;               /* largest link-level header */
 508 int _max_protohdr;              /* largest protocol header */
 509 int max_hdr;                    /* largest link+protocol header */
 510 int max_datalen;                /* MHLEN - max_hdr */
 510 int max_datalen;                /* MHLEN - max_hdr */
 511
     /* File-local debug switches. */
 512 static boolean_t mclverify;     /* debug: pattern-checking */
 513 static boolean_t mcltrace;      /* debug: stack tracing */
 514 static boolean_t mclfindleak;   /* debug: leak detection */
 515 static boolean_t mclexpleak;    /* debug: expose leak info to user space */
 516
 517 static struct timeval mb_start; /* beginning of time */
 518
 519 /* mbuf leak detection variables */
 520 static struct mleak_table mleak_table;
 521 static mleak_stat_t *mleak_stat;
 522
     /*
      * Offset of ml_trace[n] within mleak_stat_t, i.e. the bytes needed for
      * everything before ml_trace plus n trace entries (assuming ml_trace
      * is the trailing array member -- confirm in the header).
      */
 523 #define MLEAK_STAT_SIZE(n) \
 524         __builtin_offsetof(mleak_stat_t, ml_trace[n])
 525
     /* Element type of the mleak_allocations hash map. */
 526 struct mallocation {
 527         mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
 528         u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
 529         u_int32_t count;        /* How many objects were requested */
 530         u_int64_t hitcount;     /* for determining hash effectiveness */
 531 };
 532
     /*
      * Element type of the mleak_traces hash map: counters plus a recorded
      * backtrace of at most MLEAK_STACK_DEPTH addresses.
      * NOTE(review): depth presumably gives the number of valid addr[]
      * slots -- confirm at the recording site.
      */
 533 struct mtrace {
 534         u_int64_t       collisions;
 535         u_int64_t       hitcount;
 536         u_int64_t       allocs;
 537         u_int64_t       depth;
 538         uintptr_t       addr[MLEAK_STACK_DEPTH];
 539 };
 540
 541 /* Sizes must be powers of two so that the hash can simply mask off bits */
     /* These are the initial values of mleak_alloc_buckets / mleak_trace_buckets */
 542 #define MLEAK_ALLOCATION_MAP_NUM        512
 543 #define MLEAK_TRACE_MAP_NUM             256
 544
 545 /*
 546  * Sample factor for how often to record a trace.  This can be overridden
 547  * by the boot-arg mleak_sample_factor.
 548  */
 549 #define MLEAK_SAMPLE_FACTOR             500
 550
 551 /*
 552  * Number of top leakers recorded.
 553  */
 554 #define MLEAK_NUM_TRACES                5
 555
     /*
      * Padding and column-header strings in "_32" and "_64" variants.  The
      * headers label five trace columns.
      * NOTE(review): presumably used when printing the top leak traces with
      * 32-bit vs 64-bit wide addresses -- confirm at the use sites.
      */
 556 #define MB_LEAK_SPACING_64 "                    "
 557 #define MB_LEAK_SPACING_32 "            "
 558
 559
 560 #define MB_LEAK_HDR_32  "\n\
 561     trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
 562     ----------  ----------  ----------  ----------  ---------- \n\
 563 "
 564
 565 #define MB_LEAK_HDR_64  "\n\
 566     trace [1]           trace [2]           trace [3]       \
 567         trace [4]           trace [5]      \n\
 568     ------------------  ------------------  ------------------  \
 569     ------------------  ------------------ \n\
 570 "
 571
     /* Bucket counts for the two hash maps, starting at the compile-time sizes */
 572 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
 573 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
 574
 575 /* Hashmaps of allocations and their corresponding traces */
 576 static struct mallocation *mleak_allocations;
 577 static struct mtrace *mleak_traces;
     /* One slot per recorded top leaker (MLEAK_NUM_TRACES of them) */
 578 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
 579
 580 /* Lock to protect mleak tables from concurrent modification */
 581 decl_lck_mtx_data(static, mleak_lock_data);
 582 static lck_mtx_t *mleak_lock = &mleak_lock_data;
 583 static lck_attr_t *mleak_lock_attr;
 584 static lck_grp_t *mleak_lock_grp;
 585 static lck_grp_attr_t *mleak_lock_grp_attr;
 586
 587 /* *Failed* large allocations. */
 588 struct mtracelarge {
             /* NOTE(review): presumably the size of the failed request */
 589         uint64_t        size;
             /* NOTE(review): presumably the number of valid addr[] slots */
 590         uint64_t        depth;
 591         uintptr_t       addr[MLEAK_STACK_DEPTH];
 592 };
 593
     /* Fixed-size table holding MTRACELARGE_NUM_TRACES such records */
 594 #define MTRACELARGE_NUM_TRACES          5
 595 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
 596
     /* Forward declaration; the definition is not in this part of the file */
 597 static void mtracelarge_register(size_t size);
 598
 599 /* Lock to protect the completion callback table */
 600 static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL;
 601 static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL;
 602 static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL;
     /*
      * Read/write lock.  Unlike the mutexes in this file, the lock data is
      * declared with an empty storage-class argument and the pointer below
      * is not static.
      */
 603 decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data);
 604 lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data;
 605
     /* Defined outside this file */
 606 extern u_int32_t high_sb_max;
 607
 608 /* The minimum number of objects that are allocated, to start. */
 609 #define MINCL           32
 610 #define MINBIGCL        (MINCL >> 1)    /* half of MINCL: 16 */
 611 #define MIN16KCL        (MINCL >> 2)    /* a quarter of MINCL: 8 */
 612
 613 /* Low watermarks (only map in pages once free counts go below) */
 614 #define MBIGCL_LOWAT    MINBIGCL
 615 #define M16KCL_LOWAT    MIN16KCL
 616
     /*
      * Per-class control block; mbuf_table[] holds one entry per
      * mbuf_class_t.  Field order matters because mbuf_table[] is
      * initialised positionally.
      */
 617 typedef struct {
 618         mbuf_class_t    mtbl_class;     /* class type */
 619         mcache_t        *mtbl_cache;    /* mcache for this buffer class */
 620         TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
 621         mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist */
 622         mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
 623         u_int32_t       mtbl_maxsize;   /* maximum buffer size */
 624         int             mtbl_minlimit;  /* minimum allowed */
 625         int             mtbl_maxlimit;  /* maximum allowed */
 626         u_int32_t       mtbl_wantpurge; /* purge during next reclaim */
 627         uint32_t        mtbl_avgtotal;  /* average total on iOS */
 628         u_int32_t       mtbl_expand;    /* worker should expand the class */
 629 } mbuf_table_t;
 630
     /*
      * Shorthand accessors for mbuf_table[c]; each expands to an lvalue.
      * m_cname() through m_release_cnt() dereference mtbl_stats, which is
      * NULL in the static initialiser of mbuf_table[], so they are only
      * usable once that pointer has been set.  The remaining macros read
      * fields of the table entry itself.
      */
 631 #define m_class(c)      mbuf_table[c].mtbl_class
 632 #define m_cache(c)      mbuf_table[c].mtbl_cache
 633 #define m_slablist(c)   mbuf_table[c].mtbl_slablist
 634 #define m_cobjlist(c)   mbuf_table[c].mtbl_cobjlist
 635 #define m_maxsize(c)    mbuf_table[c].mtbl_maxsize
 636 #define m_minlimit(c)   mbuf_table[c].mtbl_minlimit
 637 #define m_maxlimit(c)   mbuf_table[c].mtbl_maxlimit
 638 #define m_wantpurge(c)  mbuf_table[c].mtbl_wantpurge
 639 #define m_cname(c)      mbuf_table[c].mtbl_stats->mbcl_cname
 640 #define m_size(c)       mbuf_table[c].mtbl_stats->mbcl_size
 641 #define m_total(c)      mbuf_table[c].mtbl_stats->mbcl_total
 642 #define m_active(c)     mbuf_table[c].mtbl_stats->mbcl_active
 643 #define m_infree(c)     mbuf_table[c].mtbl_stats->mbcl_infree
 644 #define m_slab_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_slab_cnt
 645 #define m_alloc_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
 646 #define m_free_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_free_cnt
 647 #define m_notified(c)   mbuf_table[c].mtbl_stats->mbcl_notified
 648 #define m_purge_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_purge_cnt
 649 #define m_fail_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_fail_cnt
 650 #define m_ctotal(c)     mbuf_table[c].mtbl_stats->mbcl_ctotal
 651 #define m_peak(c)       mbuf_table[c].mtbl_stats->mbcl_peak_reported
 652 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
 653 #define m_region_expand(c)      mbuf_table[c].mtbl_expand
 654
 655 static mbuf_table_t mbuf_table[] = {
 656         /*
 657          * The caches for mbufs, regular clusters and big clusters.
 658          * The average total values were based on data gathered by actual
 659          * usage patterns on iOS.
 660          */
 661         { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
 662             NULL, NULL, 0, 0, 0, 0, 3000, 0 },
 663         { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
 664             NULL, NULL, 0, 0, 0, 0, 2000, 0 },
 665         { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
 666             NULL, NULL, 0, 0, 0, 0, 1000, 0 },
 667         { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
 668             NULL, NULL, 0, 0, 0, 0, 200, 0 },
 669         /*
 670          * The following are special caches; they serve as intermediate
 671          * caches backed by the above rudimentary caches.  Each object
 672          * in the cache is an mbuf with a cluster attached to it.  Unlike
 673          * the above caches, these intermediate caches do not directly
 674          * deal with the slab structures; instead, the constructed
 675          * cached elements are simply stored in the freelists.
 676          */
 677         { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
 678         { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
 679         { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
 680 };
 681
 682 #define NELEM(a)        (sizeof (a) / sizeof ((a)[0]))
 683
 684
 685 static uint32_t
 686 m_avgtotal(mbuf_class_t c)
 687 {
 688         return (mbuf_table[c].mtbl_avgtotal);
 689 }
 690
 691 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
 692 static int mb_waiters;                  /* number of waiters */
 693
/*
 * NOTE(review): the readers and writers of these two flags are outside
 * this section; presumably they drive the peak usage reporting done by
 * mbuf_report_peak_usage() -- confirm before relying on that.
 */
 694 boolean_t mb_peak_newreport = FALSE;
 695 boolean_t mb_peak_firstreport = FALSE;
 696
 697 /* generate a report by default after 1 week of uptime */
/* 604800 = 7 * 24 * 60 * 60, i.e. one week expressed in seconds */
 698 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD        604800
 699
 700 #define MB_WDT_MAXTIME  10              /* # of secs before watchdog panic */
 701 static struct timeval mb_wdtstart;      /* watchdog start timestamp */
/* NOTE(review): presumably the buffer filled by mbuf_dump() -- confirm */
 702 static char *mbuf_dump_buf;
 703
/* NOTE(review): presumably the size in bytes of mbuf_dump_buf -- confirm */
 704 #define MBUF_DUMP_BUF_SIZE      4096
 705
 706 /*
 707  * mbuf watchdog is enabled by default on embedded platforms.  It is
 708  * also togglable via the kern.ipc.mb_watchdog sysctl.
 709  * Garbage collection is also enabled by default on embedded platforms.
 710  * mb_drain_maxint controls the amount of time to wait (in seconds) before
 711  * consecutive calls to m_drain().
 712  */
 713 #if CONFIG_EMBEDDED
 714 static unsigned int mb_watchdog = 1;
 715 static unsigned int mb_drain_maxint = 60;
 716 #else
 717 static unsigned int mb_watchdog = 0;
 718 static unsigned int mb_drain_maxint = 0;
 719 #endif /* CONFIG_EMBEDDED */
 720
/*
 * NOTE(review): judging by the names these hold values used to obscure
 * the external free routine and ext_ref pointers; where they are set
 * and applied is not visible here -- confirm.
 */
 721 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
 722 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
 723
 724 /* Red zone */
 725 static u_int32_t mb_redzone_cookie;
 726 static void m_redzone_init(struct mbuf *);
 727 static void m_redzone_verify(struct mbuf *m);
 728
 729 /* The following are used to serialize m_clalloc() */
 730 static boolean_t mb_clalloc_busy;
/* the wait channel is simply the address of the busy flag above */
 731 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
 732 static int mb_clalloc_waiters;
 733
 734 static void mbuf_mtypes_sync(boolean_t);
 735 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
 736 static void mbuf_stat_sync(void);
 737 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
 738 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
 739 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
 740 static char *mbuf_dump(void);
 741 static void mbuf_table_init(void);
 742 static inline void m_incref(struct mbuf *);
 743 static inline u_int16_t m_decref(struct mbuf *);
 744 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
 745 static void mbuf_worker_thread_init(void);
 746 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
 747 static void slab_free(mbuf_class_t, mcache_obj_t *);
 748 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
 749     unsigned int, int);
 750 static void mbuf_slab_free(void *, mcache_obj_t *, int);
 751 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
 752 static void mbuf_slab_notify(void *, u_int32_t);
 753 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
 754     unsigned int);
 755 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
 756 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
 757     unsigned int, int);
 758 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
 759 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
 760 static int freelist_populate(mbuf_class_t, unsigned int, int);
 761 static void freelist_init(mbuf_class_t);
 762 static boolean_t mbuf_cached_above(mbuf_class_t, int);
 763 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
 764 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
 765 static int m_howmany(int, size_t);
 766 static void mbuf_worker_thread(void);
 767 static void mbuf_watchdog(void);
 768 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
 769
 770 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
 771     size_t, unsigned int);
 772 static void mcl_audit_free(void *, unsigned int);
 773 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
 774 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
 775 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
 776     boolean_t);
 777 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
 778 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
 779 static void mcl_audit_scratch(mcache_audit_t *);
 780 static void mcl_audit_mcheck_panic(struct mbuf *);
 781 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
 782
 783 static void mleak_activate(void);
 784 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
 785 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
 786 static void mleak_free(mcache_obj_t *);
 787 static void mleak_sort_traces(void);
 788 static void mleak_update_stats(void);
 789
 790 static mcl_slab_t *slab_get(void *);
 791 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
 792     void *, void *, unsigned int, int, int);
 793 static void slab_insert(mcl_slab_t *, mbuf_class_t);
 794 static void slab_remove(mcl_slab_t *, mbuf_class_t);
 795 static boolean_t slab_inrange(mcl_slab_t *, void *);
 796 static void slab_nextptr_panic(mcl_slab_t *, void *);
 797 static void slab_detach(mcl_slab_t *);
 798 static boolean_t slab_is_detached(mcl_slab_t *);
 799
 800 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
 801 static struct mbuf *m_split0(struct mbuf *, int, int, int);
 802 __private_extern__ void mbuf_report_peak_usage(void);
 803 static boolean_t mbuf_report_usage(mbuf_class_t);
 804
 805 /* flags for m_copyback0 */
 806 #define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
 807 #define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
 808 #define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
 809 #define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
 810
 811 /*
 812  * This flag is set for all mbufs that come out of and into the composite
 813  * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 814  * are marked with such a flag have clusters attached to them, and will be
 815  * treated differently when they are freed; instead of being placed back
 816  * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 817  * are placed back into the appropriate composite cache's freelist, and the
 818  * actual freeing is deferred until the composite objects are purged.  At
 819  * such a time, this flag will be cleared from the mbufs and the objects
 820  * will be freed into their own separate freelists.
 821  */
 822 #define EXTF_COMPOSITE  0x1
 823
 824 /*
 825  * This flag indicates that the external cluster is read-only, i.e. it is
 826  * or was referred to by more than one mbufs.  Once set, this flag is never
 827  * cleared.
 828  */
 829 #define EXTF_READONLY   0x2
 830 /*
 831  * This flag indicates that the external cluster is paired with the mbuf.
 832  * Pairing implies an external free routine defined which will be invoked
 833  * when the reference count drops to the minimum at m_free time.  This
 834  * flag is never cleared.
 835  */
 836 #define EXTF_PAIRED     0x4
 837
 838 #define EXTF_MASK       \
 839         (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
 840
 841 #define MEXT_MINREF(m)          ((m_get_rfa(m))->minref)
 842 #define MEXT_REF(m)             ((m_get_rfa(m))->refcnt)
 843 #define MEXT_PREF(m)            ((m_get_rfa(m))->prefcnt)
 844 #define MEXT_FLAGS(m)           ((m_get_rfa(m))->flags)
 845 #define MEXT_PRIV(m)            ((m_get_rfa(m))->priv)
 846 #define MEXT_PMBUF(m)           ((m_get_rfa(m))->paired)
 847 #define MEXT_TOKEN(m)           ((m_get_rfa(m))->ext_token)
 848 #define MBUF_IS_COMPOSITE(m)                                            \
 849         (MEXT_REF(m) == MEXT_MINREF(m) &&                               \
 850         (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
 851 /*
 852  * This macro can be used to test if the mbuf is paired to an external
 853  * cluster.  The test for MEXT_PMBUF being equal to the mbuf in subject
 854  * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
 855  * and thus survives calls to m_free_paired.
 856  */
 857 #define MBUF_IS_PAIRED(m)                                               \
 858         (((m)->m_flags & M_EXT) &&                                      \
 859         (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED &&                   \
 860         MEXT_PMBUF(m) == (m))
 861
 862 /*
 863  * Macros used to verify the integrity of the mbuf.
 864  */
 865 #define _MCHECK(m) {                                                    \
 866         if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) {             \
 867                 if (mclaudit == NULL)                                   \
 868                         panic("MCHECK: m_type=%d m=%p",                 \
 869                             (u_int16_t)(m)->m_type, m);                 \
 870                 else                                                    \
 871                         mcl_audit_mcheck_panic(m);                      \
 872         }                                                               \
 873 }
 874
 875 #define MBUF_IN_MAP(addr)                                               \
 876         ((unsigned char *)(addr) >= mbutl &&                            \
 877         (unsigned char *)(addr) < embutl)
 878
 879 #define MRANGE(addr) {                                                  \
 880         if (!MBUF_IN_MAP(addr))                                         \
 881                 panic("MRANGE: address out of range 0x%p", addr);       \
 882 }
 883
 884 /*
 885  * Macro version of mtod.
 886  */
 887 #define MTOD(m, t)      ((t)((m)->m_data))
 888
 889 /*
 890  * Macros to obtain page index given a base cluster address
 891  */
 892 #define MTOPG(x)        (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
 893 #define PGTOM(x)        (mbutl + (x << PAGE_SHIFT))
 894
 895 /*
 896  * Macro to find the mbuf index relative to a base.
 897  */
 898 #define MBPAGEIDX(c, m) \
 899         (((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)
 900
 901 /*
 902  * Same thing for 2KB cluster index.
 903  */
 904 #define CLPAGEIDX(c, m) \
 905         (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
 906
 907 /*
 908  * Macro to find 4KB cluster index relative to a base
 909  */
 910 #define BCLPAGEIDX(c, m) \
 911         (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
 912
 913 /*
 914  * Macros used during mbuf and cluster initialization.
 915  */
 916 #define MBUF_INIT_PKTHDR(m) {                                           \
 917         (m)->m_pkthdr.rcvif = NULL;                                     \
 918         (m)->m_pkthdr.pkt_hdr = NULL;                                   \
 919         (m)->m_pkthdr.len = 0;                                          \
 920         (m)->m_pkthdr.csum_flags = 0;                                   \
 921         (m)->m_pkthdr.csum_data = 0;                                    \
 922         (m)->m_pkthdr.vlan_tag = 0;                                     \
 923         m_classifier_init(m, 0);                                        \
 924         m_tag_init(m, 1);                                               \
 925         m_scratch_init(m);                                              \
 926         m_redzone_init(m);                                              \
 927 }
 928
 929 #define MBUF_INIT(m, pkthdr, type) {                                    \
 930         _MCHECK(m);                                                     \
 931         (m)->m_next = (m)->m_nextpkt = NULL;                            \
 932         (m)->m_len = 0;                                                 \
 933         (m)->m_type = type;                                             \
 934         if ((pkthdr) == 0) {                                            \
 935                 (m)->m_data = (m)->m_dat;                               \
 936                 (m)->m_flags = 0;                                       \
 937         } else {                                                        \
 938                 (m)->m_data = (m)->m_pktdat;                            \
 939                 (m)->m_flags = M_PKTHDR;                                \
 940                 MBUF_INIT_PKTHDR(m);                                    \
 941         }                                                               \
 942 }
 943
 944 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag,   \
 945     priv, pm) {                                                         \
 946         (m)->m_data = (m)->m_ext.ext_buf = (buf);                       \
 947         (m)->m_flags |= M_EXT;                                          \
 948         m_set_ext((m), (rfa), (free), (arg));                           \
 949         (m)->m_ext.ext_size = (size);                                   \
 950         MEXT_MINREF(m) = (min);                                         \
 951         MEXT_REF(m) = (ref);                                            \
 952         MEXT_PREF(m) = (pref);                                          \
 953         MEXT_FLAGS(m) = (flag);                                         \
 954         MEXT_PRIV(m) = (priv);                                          \
 955         MEXT_PMBUF(m) = (pm);                                           \
 956 }
 957
 958 #define MBUF_CL_INIT(m, buf, rfa, ref, flag)    \
 959         MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0,         \
 960             ref, 0, flag, 0, NULL)
 961
 962 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
 963         MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
 964             ref, 0, flag, 0, NULL)
 965
 966 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
 967         MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
 968             ref, 0, flag, 0, NULL)
 969
 970 /*
 971  * Macro to convert BSD malloc sleep flag to mcache's
 972  */
 973 #define MSLEEPF(f)      ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
 974
 975 /*
 976  * The structure that holds all mbuf class statistics exportable via sysctl.
 977  * Similar to mbstat structure, the mb_stat structure is protected by the
 978  * global mbuf lock.  It contains additional information about the classes
 979  * that allows for a more accurate view of the state of the allocator.
 980  */
 981 struct mb_stat *mb_stat;
 982 struct omb_stat *omb_stat;      /* For backwards compatibility */
 983
 984 #define MB_STAT_SIZE(n) \
 985         __builtin_offsetof(mb_stat_t, mbs_class[n])
 986 #define OMB_STAT_SIZE(n) \
 987         ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
 988
 989 /*
 990  * The legacy structure holding all of the mbuf allocation statistics.
 991  * The actual statistics used by the kernel are stored in the mbuf_table
 992  * instead, and are updated atomically while the global mbuf lock is held.
 993  * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 994  * Unlike before, the kernel no longer relies on the contents of mbstat for
 995  * its operations (e.g. cluster expansion) because the structure is exposed
 996  * to outside and could possibly be modified, therefore making it unsafe.
 997  * With the exception of the mbstat.m_mtypes array (see below), all of the
 998  * statistics are updated as they change.
 999  */
1000 struct mbstat mbstat;
1001
1002 #define MBSTAT_MTYPES_MAX \
1003         (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1004
1005 /*
1006  * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1007  * atomically and stored in a per-CPU structure which is lock-free; this is
1008  * done in order to avoid writing to the global mbstat data structure which
1009  * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
1010  * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1011  * array and returned to the application.  Any updates for types greater or
1012  * equal than MT_MAX would be done atomically to the mbstat; this slows down
1013  * performance but is okay since the kernel uses only up to MT_MAX-1 while
1014  * anything beyond that (up to type 255) is considered a corner case.
1015  */
/*
 * Counters for a single CPU, one slot per mbuf type below MT_MAX.  The
 * type is aligned to MAX_CPU_CACHE_LINE_SIZE.
 */
1016 typedef struct {
1017         unsigned int    cpu_mtypes[MT_MAX];
1018 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
1019
/*
 * Container for the per-CPU counters.  mbs_cpu is declared with a single
 * element, but MTYPES_CPU() below reaches the entry for cpu_number() by
 * byte offset, so the storage behind mbuf_mtypes is expected to hold more
 * than one entry.
 * NOTE(review): the allocation site is outside this section -- confirm
 * that it sizes the buffer for every CPU (e.g. MBUF_MTYPES_SIZE(ncpu)).
 */
1020 typedef struct {
1021         mtypes_cpu_t    mbs_cpu[1];
1022 } mbuf_mtypes_t;
1023
1024 static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
1025
/* Byte offset of mbs_cpu[n] from the start of an mbuf_mtypes_t */
1026 #define MBUF_MTYPES_SIZE(n) \
1027         ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
1028
/*
 * Pointer to the counters of the CPU returned by cpu_number(), computed
 * as the base address (p) plus the byte offset of that CPU's entry.
 */
1029 #define MTYPES_CPU(p) \
1030         ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
1031
/*
 * Add n to the counter of mbuf type (type).  Types below MT_MAX are
 * counted in the current CPU's mtypes_cpu_t with a 32-bit atomic add;
 * types from MT_MAX up to MBSTAT_MTYPES_MAX-1 are counted directly in
 * mbstat.m_mtypes with a 16-bit atomic add; anything larger is ignored.
 *
 * Wrapped in do { } while (0) so that an invocation followed by a
 * semicolon is a single statement and is safe inside an unbraced
 * if/else; this carries over to the sub/inc/dec wrappers below.
 */
#define mtype_stat_add(type, n) do {                                    \
        if ((unsigned)(type) < MT_MAX) {                                \
                mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);            \
                atomic_add_32(&mbs->cpu_mtypes[type], n);               \
        } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
                atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
        }                                                               \
} while (0)

#define mtype_stat_sub(t, n)    mtype_stat_add(t, -(n))
#define mtype_stat_inc(t)       mtype_stat_add(t, 1)
#define mtype_stat_dec(t)       mtype_stat_sub(t, 1)
1044
1045 static void
1046 mbuf_mtypes_sync(boolean_t locked)
1047 {
1048         int m, n;
1049         mtypes_cpu_t mtc;
1050
1051         if (locked)
1052                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1053
1054         bzero(&mtc, sizeof (mtc));
1055         for (m = 0; m < ncpu; m++) {
1056                 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
1057                 mtypes_cpu_t temp;
1058
1059                 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
1060                     sizeof (temp.cpu_mtypes));
1061
1062                 for (n = 0; n < MT_MAX; n++)
1063                         mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
1064         }
1065         if (!locked)
1066                 lck_mtx_lock(mbuf_mlock);
1067         for (n = 0; n < MT_MAX; n++)
1068                 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1069         if (!locked)
1070                 lck_mtx_unlock(mbuf_mlock);
1071 }
1072
1073 static int
1074 mbstat_sysctl SYSCTL_HANDLER_ARGS
1075 {
1076 #pragma unused(oidp, arg1, arg2)
1077         mbuf_mtypes_sync(FALSE);
1078
1079         return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
1080 }
1081
1082 static void
1083 mbuf_stat_sync(void)
1084 {
1085         mb_class_stat_t *sp;
1086         mcache_cpu_t *ccp;
1087         mcache_t *cp;
1088         int k, m, bktsize;
1089
1090         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1091
1092         for (k = 0; k < NELEM(mbuf_table); k++) {
1093                 cp = m_cache(k);
1094                 ccp = &cp->mc_cpu[0];
1095                 bktsize = ccp->cc_bktsize;
1096                 sp = mbuf_table[k].mtbl_stats;
1097
1098                 if (cp->mc_flags & MCF_NOCPUCACHE)
1099                         sp->mbcl_mc_state = MCS_DISABLED;
1100                 else if (cp->mc_purge_cnt > 0)
1101                         sp->mbcl_mc_state = MCS_PURGING;
1102                 else if (bktsize == 0)
1103                         sp->mbcl_mc_state = MCS_OFFLINE;
1104                 else
1105                         sp->mbcl_mc_state = MCS_ONLINE;
1106
1107                 sp->mbcl_mc_cached = 0;
1108                 for (m = 0; m < ncpu; m++) {
1109                         ccp = &cp->mc_cpu[m];
1110                         if (ccp->cc_objs > 0)
1111                                 sp->mbcl_mc_cached += ccp->cc_objs;
1112                         if (ccp->cc_pobjs > 0)
1113                                 sp->mbcl_mc_cached += ccp->cc_pobjs;
1114                 }
1115                 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1116                 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1117                     sp->mbcl_infree;
1118
1119                 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1120                 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1121                 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1122
1123                 /* Calculate total count specific to each class */
1124                 sp->mbcl_ctotal = sp->mbcl_total;
1125                 switch (m_class(k)) {
1126                 case MC_MBUF:
1127                         /* Deduct mbufs used in composite caches */
1128                         sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1129                             m_total(MC_MBUF_BIGCL));
1130                         break;
1131
1132                 case MC_CL:
1133                         /* Deduct clusters used in composite cache */
1134                         sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1135                         break;
1136
1137                 case MC_BIGCL:
1138                         /* Deduct clusters used in composite cache */
1139                         sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1140                         break;
1141
1142                 case MC_16KCL:
1143                         /* Deduct clusters used in composite cache */
1144                         sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1145                         break;
1146
1147                 default:
1148                         break;
1149                 }
1150         }
1151 }
1152
1153 static int
1154 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1155 {
1156 #pragma unused(oidp, arg1, arg2)
1157         void *statp;
1158         int k, statsz, proc64 = proc_is64bit(req->p);
1159
1160         lck_mtx_lock(mbuf_mlock);
1161         mbuf_stat_sync();
1162
1163         if (!proc64) {
1164                 struct omb_class_stat *oc;
1165                 struct mb_class_stat *c;
1166
1167                 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1168                 oc = &omb_stat->mbs_class[0];
1169                 c = &mb_stat->mbs_class[0];
1170                 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1171                         (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1172                             "%s", c->mbcl_cname);
1173                         oc->mbcl_size = c->mbcl_size;
1174                         oc->mbcl_total = c->mbcl_total;
1175                         oc->mbcl_active = c->mbcl_active;
1176                         oc->mbcl_infree = c->mbcl_infree;
1177                         oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1178                         oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1179                         oc->mbcl_free_cnt = c->mbcl_free_cnt;
1180                         oc->mbcl_notified = c->mbcl_notified;
1181                         oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1182                         oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1183                         oc->mbcl_ctotal = c->mbcl_ctotal;
1184                         oc->mbcl_release_cnt = c->mbcl_release_cnt;
1185                         oc->mbcl_mc_state = c->mbcl_mc_state;
1186                         oc->mbcl_mc_cached = c->mbcl_mc_cached;
1187                         oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1188                         oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1189                         oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1190                 }
1191                 statp = omb_stat;
1192                 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1193         } else {
1194                 statp = mb_stat;
1195                 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1196         }
1197
1198         lck_mtx_unlock(mbuf_mlock);
1199
1200         return (SYSCTL_OUT(req, statp, statsz));
1201 }
1202
1203 static int
1204 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1205 {
1206 #pragma unused(oidp, arg1, arg2)
1207         int i;
1208
1209         /* Ensure leak tracing turned on */
1210         if (!mclfindleak || !mclexpleak)
1211                 return (ENXIO);
1212
1213         lck_mtx_lock(mleak_lock);
1214         mleak_update_stats();
1215         i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1216         lck_mtx_unlock(mleak_lock);
1217
1218         return (i);
1219 }
1220
1221 static int
1222 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1223 {
1224 #pragma unused(oidp, arg1, arg2)
1225         int i = 0;
1226
1227         /* Ensure leak tracing turned on */
1228         if (!mclfindleak || !mclexpleak)
1229                 return (ENXIO);
1230
1231         lck_mtx_lock(mleak_lock);
1232         i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1233         lck_mtx_unlock(mleak_lock);
1234
1235         return (i);
1236 }
1237
1238 static inline void
1239 m_incref(struct mbuf *m)
1240 {
1241         UInt16 old, new;
1242         volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1243
1244         do {
1245                 old = *addr;
1246                 new = old + 1;
1247                 ASSERT(new != 0);
1248         } while (!OSCompareAndSwap16(old, new, addr));
1249
1250         /*
1251          * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1252          * we don't clear the flag when the refcount goes back to the
1253          * minimum, to simplify code calling m_mclhasreference().
1254          */
1255         if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY))
1256                 (void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
1257 }
1258
1259 static inline u_int16_t
1260 m_decref(struct mbuf *m)
1261 {
1262         UInt16 old, new;
1263         volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1264
1265         do {
1266                 old = *addr;
1267                 new = old - 1;
1268                 ASSERT(old != 0);
1269         } while (!OSCompareAndSwap16(old, new, addr));
1270
1271         return (new);
1272 }
1273
1274 static void
1275 mbuf_table_init(void)
1276 {
1277         unsigned int b, c, s;
1278         int m, config_mbuf_jumbo = 0;
1279
1280         MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1281             M_TEMP, M_WAITOK | M_ZERO);
1282         VERIFY(omb_stat != NULL);
1283
1284         MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1285             M_TEMP, M_WAITOK | M_ZERO);
1286         VERIFY(mb_stat != NULL);
1287
1288         mb_stat->mbs_cnt = NELEM(mbuf_table);
1289         for (m = 0; m < NELEM(mbuf_table); m++)
1290                 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1291
1292 #if CONFIG_MBUF_JUMBO
1293         config_mbuf_jumbo = 1;
1294 #endif /* CONFIG_MBUF_JUMBO */
1295
1296         if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
1297                 /*
1298                  * Set aside 1/3 of the mbuf cluster map for jumbo
1299                  * clusters; we do this only on platforms where jumbo
1300                  * cluster pool is enabled.
1301                  */
1302                 njcl = nmbclusters / 3;
1303                 njclbytes = M16KCLBYTES;
1304         }
1305
1306         /*
1307          * nclusters holds both the 2KB and 4KB pools, so ensure it's
1308          * a multiple of 4KB clusters.
1309          */
1310         nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1311         if (njcl > 0) {
1312                 /*
1313                  * Each jumbo cluster takes 8 2KB clusters, so make
1314                  * sure that the pool size is evenly divisible by 8;
1315                  * njcl is in 2KB unit, hence treated as such.
1316                  */
1317                 njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1318
1319                 /* Update nclusters with rounded down value of njcl */
1320                 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1321         }
1322
1323         /*
1324          * njcl is valid only on platforms with 16KB jumbo clusters or
1325          * with 16KB pages, where it is configured to 1/3 of the pool
1326          * size.  On these platforms, the remaining is used for 2KB
1327          * and 4KB clusters.  On platforms without 16KB jumbo clusters,
1328          * the entire pool is used for both 2KB and 4KB clusters.  A 4KB
1329          * cluster can either be splitted into 16 mbufs, or into 2 2KB
1330          * clusters.
1331          *
1332          *  +---+---+------------ ... -----------+------- ... -------+
1333          *  | c | b |              s             |        njcl       |
1334          *  +---+---+------------ ... -----------+------- ... -------+
1335          *
1336          * 1/32th of the shared region is reserved for pure 2KB and 4KB
1337          * clusters (1/64th each.)
1338          */
1339         c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
1340         b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
1341         s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */
1342
1343         /*
1344          * 1/64th (c) is reserved for 2KB clusters.
1345          */
1346         m_minlimit(MC_CL) = c;
1347         m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
1348         m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1349         (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1350
1351         /*
1352          * Another 1/64th (b) of the map is reserved for 4KB clusters.
1353          * It cannot be turned into 2KB clusters or mbufs.
1354          */
1355         m_minlimit(MC_BIGCL) = b;
1356         m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
1357         m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1358         (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1359
1360         /*
1361          * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1362          */
1363         m_minlimit(MC_MBUF) = 0;
1364         m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
1365         m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1366         (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1367
1368         /*
1369          * Set limits for the composite classes.
1370          */
1371         m_minlimit(MC_MBUF_CL) = 0;
1372         m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1373         m_maxsize(MC_MBUF_CL) = MCLBYTES;
1374         m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1375         (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1376
1377         m_minlimit(MC_MBUF_BIGCL) = 0;
1378         m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1379         m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1380         m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1381         (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1382
1383         /*
1384          * And for jumbo classes.
1385          */
1386         m_minlimit(MC_16KCL) = 0;
1387         m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
1388         m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1389         (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1390
1391         m_minlimit(MC_MBUF_16KCL) = 0;
1392         m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1393         m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1394         m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1395         (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1396
1397         /*
1398          * Initialize the legacy mbstat structure.
1399          */
1400         bzero(&mbstat, sizeof (mbstat));
1401         mbstat.m_msize = m_maxsize(MC_MBUF);
1402         mbstat.m_mclbytes = m_maxsize(MC_CL);
1403         mbstat.m_minclsize = MINCLSIZE;
1404         mbstat.m_mlen = MLEN;
1405         mbstat.m_mhlen = MHLEN;
1406         mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1407 }
1408
1409 #if defined(__LP64__)
1410 /* One row of a sizing table; mbuf_default_ncl() compares mem to nt_maxmem */
1411 typedef struct ncl_tbl {
1412         uint64_t        nt_maxmem;      /* memory (sane) size */
1413         uint32_t        nt_mbpool;      /* mbuf pool size */
1414 } ncl_tbl_t;
1415
1416 /* Non-server sizing; the all-zero row terminates the table */
1417 static ncl_tbl_t ncl_table[] = {
1418         { .nt_maxmem = (1ULL << GBSHIFT),       .nt_mbpool = (64 << MBSHIFT) },  /*  1 GB:  64 MB */
1419         { .nt_maxmem = (1ULL << (GBSHIFT + 3)), .nt_mbpool = (96 << MBSHIFT) },  /*  8 GB:  96 MB */
1420         { .nt_maxmem = (1ULL << (GBSHIFT + 4)), .nt_mbpool = (128 << MBSHIFT) }, /* 16 GB: 128 MB */
1421         { .nt_maxmem = 0, .nt_mbpool = 0 }
1422 };
1423 /* Server sizing; the all-zero row terminates the table */
1424 static ncl_tbl_t ncl_table_srv[] = {
1425         { .nt_maxmem = (1ULL << GBSHIFT),       .nt_mbpool = (96 << MBSHIFT) },  /*  1 GB:  96 MB */
1426         { .nt_maxmem = (1ULL << (GBSHIFT + 2)), .nt_mbpool = (128 << MBSHIFT) }, /*  4 GB: 128 MB */
1427         { .nt_maxmem = (1ULL << (GBSHIFT + 3)), .nt_mbpool = (160 << MBSHIFT) }, /*  8 GB: 160 MB */
1428         { .nt_maxmem = (1ULL << (GBSHIFT + 4)), .nt_mbpool = (192 << MBSHIFT) }, /* 16 GB: 192 MB */
1429         { .nt_maxmem = (1ULL << (GBSHIFT + 5)), .nt_mbpool = (256 << MBSHIFT) }, /* 32 GB: 256 MB */
1430         { .nt_maxmem = (1ULL << (GBSHIFT + 6)), .nt_mbpool = (384 << MBSHIFT) }, /* 64 GB: 384 MB */
1431         { .nt_maxmem = 0, .nt_mbpool = 0 }
1432 };
1433 #endif /* __LP64__ */
1434
1435 __private_extern__ unsigned int
1436 mbuf_default_ncl(int server, uint64_t mem)
1437 {
1438         /*
1439          * Pick the default cluster count for `mem' of memory; `server'
1440          * chooses which sizing table applies on 64-bit kernels.
1441          */
1442         unsigned int count;
1443 #if !defined(__LP64__)
1444 #pragma unused(server)
1445         /* 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM) */
1446         count = ((mem / 16) / MCLBYTES);
1447         if (count > 32768) {
1448                 count = 32768;
1449         }
1450 #else
1451         /* 64-bit kernel: start at row 0, keep the last row with nt_maxmem <= mem */
1452         ncl_tbl_t *row = (server ? ncl_table_srv : ncl_table);
1453         count = row->nt_mbpool;
1454         while (row->nt_mbpool != 0 && mem >= row->nt_maxmem) {
1455                 count = row->nt_mbpool;
1456                 row++;
1457         }
1458         count >>= MCLSHIFT;     /* scale the pool size down by MCLSHIFT */
1459 #endif /* !__LP64__ */
1460         return (count);
1461 }
1462
1463 __private_extern__ void
1464 mbinit(void)
1465 {
             /*
              * One-time setup of the mbuf allocator.  In order, this routine:
              * checks at compile time that the public MBUF_* constants match
              * their private counterparts, fills the random cookie values,
              * sets up the class table via mbuf_table_init(), creates the
              * global mbuf lock, allocates the slab table, the optional audit
              * structures, the per-CPU statistics and the physical page table,
              * populates the initial MC_BIGCL freelist, starts the worker
              * thread, creates one mcache per class, scales sb_max down if it
              * exceeds 1/16 of the cluster pool, and initializes the lock for
              * the tx completion callback table.
              *
              * Takes no arguments and returns nothing.  Allocation results are
              * checked with VERIFY(); lock attribute/group failures at the end
              * call panic().
              */
1466         unsigned int m;
1467         unsigned int initmcl = 0;
1468         void *buf;
1469         thread_t thread = THREAD_NULL;
1470
1471         microuptime(&mb_start);
1472
1473         /*
1474          * These MBUF_ values must be equal to their private counterparts.
1475          */
1476         _CASSERT(MBUF_EXT == M_EXT);
1477         _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1478         _CASSERT(MBUF_EOR == M_EOR);
1479         _CASSERT(MBUF_LOOP == M_LOOP);
1480         _CASSERT(MBUF_BCAST == M_BCAST);
1481         _CASSERT(MBUF_MCAST == M_MCAST);
1482         _CASSERT(MBUF_FRAG == M_FRAG);
1483         _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1484         _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1485         _CASSERT(MBUF_PROMISC == M_PROMISC);
1486         _CASSERT(MBUF_HASFCS == M_HASFCS);
1487
1488         _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1489         _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1490         _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1491         _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1492         _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1493         _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1494         _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1495         _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1496         _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1497         _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1498         _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1499         _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1500         _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1501         _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1502         _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1503
1504         _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1505         _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1506         _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1507         _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1508         _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1509         _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1510         _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1511         _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1512         _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1513         _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1514         _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1515         _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1516         _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1517         _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1518
1519         _CASSERT(MBUF_WAITOK == M_WAIT);
1520         _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1521         _CASSERT(MBUF_COPYALL == M_COPYALL);
1522
1523         _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1524         _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1525         _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1526         _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1527         _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1528         _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1529         _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1530         _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1531         _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1532         _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1533
1534         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1535         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1536         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1537         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1538
1539         /* Module specific scratch space (32-bit alignment requirement) */
1540         _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1541             sizeof (uint32_t)));
1542
1543         /* Initialize random red zone cookie value */
1544         _CASSERT(sizeof (mb_redzone_cookie) ==
1545             sizeof (((struct pkthdr *)0)->redzone));
1546         read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1547         read_random(&mb_obscure_extref, sizeof (mb_obscure_extref));
1548         read_random(&mb_obscure_extfree, sizeof (mb_obscure_extfree));
             /*
              * NOTE(review): the two low bits of both obscuring values are
              * forced on; the reason is not visible here -- confirm.
              */
1549         mb_obscure_extref |= 0x3;
1550         mb_obscure_extfree |= 0x3;
1551
1552         /* Make sure we don't save more than we should */
1553         _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1554
             /* Fall back to NMBCLUSTERS if no cluster count has been set yet */
1555         if (nmbclusters == 0)
1556                 nmbclusters = NMBCLUSTERS;
1557
1558         /* This should be a sane (at least even) value by now */
1559         VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1560
1561         /* Setup the mbuf table */
1562         mbuf_table_init();
1563
1564         /* Global lock for common layer */
1565         mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1566         mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1567         mbuf_mlock_attr = lck_attr_alloc_init();
1568         lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1569
1570         /*
1571          * Allocate cluster slabs table:
1572          *
1573          *      maxslabgrp = (N * 2048) / (1024 * 1024)
1574          *
1575          * Where N is nmbclusters rounded up to the nearest 512.  This yields
1576          * mcl_slab_g_t units, each one representing a MB of memory.
1577          */
1578         maxslabgrp =
1579             (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1580         MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1581             M_TEMP, M_WAITOK | M_ZERO);
1582         VERIFY(slabstbl != NULL);
1583
1584         /*
1585          * Allocate audit structures, if needed:
1586          *
1587          *      maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1588          *
1589          * This yields mcl_audit_t units, each one representing a page.
1590          */
             /* Debug flags: the "mbuf_debug" boot-arg OR'ed with the mcache flags */
1591         PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1592         mbuf_debug |= mcache_getflags();
1593         if (mbuf_debug & MCF_DEBUG) {
1594                 int l;
1595                 mcl_audit_t *mclad;
1596                 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1597                 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1598                     M_TEMP, M_WAITOK | M_ZERO);
1599                 VERIFY(mclaudit != NULL);
                     /* Each entry gets its own array of NMBPG audit pointers */
1600                 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1601                         MALLOC(mclad[l].cl_audit, mcache_audit_t **,
1602                             NMBPG * sizeof(mcache_audit_t *),
1603                             M_TEMP, M_WAITOK | M_ZERO);
1604                         VERIFY(mclad[l].cl_audit != NULL);
1605                 }
1606
1607                 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1608                     AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1609                 VERIFY(mcl_audit_con_cache != NULL);
1610         }
             /* Derive the individual debug switches from the combined flag word */
1611         mclverify = (mbuf_debug & MCF_VERIFY);
1612         mcltrace = (mbuf_debug & MCF_TRACE);
1613         mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1614         mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1615
1616         /* Enable mbuf leak logging, with a lock to protect the tables */
1617
1618         mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1619         mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1620         mleak_lock_attr = lck_attr_alloc_init();
1621         lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1622
1623         mleak_activate();
1624
1625         /*
1626          * Allocate structure for per-CPU statistics that's aligned
1627          * on the CPU cache boundary; this code assumes that we never
1628          * uninitialize this framework, since the original address
1629          * before alignment is not saved.
1630          */
1631         ncpu = ml_get_max_cpus();
             /*
              * Over-allocate by CPU_CACHE_LINE_SIZE so that the rounded-up
              * pointer below still has MBUF_MTYPES_SIZE(ncpu) bytes behind it.
              */
1632         MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1633             M_TEMP, M_WAITOK);
1634         VERIFY(buf != NULL);
1635
1636         mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1637             CPU_CACHE_LINE_SIZE);
1638         bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1639
1640         /* Calculate the number of pages assigned to the cluster pool */
1641         mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1642         MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1643             M_TEMP, M_WAITOK);
1644         VERIFY(mcl_paddr != NULL);
1645
1646         /* Register with the I/O Bus mapper */
1647         mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1648         bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1649
             /*
              * embutl is mbutl plus nmbclusters * MCLBYTES; the span between the
              * two must be a whole number of MBIGCLBYTES-sized units.
              */
1650         embutl = (mbutl + (nmbclusters * MCLBYTES));
1651         VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1652
1653         /* Prime up the freelist */
             /*
              * The optional "initmcl" boot-arg supplies the initial count.  A
              * non-zero value is shifted right by NCLPBGSHIFT and capped at the
              * MC_BIGCL maximum; in every case the result is raised to at least
              * the MC_BIGCL minimum.
              */
1654         PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1655         if (initmcl != 0) {
1656                 initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
1657                 if (initmcl > m_maxlimit(MC_BIGCL))
1658                         initmcl = m_maxlimit(MC_BIGCL);
1659         }
1660         if (initmcl < m_minlimit(MC_BIGCL))
1661                 initmcl = m_minlimit(MC_BIGCL);
1662
1663         lck_mtx_lock(mbuf_mlock);
1664
1665         /*
1666          * For classes with non-zero minimum limits, populate their freelists
1667          * so that m_total(class) is at least m_minlimit(class).
1668          */
1669         VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1670         freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1671         VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1672         freelist_init(m_class(MC_CL));
1673
1674         for (m = 0; m < NELEM(mbuf_table); m++) {
1675                 /* Make sure we didn't miss any */
1676                 VERIFY(m_minlimit(m_class(m)) == 0 ||
1677                     m_total(m_class(m)) >= m_minlimit(m_class(m)));
1678
1679                 /* populate the initial sizes and report from there on */
1680                 m_peak(m_class(m)) = m_total(m_class(m));
1681         }
1682         mb_peak_newreport = FALSE;
1683
1684         lck_mtx_unlock(mbuf_mlock);
1685
             /*
              * Start the worker thread and immediately drop the thread handle
              * stored in `thread'.  NOTE(review): the return value of
              * kernel_thread_start() is ignored -- confirm this is intended.
              */
1686         (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1687             NULL, &thread);
1688         thread_deallocate(thread);
1689
             /*
              * NOTE(review): unlike mcl_audit_con_cache above, the result of
              * this mcache_create() is not checked with VERIFY().
              */
1690         ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1691             0, 0, MCR_SLEEP);
1692
1693         /* Create the cache for each class */
1694         for (m = 0; m < NELEM(mbuf_table); m++) {
1695                 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1696                 u_int32_t flags;
1697
1698                 flags = mbuf_debug;
                     /*
                      * The three mbuf+cluster classes use the mbuf_cslab_*
                      * callbacks; every other class uses mbuf_slab_*.  The
                      * logging callback is the same in both cases.
                      */
1699                 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1700                     m_class(m) == MC_MBUF_16KCL) {
1701                         allocfunc = mbuf_cslab_alloc;
1702                         freefunc = mbuf_cslab_free;
1703                         auditfunc = mbuf_cslab_audit;
1704                         logfunc = mleak_logger;
1705                 } else {
1706                         allocfunc = mbuf_slab_alloc;
1707                         freefunc = mbuf_slab_free;
1708                         auditfunc = mbuf_slab_audit;
1709                         logfunc = mleak_logger;
1710                 }
1711
1712                 /*
1713                  * Disable per-CPU caches for jumbo classes if there
1714                  * is no jumbo cluster pool available in the system.
1715                  * The cache itself is still created (but will never
1716                  * be populated) since it simplifies the code.
1717                  */
1718                 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1719                     njcl == 0)
1720                         flags |= MCF_NOCPUCACHE;
1721
1722                 if (!mclfindleak)
1723                         flags |= MCF_NOLEAKLOG;
1724
                     /* The table index m is handed to the cache as its opaque argument */
1725                 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1726                     allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1727                     (void *)(uintptr_t)m, flags, MCR_SLEEP);
1728         }
1729
1730         /*
1731          * Set the max limit on sb_max to be 1/16th of the size of
1732          * memory allocated for mbuf clusters.
1733          */
1734         high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1735         if (high_sb_max < sb_max) {
1736                 /* sb_max is too large for this configuration, scale it down */
1737                 if (high_sb_max > (1 << MBSHIFT)) {
1738                         /* We have at least 16 MB of mbuf pool */
1739                         sb_max = high_sb_max;
1740                 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1741                         /*
1742                          * If we have more than 1 MB of mbuf pool, cap the
1743                          * size of the max sock buf at 1 MB
1744                          */
1745                         sb_max = high_sb_max = (1 << MBSHIFT);
1746                 } else {
1747                         sb_max = high_sb_max;
1748                 }
1749         }
1750
1751         /* allocate space for mbuf_dump_buf */
1752         MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1753         VERIFY(mbuf_dump_buf != NULL);
1754
1755         if (mbuf_debug & MCF_DEBUG) {
1756                 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1757                     (int)_MLEN, (int)_MHLEN);
1758         }
1759
             /* Report sizes in MB: the whole pool, then the nclusters and njcl parts */
1760         printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1761             (nmbclusters << MCLSHIFT) >> MBSHIFT,
1762             (nclusters << MCLSHIFT) >> MBSHIFT,
1763             (njcl << MCLSHIFT) >> MBSHIFT);
1764
1765         /* initialize the lock for the tx completion callback table */
1766         mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init();
1767         if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) {
1768                 panic("%s: lck_grp_attr_alloc_init failed", __func__);
1769                 /* NOTREACHED */
1770         }
1771         mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl",
1772             mbuf_tx_compl_tbl_lck_grp_attr);
1773         if (mbuf_tx_compl_tbl_lck_grp == NULL) {
1774                 panic("%s: lck_grp_alloc_init failed", __func__);
1775                 /* NOTREACHED */
1776         }
1777         mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init();
1778         if (mbuf_tx_compl_tbl_lck_attr == NULL) {
1779                 panic("%s: lck_attr_alloc_init failed", __func__);
1780                 /* NOTREACHED */
1781         }
1782         lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp,
1783             mbuf_tx_compl_tbl_lck_attr);
1784
1785 }
1786
1787 /*
1788  * Obtain a slab of object(s) from the class's freelist.
      *
      * Detaches the first free object from a slab on the slab list of `class'
      * and returns it, or returns NULL if that list is empty.  The slab's
      * reference count is incremented, and the slab is removed from the list
      * once it has no free objects left.  For MC_16KCL the remaining
      * NSLABSP16KB - 1 slabs that follow the first one are referenced too.
      *
      * class: expected to be MC_MBUF, MC_CL, MC_BIGCL or MC_16KCL; any other
      *        value ends up in the final else branch and trips its VERIFY.
      * wait:  only the MCR_COMP bit is examined.  When it is set for MC_MBUF,
      *        MC_CL or MC_BIGCL, the slab is taken from the tail of the list
      *        instead of the head.
      *
      * The caller must hold mbuf_mlock.  The returned object's obj_next field
      * is not cleared here.
1789  */
1790 static mcache_obj_t *
1791 slab_alloc(mbuf_class_t class, int wait)
1792 {
1793         mcl_slab_t *sp;
1794         mcache_obj_t *buf;
1795
1796         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1797
1798         /* This should always be NULL for us */
1799         VERIFY(m_cobjlist(class) == NULL);
1800
1801         /*
1802          * Treat composite objects as having longer lifespan by using
1803          * a slab from the reverse direction, in the hope that this could
1804          * reduce the probability of fragmentation for slabs that hold
1805          * more than one buffer chunk (e.g. mbuf slabs).  For other
1806          * slabs, this probably doesn't make much of a difference.
1807          */
1808         if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
1809             && (wait & MCR_COMP))
1810                 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1811         else
1812                 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1813
1814         if (sp == NULL) {
1815                 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1816                 /* The slab list for this class is empty */
1817                 return (NULL);
1818         }
1819
1820         VERIFY(m_infree(class) > 0);
1821         VERIFY(!slab_is_detached(sp));
1822         VERIFY(sp->sl_class == class &&
1823             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
             /* Take the first free object off the slab's chain */
1824         buf = sp->sl_head;
1825         VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1826         sp->sl_head = buf->obj_next;
1827         /* Increment slab reference */
1828         sp->sl_refcnt++;
1829
             /* An exhausted chain means every chunk of the slab is now referenced */
1830         VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
1831
1832         if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1833                 slab_nextptr_panic(sp, sp->sl_head);
1834                 /* In case sl_head is in the map but not in the slab */
1835                 VERIFY(slab_inrange(sp, sp->sl_head));
1836                 /* NOTREACHED */
1837         }
1838
             /* With auditing on, reset the user flags of this buffer's audit record */
1839         if (mclaudit != NULL) {
1840                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1841                 mca->mca_uflags = 0;
1842                 /* Save contents on mbuf objects only */
1843                 if (class == MC_MBUF)
1844                         mca->mca_uflags |= MB_SCVALID;
1845         }
1846
             /* Per-class accounting of the free count, plus sanity checks */
1847         if (class == MC_CL) {
                     /* m_clfree is the free count of MC_CL plus that of MC_MBUF_CL */
1848                 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1849                 /*
1850                  * A 2K cluster slab can have at most NCLPG references.
1851                  */
1852                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
1853                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1854                 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
1855         } else if (class == MC_BIGCL) {
                     /* m_bigclfree is the free count of MC_BIGCL plus MC_MBUF_BIGCL */
1856                 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1857                     m_infree(MC_MBUF_BIGCL);
1858                 /*
1859                  * A 4K cluster slab can have NBCLPG references.
1860                  */
1861                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
1862                     sp->sl_len == PAGE_SIZE &&
1863                     (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
1864         } else if (class == MC_16KCL) {
1865                 mcl_slab_t *nsp;
1866                 int k;
1867
1868                 --m_infree(MC_16KCL);
1869                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1870                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1871                 /*
1872                  * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1873                  * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1874                  * most 1 reference.
1875                  */
1876                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1877                         nsp = nsp->sl_next;
1878                         /* Next slab must already be present */
1879                         VERIFY(nsp != NULL);
1880                         nsp->sl_refcnt++;
1881                         VERIFY(!slab_is_detached(nsp));
1882                         VERIFY(nsp->sl_class == MC_16KCL &&
1883                             nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1884                             nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1885                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1886                             nsp->sl_head == NULL);
1887                 }
1888         } else {
1889                 VERIFY(class == MC_MBUF);
1890                 --m_infree(MC_MBUF);
1891                 /*
1892                  * If auditing is turned on, this check is
1893                  * deferred until later in mbuf_slab_audit().
1894                  */
1895                 if (mclaudit == NULL)
1896                         _MCHECK((struct mbuf *)buf);
1897                 /*
1898                  * Since we have incremented the reference count above,
1899                  * an mbuf slab (formerly a 4KB cluster slab that was cut
1900                  * up into mbufs) must have a reference count between 1
1901                  * and NMBPG at this point.
1902                  */
1903                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
1904                     sp->sl_chunks == NMBPG &&
1905                     sp->sl_len == PAGE_SIZE);
1906                 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
1907         }
1908
1909         /* If empty, remove this slab from the class's freelist */
1910         if (sp->sl_head == NULL) {
1911                 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
1912                 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
1913                 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
1914                 slab_remove(sp, class);
1915         }
1916
1917         return (buf);
1918 }
1919
1920 /*
1921  * Place a slab of object(s) back into a class's slab list.
1922  */
1923 static void
1924 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1925 {
1926         mcl_slab_t *sp;
1927         boolean_t reinit_supercl = false;
1928         mbuf_class_t super_class;
1929
1930         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1931
1932         VERIFY(class != MC_16KCL || njcl > 0);
1933         VERIFY(buf->obj_next == NULL);
1934
1935         /*
1936          * Synchronizing with m_clalloc, as it reads m_total, while we here
1937          * are modifying m_total.
1938          */
1939         while (mb_clalloc_busy) {
1940                 mb_clalloc_waiters++;
1941                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
1942                     (PZERO-1), "m_clalloc", NULL);
1943                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1944         }
1945
1946         /* We are busy now; tell everyone else to go away */
1947         mb_clalloc_busy = TRUE;
1948
1949         sp = slab_get(buf);
1950         VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1951             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1952
1953         /* Decrement slab reference */
1954         sp->sl_refcnt--;
1955
1956         if (class == MC_CL) {
1957                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1958                 /*
1959                  * A slab that has been split for 2KB clusters can have
1960                  * at most NCLPG - 1 outstanding references at this point.
1961                  */
1962                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
1963                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1964                 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
1965                     (slab_is_detached(sp) && sp->sl_head == NULL));
1966         } else if (class == MC_BIGCL) {
1967                 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1968
1969                 /* A 4KB cluster slab can have NBCLPG references at most */
1970                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
1971                 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
1972                     (slab_is_detached(sp) && sp->sl_head == NULL));
1973         } else if (class == MC_16KCL) {
1974                 mcl_slab_t *nsp;
1975                 int k;
1976                 /*
1977                  * A 16KB cluster takes NSLABSP16KB slabs, all must
1978                  * now have 0 reference.
1979                  */
1980                 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1981                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1982                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1983                 VERIFY(slab_is_detached(sp));
1984                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1985                         nsp = nsp->sl_next;
1986                         /* Next slab must already be present */
1987                         VERIFY(nsp != NULL);
1988                         nsp->sl_refcnt--;
1989                         VERIFY(slab_is_detached(nsp));
1990                         VERIFY(nsp->sl_class == MC_16KCL &&
1991                             (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1992                             nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1993                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1994                             nsp->sl_head == NULL);
1995                 }
1996         } else {
1997                 /*
1998                  * A slab that has been split for mbufs has at most
1999                  * NMBPG reference counts.  Since we have decremented
2000                  * one reference above, it must now be between 0 and
2001                  * NMBPG-1.
2002                  */
2003                 VERIFY(class == MC_MBUF);
2004                 VERIFY(sp->sl_refcnt >= 0 &&
2005                     sp->sl_refcnt <= (NMBPG - 1) &&
2006                     sp->sl_chunks == NMBPG &&
2007                     sp->sl_len == PAGE_SIZE);
2008                 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
2009                     (slab_is_detached(sp) && sp->sl_head == NULL));
2010         }
2011
2012         /*
2013          * When auditing is enabled, ensure that the buffer still
2014          * contains the free pattern.  Otherwise it got corrupted
2015          * while at the CPU cache layer.
2016          */
2017         if (mclaudit != NULL) {
2018                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2019                 if (mclverify) {
2020                         mcache_audit_free_verify(mca, buf, 0,
2021                             m_maxsize(class));
2022                 }
2023                 mca->mca_uflags &= ~MB_SCVALID;
2024         }
2025
2026         if (class == MC_CL) {
2027                 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2028                 buf->obj_next = sp->sl_head;
2029         } else if (class == MC_BIGCL) {
2030                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2031                     m_infree(MC_MBUF_BIGCL);
2032                 buf->obj_next = sp->sl_head;
2033         } else if (class == MC_16KCL) {
2034                 ++m_infree(MC_16KCL);
2035         } else {
2036                 ++m_infree(MC_MBUF);
2037                 buf->obj_next = sp->sl_head;
2038         }
2039         sp->sl_head = buf;
2040
2041         /*
2042          * If a slab has been split to either one which holds 2KB clusters,
2043          * or one which holds mbufs, turn it back to one which holds a
2044          * 4 or 16 KB cluster depending on the page size.
2045          */
2046         if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2047                 super_class = MC_BIGCL;
2048         } else {
2049                 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2050                 super_class = MC_16KCL;
2051         }
2052         if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2053             m_total(class) >= (m_minlimit(class) + NMBPG) &&
2054             m_total(super_class) < m_maxlimit(super_class)) {
2055                 int i = NMBPG;
2056
2057                 m_total(MC_MBUF) -= NMBPG;
2058                 mbstat.m_mbufs = m_total(MC_MBUF);
2059                 m_infree(MC_MBUF) -= NMBPG;
2060                 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2061
2062                 while (i--) {
2063                         struct mbuf *m = sp->sl_head;
2064                         VERIFY(m != NULL);
2065                         sp->sl_head = m->m_next;
2066                         m->m_next = NULL;
2067                 }
2068                 reinit_supercl = true;
2069         } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2070             m_total(class) >=  (m_minlimit(class) + NCLPG) &&
2071             m_total(super_class) < m_maxlimit(super_class)) {
2072                 int i = NCLPG;
2073
2074                 m_total(MC_CL) -= NCLPG;
2075                 mbstat.m_clusters = m_total(MC_CL);
2076                 m_infree(MC_CL) -= NCLPG;
2077
2078                 while (i--) {
2079                         union mcluster *c = sp->sl_head;
2080                         VERIFY(c != NULL);
2081                         sp->sl_head = c->mcl_next;
2082                         c->mcl_next = NULL;
2083                 }
2084                 reinit_supercl = true;
2085         } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2086             sp->sl_refcnt == 0 &&
2087             m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2088             m_total(super_class) < m_maxlimit(super_class)) {
2089                 int i = NBCLPG;
2090
2091                 VERIFY(super_class == MC_16KCL);
2092                 m_total(MC_BIGCL) -= NBCLPG;
2093                 mbstat.m_bigclusters = m_total(MC_BIGCL);
2094                 m_infree(MC_BIGCL) -= NBCLPG;
2095
2096                 while (i--) {
2097                         union mbigcluster *bc = sp->sl_head;
2098                         VERIFY(bc != NULL);
2099                         sp->sl_head = bc->mbc_next;
2100                         bc->mbc_next = NULL;
2101                 }
2102                 reinit_supercl = true;
2103         }
2104
2105         if (reinit_supercl) {
2106                 VERIFY(sp->sl_head == NULL);
2107                 VERIFY(m_total(class) >= m_minlimit(class));
2108                 slab_remove(sp, class);
2109
2110                 /* Reinitialize it as a cluster for the super class */
2111                 m_total(super_class)++;
2112                 m_infree(super_class)++;
2113                 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2114                     sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2115
2116                 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2117                     sp->sl_base, PAGE_SIZE, 0, 1);
2118                 if (mclverify)
2119                         mcache_set_pattern(MCACHE_FREE_PATTERN,
2120                             (caddr_t)sp->sl_base, sp->sl_len);
2121                 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2122
2123                 if (super_class == MC_BIGCL) {
2124                         mbstat.m_bigclusters = m_total(MC_BIGCL);
2125                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2126                             m_infree(MC_MBUF_BIGCL);
2127                 }
2128
2129                 VERIFY(slab_is_detached(sp));
2130                 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2131
2132                 /* And finally switch class */
2133                 class = super_class;
2134         }
2135
2136         /* Reinsert the slab to the class's slab list */
2137         if (slab_is_detached(sp))
2138                 slab_insert(sp, class);
2139
2140         /* We're done; let others enter */
2141         mb_clalloc_busy = FALSE;
2142         if (mb_clalloc_waiters > 0) {
2143                 mb_clalloc_waiters = 0;
2144                 wakeup(mb_clalloc_waitchan);
2145         }
2146 }
2147
2148 /*
2149  * Common allocator for rudimentary objects called by the CPU cache layer
2150  * during an allocation request whenever there is no available element in the
2151  * bucket layer.  It returns one or more elements from the appropriate global
2152  * freelist.  If the freelist is empty, it will attempt to populate it and
2153  * retry the allocation.
2154  */
2155 static unsigned int
2156 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2157 {
2158         mbuf_class_t class = (mbuf_class_t)arg;
2159         unsigned int need = num;
2160         mcache_obj_t **list = *plist;
2161
2162         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2163         ASSERT(need > 0);
2164
2165         lck_mtx_lock(mbuf_mlock);
2166
2167         for (;;) {
2168                 if ((*list = slab_alloc(class, wait)) != NULL) {
2169                         (*list)->obj_next = NULL;
2170                         list = *plist = &(*list)->obj_next;
2171
2172                         if (--need == 0) {
2173                                 /*
2174                                  * If the number of elements in freelist has
2175                                  * dropped below low watermark, asynchronously
2176                                  * populate the freelist now rather than doing
2177                                  * it later when we run out of elements.
2178                                  */
2179                                 if (!mbuf_cached_above(class, wait) &&
2180                                     m_infree(class) < (m_total(class) >> 5)) {
2181                                         (void) freelist_populate(class, 1,
2182                                             M_DONTWAIT);
2183                                 }
2184                                 break;
2185                         }
2186                 } else {
2187                         VERIFY(m_infree(class) == 0 || class == MC_CL);
2188
2189                         (void) freelist_populate(class, 1,
2190                             (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2191
2192                         if (m_infree(class) > 0)
2193                                 continue;
2194
2195                         /* Check if there's anything at the cache layer */
2196                         if (mbuf_cached_above(class, wait))
2197                                 break;
2198
2199                         /* watchdog checkpoint */
2200                         mbuf_watchdog();
2201
2202                         /* We have nothing and cannot block; give up */
2203                         if (wait & MCR_NOSLEEP) {
2204                                 if (!(wait & MCR_TRYHARD)) {
2205                                         m_fail_cnt(class)++;
2206                                         mbstat.m_drops++;
2207                                         break;
2208                                 }
2209                         }
2210
2211                         /*
2212                          * If the freelist is still empty and the caller is
2213                          * willing to be blocked, sleep on the wait channel
2214                          * until an element is available.  Otherwise, if
2215                          * MCR_TRYHARD is set, do our best to satisfy the
2216                          * request without having to go to sleep.
2217                          */
2218                         if (mbuf_worker_ready &&
2219                             mbuf_sleep(class, need, wait))
2220                                 break;
2221
2222                         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2223                 }
2224         }
2225
2226         m_alloc_cnt(class) += num - need;
2227         lck_mtx_unlock(mbuf_mlock);
2228
2229         return (num - need);
2230 }
2231
2232 /*
2233  * Common de-allocator for rudimentary objects called by the CPU cache
2234  * layer when one or more elements need to be returned to the appropriate
2235  * global freelist.
2236  */
2237 static void
2238 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2239 {
2240         mbuf_class_t class = (mbuf_class_t)arg;
2241         mcache_obj_t *nlist;
2242         unsigned int num = 0;
2243         int w;
2244
2245         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2246
2247         lck_mtx_lock(mbuf_mlock);
2248
2249         for (;;) {
2250                 nlist = list->obj_next;
2251                 list->obj_next = NULL;
2252                 slab_free(class, list);
2253                 ++num;
2254                 if ((list = nlist) == NULL)
2255                         break;
2256         }
2257         m_free_cnt(class) += num;
2258
2259         if ((w = mb_waiters) > 0)
2260                 mb_waiters = 0;
2261
2262         lck_mtx_unlock(mbuf_mlock);
2263
2264         if (w != 0)
2265                 wakeup(mb_waitchan);
2266 }
2267
2268 /*
2269  * Common auditor for rudimentary objects called by the CPU cache layer
2270  * during an allocation or free request.  For the former, this is called
2271  * after the objects are obtained from either the bucket or slab layer
2272  * and before they are returned to the caller.  For the latter, this is
2273  * called immediately during free and before placing the objects into
2274  * the bucket or slab layer.
2275  */
2276 static void
2277 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2278 {
2279         mbuf_class_t class = (mbuf_class_t)arg;
2280         mcache_audit_t *mca;
2281
2282         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2283
2284         while (list != NULL) {
2285                 lck_mtx_lock(mbuf_mlock);
2286                 mca = mcl_audit_buf2mca(class, list);
2287
2288                 /* Do the sanity checks */
2289                 if (class == MC_MBUF) {
2290                         mcl_audit_mbuf(mca, list, FALSE, alloc);
2291                         ASSERT(mca->mca_uflags & MB_SCVALID);
2292                 } else {
2293                         mcl_audit_cluster(mca, list, m_maxsize(class),
2294                             alloc, TRUE);
2295                         ASSERT(!(mca->mca_uflags & MB_SCVALID));
2296                 }
2297                 /* Record this transaction */
2298                 if (mcltrace)
2299                         mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2300
2301                 if (alloc)
2302                         mca->mca_uflags |= MB_INUSE;
2303                 else
2304                         mca->mca_uflags &= ~MB_INUSE;
2305                 /* Unpair the object (unconditionally) */
2306                 mca->mca_uptr = NULL;
2307                 lck_mtx_unlock(mbuf_mlock);
2308
2309                 list = list->obj_next;
2310         }
2311 }
2312
2313 /*
2314  * Common notify routine for all caches.  It is called by mcache when
2315  * one or more objects get freed.  We use this indication to trigger
2316  * the wakeup of any sleeping threads so that they can retry their
2317  * allocation requests.
2318  */
2319 static void
2320 mbuf_slab_notify(void *arg, u_int32_t reason)
2321 {
2322         mbuf_class_t class = (mbuf_class_t)arg;
2323         int w;
2324
2325         ASSERT(MBUF_CLASS_VALID(class));
2326
2327         if (reason != MCN_RETRYALLOC)
2328                 return;
2329
2330         lck_mtx_lock(mbuf_mlock);
2331         if ((w = mb_waiters) > 0) {
2332                 m_notified(class)++;
2333                 mb_waiters = 0;
2334         }
2335         lck_mtx_unlock(mbuf_mlock);
2336
2337         if (w != 0)
2338                 wakeup(mb_waitchan);
2339 }
2340
2341 /*
2342  * Obtain object(s) from the composite class's freelist.
2343  */
2344 static unsigned int
2345 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2346 {
2347         unsigned int need = num;
2348         mcl_slab_t *sp, *clsp, *nsp;
2349         struct mbuf *m;
2350         mcache_obj_t **list = *plist;
2351         void *cl;
2352
2353         VERIFY(need > 0);
2354         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2355         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2356
2357         /* Get what we can from the freelist */
2358         while ((*list = m_cobjlist(class)) != NULL) {
2359                 MRANGE(*list);
2360
2361                 m = (struct mbuf *)*list;
2362                 sp = slab_get(m);
2363                 cl = m->m_ext.ext_buf;
2364                 clsp = slab_get(cl);
2365                 VERIFY(m->m_flags == M_EXT && cl != NULL);
2366                 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
2367
2368                 if (class == MC_MBUF_CL) {
2369                         VERIFY(clsp->sl_refcnt >= 1 &&
2370                             clsp->sl_refcnt <= NCLPG);
2371                 } else {
2372                         VERIFY(clsp->sl_refcnt >= 1 &&
2373                             clsp->sl_refcnt <= NBCLPG);
2374                 }
2375
2376                 if (class == MC_MBUF_16KCL) {
2377                         int k;
2378                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2379                                 nsp = nsp->sl_next;
2380                                 /* Next slab must already be present */
2381                                 VERIFY(nsp != NULL);
2382                                 VERIFY(nsp->sl_refcnt == 1);
2383                         }
2384                 }
2385
2386                 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2387                     !MBUF_IN_MAP(m_cobjlist(class))) {
2388                         slab_nextptr_panic(sp, m_cobjlist(class));
2389                         /* NOTREACHED */
2390                 }
2391                 (*list)->obj_next = NULL;
2392                 list = *plist = &(*list)->obj_next;
2393
2394                 if (--need == 0)
2395                         break;
2396         }
2397         m_infree(class) -= (num - need);
2398
2399         return (num - need);
2400 }
2401
2402 /*
2403  * Place object(s) back into a composite class's freelist.
2404  */
2405 static unsigned int
2406 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2407 {
2408         mcache_obj_t *o, *tail;
2409         unsigned int num = 0;
2410         struct mbuf *m, *ms;
2411         mcache_audit_t *mca = NULL;
2412         mcache_obj_t *ref_list = NULL;
2413         mcl_slab_t *clsp, *nsp;
2414         void *cl;
2415         mbuf_class_t cl_class;
2416
2417         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2418         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2419         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2420
2421         if (class == MC_MBUF_CL) {
2422                 cl_class = MC_CL;
2423         } else if (class == MC_MBUF_BIGCL) {
2424                 cl_class = MC_BIGCL;
2425         } else {
2426                 VERIFY(class == MC_MBUF_16KCL);
2427                 cl_class = MC_16KCL;
2428         }
2429
2430         o = tail = list;
2431
2432         while ((m = ms = (struct mbuf *)o) != NULL) {
2433                 mcache_obj_t *rfa, *nexto = o->obj_next;
2434
2435                 /* Do the mbuf sanity checks */
2436                 if (mclaudit != NULL) {
2437                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2438                         if (mclverify) {
2439                                 mcache_audit_free_verify(mca, m, 0,
2440                                     m_maxsize(MC_MBUF));
2441                         }
2442                         ms = MCA_SAVED_MBUF_PTR(mca);
2443                 }
2444
2445                 /* Do the cluster sanity checks */
2446                 cl = ms->m_ext.ext_buf;
2447                 clsp = slab_get(cl);
2448                 if (mclverify) {
2449                         size_t size = m_maxsize(cl_class);
2450                         mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2451                             (mcache_obj_t *)cl), cl, 0, size);
2452                 }
2453                 VERIFY(ms->m_type == MT_FREE);
2454                 VERIFY(ms->m_flags == M_EXT);
2455                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2456                 if (cl_class == MC_CL) {
2457                         VERIFY(clsp->sl_refcnt >= 1 &&
2458                             clsp->sl_refcnt <= NCLPG);
2459                 } else {
2460                         VERIFY(clsp->sl_refcnt >= 1 &&
2461                             clsp->sl_refcnt <= NBCLPG);
2462                 }
2463                 if (cl_class == MC_16KCL) {
2464                         int k;
2465                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2466                                 nsp = nsp->sl_next;
2467                                 /* Next slab must already be present */
2468                                 VERIFY(nsp != NULL);
2469                                 VERIFY(nsp->sl_refcnt == 1);
2470                         }
2471                 }
2472
2473                 /*
2474                  * If we're asked to purge, restore the actual mbuf using
2475                  * contents of the shadow structure (if auditing is enabled)
2476                  * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2477                  * about to free it and the attached cluster into their caches.
2478                  */
2479                 if (purged) {
2480                         /* Restore constructed mbuf fields */
2481                         if (mclaudit != NULL)
2482                                 mcl_audit_restore_mbuf(m, mca, TRUE);
2483
2484                         MEXT_MINREF(m) = 0;
2485                         MEXT_REF(m) = 0;
2486                         MEXT_PREF(m) = 0;
2487                         MEXT_FLAGS(m) = 0;
2488                         MEXT_PRIV(m) = 0;
2489                         MEXT_PMBUF(m) = NULL;
2490                         MEXT_TOKEN(m) = 0;
2491
2492                         rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
2493                         m_set_ext(m, NULL, NULL, NULL);
2494                         rfa->obj_next = ref_list;
2495                         ref_list = rfa;
2496
2497                         m->m_type = MT_FREE;
2498                         m->m_flags = m->m_len = 0;
2499                         m->m_next = m->m_nextpkt = NULL;
2500
2501                         /* Save mbuf fields and make auditing happy */
2502                         if (mclaudit != NULL)
2503                                 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2504
2505                         VERIFY(m_total(class) > 0);
2506                         m_total(class)--;
2507
2508                         /* Free the mbuf */
2509                         o->obj_next = NULL;
2510                         slab_free(MC_MBUF, o);
2511
2512                         /* And free the cluster */
2513                         ((mcache_obj_t *)cl)->obj_next = NULL;
2514                         if (class == MC_MBUF_CL)
2515                                 slab_free(MC_CL, cl);
2516                         else if (class == MC_MBUF_BIGCL)
2517                                 slab_free(MC_BIGCL, cl);
2518                         else
2519                                 slab_free(MC_16KCL, cl);
2520                 }
2521
2522                 ++num;
2523                 tail = o;
2524                 o = nexto;
2525         }
2526
2527         if (!purged) {
2528                 tail->obj_next = m_cobjlist(class);
2529                 m_cobjlist(class) = list;
2530                 m_infree(class) += num;
2531         } else if (ref_list != NULL) {
2532                 mcache_free_ext(ref_cache, ref_list);
2533         }
2534
2535         return (num);
2536 }
2537
2538 /*
2539  * Common allocator for composite objects called by the CPU cache layer
2540  * during an allocation request whenever there is no available element in
2541  * the bucket layer.  It returns one or more composite elements from the
2542  * appropriate global freelist.  If the freelist is empty, it will attempt
2543  * to obtain the rudimentary objects from their caches and construct them
2544  * into composite mbuf + cluster objects.
2545  */
2546 static unsigned int
2547 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2548     int wait)
2549 {
2550         mbuf_class_t class = (mbuf_class_t)arg;
2551         mbuf_class_t cl_class = 0;
2552         unsigned int num = 0, cnum = 0, want = needed;
2553         mcache_obj_t *ref_list = NULL;
2554         mcache_obj_t *mp_list = NULL;
2555         mcache_obj_t *clp_list = NULL;
2556         mcache_obj_t **list;
2557         struct ext_ref *rfa;
2558         struct mbuf *m;
2559         void *cl;
2560
2561         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2562         ASSERT(needed > 0);
2563
2564         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2565
2566         /* There should not be any slab for this class */
2567         VERIFY(m_slab_cnt(class) == 0 &&
2568             m_slablist(class).tqh_first == NULL &&
2569             m_slablist(class).tqh_last == NULL);
2570
2571         lck_mtx_lock(mbuf_mlock);
2572
2573         /* Try using the freelist first */
2574         num = cslab_alloc(class, plist, needed);
2575         list = *plist;
2576         if (num == needed) {
2577                 m_alloc_cnt(class) += num;
2578                 lck_mtx_unlock(mbuf_mlock);
2579                 return (needed);
2580         }
2581
2582         lck_mtx_unlock(mbuf_mlock);
2583
2584         /*
2585          * We could not satisfy the request using the freelist alone;
2586          * allocate from the appropriate rudimentary caches and use
2587          * whatever we can get to construct the composite objects.
2588          */
2589         needed -= num;
2590
2591         /*
2592          * Mark these allocation requests as coming from a composite cache.
2593          * Also, if the caller is willing to be blocked, mark the request
2594          * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2595          * slab layer waiting for the individual object when one or more
2596          * of the already-constructed composite objects are available.
2597          */
2598         wait |= MCR_COMP;
2599         if (!(wait & MCR_NOSLEEP))
2600                 wait |= MCR_FAILOK;
2601
2602         /* allocate mbufs */
2603         needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2604         if (needed == 0) {
2605                 ASSERT(mp_list == NULL);
2606                 goto fail;
2607         }
2608
2609         /* allocate clusters */
2610         if (class == MC_MBUF_CL) {
2611                 cl_class = MC_CL;
2612         } else if (class == MC_MBUF_BIGCL) {
2613                 cl_class = MC_BIGCL;
2614         } else {
2615                 VERIFY(class == MC_MBUF_16KCL);
2616                 cl_class = MC_16KCL;
2617         }
2618         needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2619         if (needed == 0) {
2620                 ASSERT(clp_list == NULL);
2621                 goto fail;
2622         }
2623
2624         needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2625         if (needed == 0) {
2626                 ASSERT(ref_list == NULL);
2627                 goto fail;
2628         }
2629
2630         /*
2631          * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
2632          * overs will get freed accordingly before we return to caller.
2633          */
2634         for (cnum = 0; cnum < needed; cnum++) {
2635                 struct mbuf *ms;
2636
2637                 m = ms = (struct mbuf *)mp_list;
2638                 mp_list = mp_list->obj_next;
2639
2640                 cl = clp_list;
2641                 clp_list = clp_list->obj_next;
2642                 ((mcache_obj_t *)cl)->obj_next = NULL;
2643
2644                 rfa = (struct ext_ref *)ref_list;
2645                 ref_list = ref_list->obj_next;
2646                 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2647
2648                 /*
2649                  * If auditing is enabled, construct the shadow mbuf
2650                  * in the audit structure instead of in the actual one.
2651                  * mbuf_cslab_audit() will take care of restoring the
2652                  * contents after the integrity check.
2653                  */
2654                 if (mclaudit != NULL) {
2655                         mcache_audit_t *mca, *cl_mca;
2656
2657                         lck_mtx_lock(mbuf_mlock);
2658                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2659                         ms = MCA_SAVED_MBUF_PTR(mca);
2660                         cl_mca = mcl_audit_buf2mca(cl_class,
2661                             (mcache_obj_t *)cl);
2662
2663                         /*
2664                          * Pair them up.  Note that this is done at the time
2665                          * the mbuf+cluster objects are constructed.  This
2666                          * information should be treated as "best effort"
2667                          * debugging hint since more than one mbufs can refer
2668                          * to a cluster.  In that case, the cluster might not
2669                          * be freed along with the mbuf it was paired with.
2670                          */
2671                         mca->mca_uptr = cl_mca;
2672                         cl_mca->mca_uptr = mca;
2673
2674                         ASSERT(mca->mca_uflags & MB_SCVALID);
2675                         ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2676                         lck_mtx_unlock(mbuf_mlock);
2677
2678                         /* Technically, they are in the freelist */
2679                         if (mclverify) {
2680                                 size_t size;
2681
2682                                 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2683                                     m_maxsize(MC_MBUF));
2684
2685                                 if (class == MC_MBUF_CL)
2686                                         size = m_maxsize(MC_CL);
2687                                 else if (class == MC_MBUF_BIGCL)
2688                                         size = m_maxsize(MC_BIGCL);
2689                                 else
2690                                         size = m_maxsize(MC_16KCL);
2691
2692                                 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2693                                     size);
2694                         }
2695                 }
2696
2697                 MBUF_INIT(ms, 0, MT_FREE);
2698                 if (class == MC_MBUF_16KCL) {
2699                         MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2700                 } else if (class == MC_MBUF_BIGCL) {
2701                         MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2702                 } else {
2703                         MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2704                 }
2705                 VERIFY(ms->m_flags == M_EXT);
2706                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2707
2708                 *list = (mcache_obj_t *)m;
2709                 (*list)->obj_next = NULL;
2710                 list = *plist = &(*list)->obj_next;
2711         }
2712
2713 fail:
2714         /*
2715          * Free up what's left of the above.
2716          */
2717         if (mp_list != NULL)
2718                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2719         if (clp_list != NULL)
2720                 mcache_free_ext(m_cache(cl_class), clp_list);
2721         if (ref_list != NULL)
2722                 mcache_free_ext(ref_cache, ref_list);
2723
2724         lck_mtx_lock(mbuf_mlock);
2725         if (num > 0 || cnum > 0) {
2726                 m_total(class) += cnum;
2727                 VERIFY(m_total(class) <= m_maxlimit(class));
2728                 m_alloc_cnt(class) += num + cnum;
2729         }
2730         if ((num + cnum) < want)
2731                 m_fail_cnt(class) += (want - (num + cnum));
2732         lck_mtx_unlock(mbuf_mlock);
2733
2734         return (num + cnum);
2735 }
2736
2737 /*
2738  * Common de-allocator for composite objects called by the CPU cache
2739  * layer when one or more elements need to be returned to the appropriate
2740  * global freelist.
2741  */
2742 static void
2743 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2744 {
2745         mbuf_class_t class = (mbuf_class_t)arg;
2746         unsigned int num;
2747         int w;
2748
2749         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2750
2751         lck_mtx_lock(mbuf_mlock);
2752
2753         num = cslab_free(class, list, purged);
2754         m_free_cnt(class) += num;
2755
2756         if ((w = mb_waiters) > 0)
2757                 mb_waiters = 0;
2758
2759         lck_mtx_unlock(mbuf_mlock);
2760
2761         if (w != 0)
2762                 wakeup(mb_waitchan);
2763 }
2764
2765 /*
2766  * Common auditor for composite objects called by the CPU cache layer
2767  * during an allocation or free request.  For the former, this is called
2768  * after the objects are obtained from either the bucket or slab layer
2769  * and before they are returned to the caller.  For the latter, this is
2770  * called immediately during free and before placing the objects into
2771  * the bucket or slab layer.
2772  */
2773 static void
2774 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2775 {
2776         mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2777         mcache_audit_t *mca;
2778         struct mbuf *m, *ms;
2779         mcl_slab_t *clsp, *nsp;
2780         size_t cl_size;
2781         void *cl;
2782
2783         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2784         if (class == MC_MBUF_CL)
2785                 cl_class = MC_CL;
2786         else if (class == MC_MBUF_BIGCL)
2787                 cl_class = MC_BIGCL;
2788         else
2789                 cl_class = MC_16KCL;
2790         cl_size = m_maxsize(cl_class);
2791
2792         while ((m = ms = (struct mbuf *)list) != NULL) {
2793                 lck_mtx_lock(mbuf_mlock);
2794                 /* Do the mbuf sanity checks and record its transaction */
2795                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2796                 mcl_audit_mbuf(mca, m, TRUE, alloc);
2797                 if (mcltrace)
2798                         mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2799
2800                 if (alloc)
2801                         mca->mca_uflags |= MB_COMP_INUSE;
2802                 else
2803                         mca->mca_uflags &= ~MB_COMP_INUSE;
2804
2805                 /*
2806                  * Use the shadow mbuf in the audit structure if we are
2807                  * freeing, since the contents of the actual mbuf has been
2808                  * pattern-filled by the above call to mcl_audit_mbuf().
2809                  */
2810                 if (!alloc && mclverify)
2811                         ms = MCA_SAVED_MBUF_PTR(mca);
2812
2813                 /* Do the cluster sanity checks and record its transaction */
2814                 cl = ms->m_ext.ext_buf;
2815                 clsp = slab_get(cl);
2816                 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2817                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2818                 if (class == MC_MBUF_CL)
2819                         VERIFY(clsp->sl_refcnt >= 1 &&
2820                             clsp->sl_refcnt <= NCLPG);
2821                 else
2822                         VERIFY(clsp->sl_refcnt >= 1 &&
2823                             clsp->sl_refcnt <= NBCLPG);
2824
2825                 if (class == MC_MBUF_16KCL) {
2826                         int k;
2827                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2828                                 nsp = nsp->sl_next;
2829                                 /* Next slab must already be present */
2830                                 VERIFY(nsp != NULL);
2831                                 VERIFY(nsp->sl_refcnt == 1);
2832                         }
2833                 }
2834
2835
2836                 mca = mcl_audit_buf2mca(cl_class, cl);
2837                 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2838                 if (mcltrace)
2839                         mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2840
2841                 if (alloc)
2842                         mca->mca_uflags |= MB_COMP_INUSE;
2843                 else
2844                         mca->mca_uflags &= ~MB_COMP_INUSE;
2845                 lck_mtx_unlock(mbuf_mlock);
2846
2847                 list = list->obj_next;
2848         }
2849 }
2850
2851 static void
2852 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2853                  uint64_t alloc_size, kern_return_t error)
2854 {
2855
2856         *cnt = *cnt + 1;
2857         *ts = net_uptime();
2858         if (size) {
2859                 *size = alloc_size;
2860         }
2861         _CASSERT(sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]) ==
2862             sizeof(mb_kmem_stats_labels) / sizeof(mb_kmem_stats_labels[0]));
2863         switch (error) {
2864         case KERN_SUCCESS:
2865                 break;
2866         case KERN_INVALID_ARGUMENT:
2867                 mb_kmem_stats[0]++;
2868                 break;
2869         case KERN_INVALID_ADDRESS:
2870                 mb_kmem_stats[1]++;
2871                 break;
2872         case KERN_RESOURCE_SHORTAGE:
2873                 mb_kmem_stats[2]++;
2874                 break;
2875         case KERN_NO_SPACE:
2876                 mb_kmem_stats[3]++;
2877                 break;
2878         case KERN_FAILURE:
2879                 mb_kmem_stats[4]++;
2880                 break;
2881         default:
2882                 mb_kmem_stats[5]++;
2883                 break;
2884         }
2885 }
2886
2887 /*
2888  * Allocate some number of mbuf clusters and place on cluster freelist.
      *
      * The caller must hold mbuf_mlock.  The lock is dropped while the pages
      * and the audit structures are being allocated, and it is held again
      * on every return path.
      *
      *   num      number of clusters the caller wants; handed to m_howmany()
      *            to decide by how much to grow
      *   wait     when M_DONTWAIT is set no pages are allocated here; the
      *            mbuf worker thread may be woken instead to grow the pool
      *   bufsize  cluster size; must equal m_maxsize(MC_BIGCL) or
      *            m_maxsize(MC_16KCL)
      *
      * Returns the number of clusters added to the freelist when pages were
      * mapped.  On the "out" path (nothing to grow, non-blocking request or
      * allocation failure) returns 1 if the class already has at least num
      * free clusters and 0 otherwise.
2889  */
2890 static int
2891 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2892 {
2893         int i, count = 0;
2894         vm_size_t size = 0;
2895         int numpages = 0, large_buffer;
2896         vm_offset_t page = 0;
2897         mcache_audit_t *mca_list = NULL;
2898         mcache_obj_t *con_list = NULL;
2899         mcl_slab_t *sp;
2900         mbuf_class_t class;
2901         kern_return_t error;
2902
2903         /* Set if a buffer allocation needs allocation of multiple pages */
2904         large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2905                 PAGE_SIZE < M16KCLBYTES);
2906         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2907             bufsize == m_maxsize(MC_16KCL));
2908
2909         VERIFY((bufsize == PAGE_SIZE) ||
2910             (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2911
2912         if (bufsize == m_size(MC_BIGCL))
2913                 class = MC_BIGCL;
2914         else
2915                 class = MC_16KCL;
2916
2917         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2918
2919         /*
2920          * Multiple threads may attempt to populate the cluster map one
2921          * after another.  Since we drop the lock below prior to acquiring
2922          * the physical page(s), our view of the cluster map may no longer
2923          * be accurate, and we could end up over-committing the pages beyond
2924          * the maximum allowed for each class.  To prevent it, this entire
2925          * operation (including the page mapping) is serialized.
2926          */
2927         while (mb_clalloc_busy) {
2928                 mb_clalloc_waiters++;
2929                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2930                     (PZERO-1), "m_clalloc", NULL);
2931                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2932         }
2933
2934         /* We are busy now; tell everyone else to go away */
2935         mb_clalloc_busy = TRUE;
2936
2937         /*
2938          * Honor the caller's wish to block or not block.  We have a way
2939          * to grow the pool asynchronously using the mbuf worker thread.
2940          */
2941         i = m_howmany(num, bufsize);
2942         if (i <= 0 || (wait & M_DONTWAIT))
2943                 goto out;
2944
2945         lck_mtx_unlock(mbuf_mlock);
2946
2947         size = round_page(i * bufsize);
2948         page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2949
2950         /*
2951          * If we did ask for "n" 16KB physically contiguous chunks
2952          * and didn't get them, then please try again without this
2953          * restriction.
2954          */
2955         net_update_uptime();
2956         if (large_buffer && page == 0) {
2957                 m_vm_error_stats(&mb_kmem_contig_failed,
2958                     &mb_kmem_contig_failed_ts,
2959                     &mb_kmem_contig_failed_size,
2960                     size, error);
2961                 page = kmem_mb_alloc(mb_map, size, 0, &error);
2962         }
2963
2964         if (page == 0) {
2965                 m_vm_error_stats(&mb_kmem_failed,
2966                     &mb_kmem_failed_ts,
2967                     &mb_kmem_failed_size,
2968                     size, error);
2969 #if PAGE_SIZE == 4096
2970                 if (bufsize == m_maxsize(MC_BIGCL)) {
2971 #else
2972                 if (bufsize >= m_maxsize(MC_BIGCL)) {
2973 #endif
2974                         /* Try for 1 page if failed */
2975                         size = PAGE_SIZE;
2976                         page = kmem_mb_alloc(mb_map, size, 0, &error);
2977                         if (page == 0) {
2978                                 m_vm_error_stats(&mb_kmem_one_failed,
2979                                     &mb_kmem_one_failed_ts,
2980                                     NULL, size, error);
2981                         }
2982                 }
2983
2984                 if (page == 0) {
                             /* The "out" path asserts that mbuf_mlock is held */
2985                         lck_mtx_lock(mbuf_mlock);
2986                         goto out;
2987                 }
2988         }
2989
2990         VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
2991         numpages = size / PAGE_SIZE;
2992
2993         /* If auditing is enabled, allocate the audit structures now */
2994         if (mclaudit != NULL) {
2995                 int needed;
2996
2997                 /*
2998                  * Yes, I realize this is a waste of memory for clusters
2999                  * that never get transformed into mbufs, as we may end
3000                  * up with NMBPG-1 unused audit structures per cluster.
3001                  * But doing so tremendously simplifies the allocation
3002                  * strategy, since at this point we are not holding the
3003                  * mbuf lock and the caller is okay to be blocked.
3004                  */
3005                 if (bufsize == PAGE_SIZE) {
3006                         needed = numpages * NMBPG;
3007
3008                         i = mcache_alloc_ext(mcl_audit_con_cache,
3009                             &con_list, needed, MCR_SLEEP);
3010
3011                         VERIFY(con_list != NULL && i == needed);
3012                 } else {
3013                         /*
3014                          * if multiple 4K pages are being used for a
3015                          * 16K cluster
3016                          */
3017                         needed = numpages / NSLABSP16KB;
3018                 }
3019
3020                 i = mcache_alloc_ext(mcache_audit_cache,
3021                     (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3022
3023                 VERIFY(mca_list != NULL && i == needed);
3024         }
3025
3026         lck_mtx_lock(mbuf_mlock);
3027
             /*
              * Walk the freshly allocated pages: record each one in
              * mcl_paddr[] and set up the slab(s) that describe it.
              */
3028         for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3029                 ppnum_t offset =
3030                     ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3031                 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3032
3033                 /*
3034                  * If there is a mapper the appropriate I/O page is
3035                  * returned; zero out the page to discard its past
3036                  * contents to prevent exposing leftover kernel memory.
3037                  */
3038                 VERIFY(offset < mcl_pages);
3039                 if (mcl_paddr_base != 0) {
3040                         bzero((void *)(uintptr_t) page, PAGE_SIZE);
3041                         new_page = IOMapperInsertPage(mcl_paddr_base,
3042                             offset, new_page);
3043                 }
3044                 mcl_paddr[offset] = new_page;
3045
3046                 /* Pattern-fill this fresh page */
3047                 if (mclverify) {
3048                         mcache_set_pattern(MCACHE_FREE_PATTERN,
3049                             (caddr_t)page, PAGE_SIZE);
3050                 }
3051                 if (bufsize == PAGE_SIZE) {
3052                         mcache_obj_t *buf;
3053                         /* One for the entire page */
3054                         sp = slab_get((void *)page);
3055                         if (mclaudit != NULL) {
3056                                 mcl_audit_init((void *)page,
3057                                     &mca_list, &con_list,
3058                                     AUDIT_CONTENTS_SIZE, NMBPG);
3059                         }
3060                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3061                         slab_init(sp, class, SLF_MAPPED, (void *)page,
3062                             (void *)page, PAGE_SIZE, 0, 1);
3063                         buf = (mcache_obj_t *)page;
3064                         buf->obj_next = NULL;
3065
3066                         /* Insert this slab */
3067                         slab_insert(sp, class);
3068
3069                         /* Update stats now since slab_get drops the lock */
3070                         ++m_infree(class);
3071                         ++m_total(class);
3072                         VERIFY(m_total(class) <= m_maxlimit(class));
3073                         if (class == MC_BIGCL) {
3074                                 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3075                                     m_infree(MC_MBUF_BIGCL);
3076                                 mbstat.m_bigclusters = m_total(MC_BIGCL);
3077                         }
3078                         ++count;
3079                 } else if ((bufsize > PAGE_SIZE) &&
3080                     (i % NSLABSP16KB) == 0) {
                             /*
                              * Only the first page of each group of NSLABSP16KB
                              * pages sets up the 16KB cluster; the slabs of the
                              * remaining pages are initialized by the inner loop
                              * below, so those iterations have nothing to do here.
                              */
3081                         union m16kcluster *m16kcl = (union m16kcluster *)page;
3082                         mcl_slab_t *nsp;
3083                         int k;
3084
3085                         /* One for the entire 16KB */
3086                         sp = slab_get(m16kcl);
3087                         if (mclaudit != NULL)
3088                                 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3089
3090                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3091                         slab_init(sp, MC_16KCL, SLF_MAPPED,
3092                             m16kcl, m16kcl, bufsize, 0, 1);
3093                         m16kcl->m16kcl_next = NULL;
3094
3095                         /*
3096                          * 2nd-Nth page's slab is part of the first one,
3097                          * where N is NSLABSP16KB.
3098                          */
3099                         for (k = 1; k < NSLABSP16KB; k++) {
3100                                 nsp = slab_get(((union mbigcluster *)page) + k);
3101                                 VERIFY(nsp->sl_refcnt == 0 &&
3102                                     nsp->sl_flags == 0);
3103                                 slab_init(nsp, MC_16KCL,
3104                                     SLF_MAPPED | SLF_PARTIAL,
3105                                     m16kcl, NULL, 0, 0, 0);
3106                         }
3107                         /* Insert this slab */
3108                         slab_insert(sp, MC_16KCL);
3109
3110                         /* Update stats now since slab_get drops the lock */
3111                         ++m_infree(MC_16KCL);
3112                         ++m_total(MC_16KCL);
3113                         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3114                         ++count;
3115                 }
3116         }
             /*
              * Every audit structure allocated above must have been handed
              * out by now (presumably consumed by mcl_audit_init()).
              */
3117         VERIFY(mca_list == NULL && con_list == NULL);
3118
3119         if (!mb_peak_newreport && mbuf_report_usage(class))
3120                 mb_peak_newreport = TRUE;
3121
3122         /* We're done; let others enter */
3123         mb_clalloc_busy = FALSE;
3124         if (mb_clalloc_waiters > 0) {
3125                 mb_clalloc_waiters = 0;
3126                 wakeup(mb_clalloc_waitchan);
3127         }
3128
3129         return (count);
3130 out:
3131         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3132
3133         mtracelarge_register(size);
3134
3135         /* We're done; let others enter */
3136         mb_clalloc_busy = FALSE;
3137         if (mb_clalloc_waiters > 0) {
3138                 mb_clalloc_waiters = 0;
3139                 wakeup(mb_clalloc_waitchan);
3140         }
3141
3142         /*
3143          * When non-blocking we kick a thread if we have to grow the
3144          * pool or if the number of free clusters is less than requested.
3145          */
3146         if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3147                 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3148                 mbuf_worker_needs_wakeup = FALSE;
3149         }
3150         if (class == MC_BIGCL) {
3151                 if (i > 0) {
3152                         /*
3153                          * Remember total number of 4KB clusters needed
3154                          * at this time.
3155                          */
3156                         i += m_total(MC_BIGCL);
3157                         if (i > m_region_expand(MC_BIGCL)) {
3158                                 m_region_expand(MC_BIGCL) = i;
3159                         }
3160                 }
                     /* Report success if enough clusters are already free */
3161                 if (m_infree(MC_BIGCL) >= num)
3162                         return (1);
3163         } else {
3164                 if (i > 0) {
3165                         /*
3166                          * Remember total number of 16KB clusters needed
3167                          * at this time.
3168                          */
3169                         i += m_total(MC_16KCL);
3170                         if (i > m_region_expand(MC_16KCL)) {
3171                                 m_region_expand(MC_16KCL) = i;
3172                         }
3173                 }
                     /* Report success if enough clusters are already free */
3174                 if (m_infree(MC_16KCL) >= num)
3175                         return (1);
3176         }
3177         return (0);
3178 }
3179
3180 /*
3181  * Populate the global freelist of the corresponding buffer class.
      *
      * The caller must hold mbuf_mlock.  Classes whose size is at least
      * PAGE_SIZE are handed straight to m_clalloc().  For smaller classes,
      * whole pages are taken from the page-sized class (super_class) and
      * the slab of each page is re-initialized to hold numobj objects of
      * the requested class.
      *
      *   class  MC_MBUF, MC_CL, MC_BIGCL or MC_16KCL
      *   num    number of objects wanted
      *   wait   blocking flags, passed through to m_clalloc() and
      *          slab_alloc()
      *
      * Returns non-zero if m_clalloc() returned non-zero (page-sized and
      * larger classes) or if at least one page was converted (smaller
      * classes); zero otherwise.
3182  */
3183 static int
3184 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
3185 {
3186         mcache_obj_t *o = NULL;
3187         int i, numpages = 0, count;
3188         mbuf_class_t super_class;
3189
3190         VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
3191             class == MC_16KCL);
3192
3193         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3194
3195         VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
3196             PAGE_SIZE == m_maxsize(MC_16KCL));
3197
3198         if (m_maxsize(class) >= PAGE_SIZE)
3199                 return(m_clalloc(num, wait, m_maxsize(class)) != 0);
3200
3201         /*
3202          * The rest of the function will allocate pages and will slice
3203          * them up into the right size
3204          */
3205
             /* Round up to the number of whole pages that hold num objects */
3206         numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
3207
3208         /* Currently assume that pages are 4K or 16K */
3209         if (PAGE_SIZE == m_maxsize(MC_BIGCL))
3210                 super_class = MC_BIGCL;
3211         else
3212                 super_class = MC_16KCL;
3213
             /*
              * The return value is not examined: "i" is overwritten before
              * it is read again, and the loop below relies on slab_alloc()
              * to find out whether pages are actually available.
              */
3214         i = m_clalloc(numpages, wait, m_maxsize(super_class));
3215
3216         /* how many objects will we cut the page into? */
3217         int numobj = PAGE_SIZE / m_maxsize(class);
3218
3219         for (count = 0; count < numpages; count++) {
3220                 /* respect totals, minlimit, maxlimit */
3221                 if (m_total(super_class) <= m_minlimit(super_class) ||
3222                     m_total(class) >= m_maxlimit(class))
3223                         break;
3224
3225                 if ((o = slab_alloc(super_class, wait)) == NULL)
3226                         break;
3227
                     /* Typed views of the same page, one per target class */
3228                 struct mbuf *m = (struct mbuf *)o;
3229                 union mcluster *c = (union mcluster *)o;
3230                 union mbigcluster *mbc = (union mbigcluster *)o;
3231                 mcl_slab_t *sp = slab_get(o);
3232                 mcache_audit_t *mca = NULL;
3233
3234                 /*
3235                  * since one full page will be converted to MC_MBUF or
3236                  * MC_CL, verify that the reference count will match that
3237                  * assumption
3238                  */
3239                 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
3240                 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
3241                 /*
3242                  * Make sure that the cluster is unmolested
3243                  * while in freelist
3244                  */
3245                 if (mclverify) {
3246                         mca = mcl_audit_buf2mca(super_class,
3247                             (mcache_obj_t *)o);
3248                         mcache_audit_free_verify(mca,
3249                             (mcache_obj_t *)o, 0, m_maxsize(super_class));
3250                 }
3251
3252                 /* Reinitialize it as an mbuf or 2K or 4K slab */
3253                 slab_init(sp, class, sp->sl_flags,
3254                     sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
3255
3256                 VERIFY(sp->sl_head == NULL);
3257
                     /* The page no longer counts as a super_class cluster */
3258                 VERIFY(m_total(super_class) >= 1);
3259                 m_total(super_class)--;
3260
3261                 if (super_class == MC_BIGCL)
3262                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3263
3264                 m_total(class) += numobj;
3265                 VERIFY(m_total(class) <= m_maxlimit(class));
3266                 m_infree(class) += numobj;
3267
3268                 if (!mb_peak_newreport && mbuf_report_usage(class))
3269                         mb_peak_newreport = TRUE;
3270
                     /*
                      * Thread each of the numobj objects carved from the page
                      * onto the slab's sl_head list.
                      */
3271                 i = numobj;
3272                 if (class == MC_MBUF) {
3273                         mbstat.m_mbufs = m_total(MC_MBUF);
3274                         mtype_stat_add(MT_FREE, NMBPG);
3275                         while (i--) {
3276                                 /*
3277                                  * If auditing is enabled, construct the
3278                                  * shadow mbuf in the audit structure
3279                                  * instead of the actual one.
3280                                  * mbuf_slab_audit() will take care of
3281                                  * restoring the contents after the
3282                                  * integrity check.
3283                                  */
3284                                 if (mclaudit != NULL) {
3285                                         struct mbuf *ms;
3286                                         mca = mcl_audit_buf2mca(MC_MBUF,
3287                                             (mcache_obj_t *)m);
3288                                         ms = MCA_SAVED_MBUF_PTR(mca);
3289                                         ms->m_type = MT_FREE;
3290                                 } else {
3291                                         m->m_type = MT_FREE;
3292                                 }
3293                                 m->m_next = sp->sl_head;
3294                                 sp->sl_head = (void *)m++;
3295                         }
3296                 } else if (class == MC_CL) { /* MC_CL */
3297                         mbstat.m_clfree =
3298                             m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3299                         mbstat.m_clusters = m_total(MC_CL);
3300                         while (i--) {
3301                                 c->mcl_next = sp->sl_head;
3302                                 sp->sl_head = (void *)c++;
3303                         }
3304                 } else {
3305                         VERIFY(class == MC_BIGCL);
3306                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3307                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3308                             m_infree(MC_MBUF_BIGCL);
3309                         while (i--) {
3310                                 mbc->mbc_next = sp->sl_head;
3311                                 sp->sl_head = (void *)mbc++;
3312                         }
3313                 }
3314
3315                 /* Insert into the mbuf or 2k or 4k slab list */
3316                 slab_insert(sp, class);
3317
                     /* Clear the waiter count and wake anyone recorded in it */
3318                 if ((i = mb_waiters) > 0)
3319                         mb_waiters = 0;
3320                 if (i != 0)
3321                         wakeup(mb_waitchan);
3322         }
             /* count is the number of pages that were fully converted */
3323         return (count != 0);
3324 }
3325
3326 /*
3327  * For each class, initialize the freelist to hold m_minlimit() objects.
3328  */
3329 static void
3330 freelist_init(mbuf_class_t class)
3331 {
3332         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3333
3334         VERIFY(class == MC_CL || class == MC_BIGCL);
3335         VERIFY(m_total(class) == 0);
3336         VERIFY(m_minlimit(class) > 0);
3337
3338         while (m_total(class) < m_minlimit(class))
3339                 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3340
3341         VERIFY(m_total(class) >= m_minlimit(class));
3342 }
3343
3344 /*
3345  * (Inaccurately) check if it might be worth a trip back to the
3346  * mcache layer due the availability of objects there.  We'll
3347  * end up back here if there's nothing up there.
3348  */
3349 static boolean_t
3350 mbuf_cached_above(mbuf_class_t class, int wait)
3351 {
3352         switch (class) {
3353         case MC_MBUF:
3354                 if (wait & MCR_COMP)
3355                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3356                             !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3357                 break;
3358
3359         case MC_CL:
3360                 if (wait & MCR_COMP)
3361                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3362                 break;
3363
3364         case MC_BIGCL:
3365                 if (wait & MCR_COMP)
3366                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3367                 break;
3368
3369         case MC_16KCL:
3370                 if (wait & MCR_COMP)
3371                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3372                 break;
3373
3374         case MC_MBUF_CL:
3375         case MC_MBUF_BIGCL:
3376         case MC_MBUF_16KCL:
3377                 break;
3378
3379         default:
3380                 VERIFY(0);
3381                 /* NOTREACHED */
3382         }
3383
3384         return (!mcache_bkt_isempty(m_cache(class)));
3385 }
3386
3387 /*
3388  * If possible, convert constructed objects to raw ones.
3389  */
3390 static boolean_t
3391 mbuf_steal(mbuf_class_t class, unsigned int num)
3392 {
3393         mcache_obj_t *top = NULL;
3394         mcache_obj_t **list = &top;
3395         unsigned int tot = 0;
3396
3397         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3398
3399         switch (class) {
3400         case MC_MBUF:
3401         case MC_CL:
3402         case MC_BIGCL:
3403         case MC_16KCL:
3404                 return (FALSE);
3405
3406         case MC_MBUF_CL:
3407         case MC_MBUF_BIGCL:
3408         case MC_MBUF_16KCL:
3409                 /* Get the required number of constructed objects if possible */
3410                 if (m_infree(class) > m_minlimit(class)) {
3411                         tot = cslab_alloc(class, &list,
3412                             MIN(num, m_infree(class)));
3413                 }
3414
3415                 /* And destroy them to get back the raw objects */
3416                 if (top != NULL)
3417                         (void) cslab_free(class, top, 1);
3418                 break;
3419
3420         default:
3421                 VERIFY(0);
3422                 /* NOTREACHED */
3423         }
3424
3425         return (tot == num);
3426 }
3427
/*
 * Try to free up objects for a class that has run short.
 *
 * The classes related to "class" are marked as purge candidates, then
 * mbuf_steal() is attempted on every marked class.  Classes for which the
 * steal did not yield "num" objects are purged from their mcache after the
 * domains have been asked to drain; if no class needs purging, mcache is
 * only asked to reap.
 *
 *   class  MC_MBUF, MC_CL, MC_BIGCL or MC_16KCL (anything else trips
 *          VERIFY)
 *   num    number of objects to try to steal from each marked class
 *   comp   when TRUE, the composite class built on "class" itself is not
 *          marked
 *
 * Called with mbuf_mlock held; the lock is dropped for the drain/purge/
 * reap work and is held again on return.
 */
3428 static void
3429 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3430 {
3431         int m, bmap = 0;
3432
3433         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3434
3435         VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3436         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3437         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3438
3439         /*
3440          * This logic can be made smarter; for now, simply mark
3441          * all other related classes as potential victims.
3442          */
3443         switch (class) {
3444         case MC_MBUF:
3445                 m_wantpurge(MC_CL)++;
3446                 m_wantpurge(MC_BIGCL)++;
3447                 m_wantpurge(MC_MBUF_CL)++;
3448                 m_wantpurge(MC_MBUF_BIGCL)++;
3449                 break;
3450
3451         case MC_CL:
3452                 m_wantpurge(MC_MBUF)++;
3453                 m_wantpurge(MC_BIGCL)++;
3454                 m_wantpurge(MC_MBUF_BIGCL)++;
3455                 if (!comp)
3456                         m_wantpurge(MC_MBUF_CL)++;
3457                 break;
3458
3459         case MC_BIGCL:
3460                 m_wantpurge(MC_MBUF)++;
3461                 m_wantpurge(MC_CL)++;
3462                 m_wantpurge(MC_MBUF_CL)++;
3463                 if (!comp)
3464                         m_wantpurge(MC_MBUF_BIGCL)++;
3465                 break;
3466
3467         case MC_16KCL:
3468                 if (!comp)
3469                         m_wantpurge(MC_MBUF_16KCL)++;
3470                 break;
3471
3472         default:
3473                 VERIFY(0);
3474                 /* NOTREACHED */
3475         }
3476
3477         /*
3478          * Run through each marked class and check if we really need to
3479          * purge (and therefore temporarily disable) the per-CPU caches
3480          * layer used by the class.  If so, remember the classes since
3481          * we are going to drop the lock below prior to purging.
3482          */
3483         for (m = 0; m < NELEM(mbuf_table); m++) {
3484                 if (m_wantpurge(m) > 0) {
3485                         m_wantpurge(m) = 0;
3486                         /*
3487                          * Try hard to steal the required number of objects
3488                          * from the freelist of other mbuf classes.  Only
3489                          * purge and disable the per-CPU caches layer when
3490                          * we don't have enough; it's the last resort.
3491                          */
                             /*
                              * Note that mbuf_steal() always returns FALSE for
                              * the raw classes, so a marked raw class always
                              * ends up in the purge bitmap.
                              */
3492                         if (!mbuf_steal(m, num))
3493                                 bmap |= (1 << m);
3494                 }
3495         }
3496
3497         lck_mtx_unlock(mbuf_mlock);
3498
3499         if (bmap != 0) {
3500                 /* signal the domains to drain */
3501                 net_drain_domains();
3502
3503                 /* Sigh; we have no other choices but to ask mcache to purge */
3504                 for (m = 0; m < NELEM(mbuf_table); m++) {
3505                         if ((bmap & (1 << m)) &&
3506                             mcache_purge_cache(m_cache(m), TRUE)) {
                                     /* Take the lock only to update counters */
3507                                 lck_mtx_lock(mbuf_mlock);
3508                                 m_purge_cnt(m)++;
3509                                 mbstat.m_drain++;
3510                                 lck_mtx_unlock(mbuf_mlock);
3511                         }
3512                 }
3513         } else {
3514                 /*
3515                  * Request mcache to reap extra elements from all of its caches;
3516                  * note that all reaps are serialized and happen only at a fixed
3517                  * interval.
3518                  */
3519                 mcache_reap();
3520         }
             /* Return to the caller with the lock held, as on entry */
3521         lck_mtx_lock(mbuf_mlock);
3522 }
3523
3524 static inline struct mbuf *
3525 m_get_common(int wait, short type, int hdr)
3526 {
3527         struct mbuf *m;
3528         int mcflags = MSLEEPF(wait);
3529
3530         /* Is this due to a non-blocking retry?  If so, then try harder */
3531         if (mcflags & MCR_NOSLEEP)
3532                 mcflags |= MCR_TRYHARD;
3533
3534         m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3535         if (m != NULL) {
3536                 MBUF_INIT(m, hdr, type);
3537                 mtype_stat_inc(type);
3538                 mtype_stat_dec(MT_FREE);
3539 #if CONFIG_MACF_NET
3540                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3541                         m_free(m);
3542                         return (NULL);
3543                 }
3544 #endif /* MAC_NET */
3545         }
3546         return (m);
3547 }
3548
3549 /*
3550  * Space allocation routines; these are also available as macros
3551  * for critical paths.
      *
      * _M_GET and _M_GETHDR call m_get_common() with hdr set to 0 and 1
      * respectively.  The _M_RETRY forms are plain aliases of the non-retry
      * forms.  _MGET and _MGETHDR store the result in their first argument.
3552  */
3553 #define _M_GET(wait, type)      m_get_common(wait, type, 0)
3554 #define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
3555 #define _M_RETRY(wait, type)    _M_GET(wait, type)
3556 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3557 #define _MGET(m, how, type)     ((m) = _M_GET(how, type))
3558 #define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))
3559
/*
 * Allocate an mbuf of the given type through _M_GET(), i.e.
 * m_get_common() with hdr == 0.  Returns NULL on failure.
 */
struct mbuf *
m_get(int wait, int type)
{
        struct mbuf *m;

        m = _M_GET(wait, type);
        return (m);
}
3565
/*
 * Allocate an mbuf of the given type through _M_GETHDR(), i.e.
 * m_get_common() with hdr == 1.  Returns NULL on failure.
 */
struct mbuf *
m_gethdr(int wait, int type)
{
        struct mbuf *m;

        m = _M_GETHDR(wait, type);
        return (m);
}
3571
/*
 * Retry form of m_get().  _M_RETRY() is an alias of _M_GET(), so this
 * performs the same allocation.  Returns NULL on failure.
 */
struct mbuf *
m_retry(int wait, int type)
{
        struct mbuf *m;

        m = _M_RETRY(wait, type);
        return (m);
}
3577
/*
 * Retry form of m_gethdr().  _M_RETRYHDR() is an alias of _M_GETHDR(),
 * so this performs the same allocation.  Returns NULL on failure.
 */
struct mbuf *
m_retryhdr(int wait, int type)
{
        struct mbuf *m;

        m = _M_RETRYHDR(wait, type);
        return (m);
}
3583
3584 struct mbuf *
3585 m_getclr(int wait, int type)
3586 {
3587         struct mbuf *m;
3588
3589         _MGET(m, wait, type);
3590         if (m != NULL)
3591                 bzero(MTOD(m, caddr_t), MLEN);
3592         return (m);
3593 }
3594
/*
 * Drop one paired reference on an mbuf whose external buffer carries
 * EXTF_PAIRED (both conditions are enforced with VERIFY).
 *
 * If the mbuf is still the paired mbuf of its cluster, the paired
 * reference count is decremented atomically and the new value decides
 * what happens:
 *   > 1   nothing else is done; returns 1
 *   == 1  the external free routine is called on the buffer; returns 1
 *   == 0  the pairing is dissolved: minref is reset, MEXT_PMBUF is
 *         cleared and the free routine is reset according to the
 *         cluster size; returns 0
 *   < 0   ignored (see the comment in the body); returns 0
 * If the mbuf is no longer the paired mbuf, returns 0 right away.
 *
 * A return of 0 tells the caller that the unpair has occurred and the
 * cluster reference held for the paired mbuf should be dropped.  What the
 * caller does on 1 is not visible here (presumably it leaves the cluster
 * alone -- confirm against m_free()).
 */
3595 static int
3596 m_free_paired(struct mbuf *m)
3597 {
3598         VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
3599
3600         membar_sync();
3601         if (MEXT_PMBUF(m) == m) {
3602                 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
3603                 int16_t oprefcnt, prefcnt;
3604
3605                 /*
3606                  * Paired ref count might be negative in case we lose
3607                  * against another thread clearing MEXT_PMBUF, in the
3608                  * event it occurs after the above memory barrier sync.
3609                  * In that case just ignore as things have been unpaired.
3610                  */
                     /* Retry the decrement until the compare-and-swap succeeds */
3611                 do {
3612                         oprefcnt = *addr;
3613                         prefcnt = oprefcnt - 1;
3614                 } while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));
3615
3616                 if (prefcnt > 1) {
3617                         return (1);
3618                 } else if (prefcnt == 1) {
                             /* Invoke the external free routine on the buffer */
3619                         (*(m_get_ext_free(m)))(m->m_ext.ext_buf,
3620                             m->m_ext.ext_size, m_get_ext_arg(m));
3621                         return (1);
3622                 } else if (prefcnt == 0) {
3623                         VERIFY(MBUF_IS_PAIRED(m));
3624
3625                         /*
3626                          * Restore minref to its natural value, so that
3627                          * the caller will be able to free the cluster
3628                          * as appropriate.
3629                          */
3630                         MEXT_MINREF(m) = 0;
3631
3632                         /*
3633                          * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
3634                          * as it is immutable.  atomic_set_ptr also causes
3635                          * memory barrier sync.
3636                          */
3637                         atomic_set_ptr(&MEXT_PMBUF(m), NULL);
3638
                             /*
                              * Reset the external free routine to the one that
                              * goes with the cluster size (none for MCLBYTES).
                              */
3639                         switch (m->m_ext.ext_size) {
3640                         case MCLBYTES:
3641                                 m_set_ext(m, m_get_rfa(m), NULL, NULL);
3642                                 break;
3643
3644                         case MBIGCLBYTES:
3645                                 m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
3646                                 break;
3647
3648                         case M16KCLBYTES:
3649                                 m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
3650                                 break;
3651
3652                         default:
3653                                 VERIFY(0);
3654                                 /* NOTREACHED */
3655                         }
3656                 }
3657         }
3658
3659         /*
3660          * Tell caller the unpair has occurred, and that the reference
3661          * count on the external cluster held for the paired mbuf should
3662          * now be dropped.
3663          */
3664         return (0);
3665 }
3666
3667 struct mbuf *
3668 m_free(struct mbuf *m)
3669 {
3670         struct mbuf *n = m->m_next;
3671
3672         if (m->m_type == MT_FREE)
3673                 panic("m_free: freeing an already freed mbuf");
3674
3675         if (m->m_flags & M_PKTHDR) {
3676                 /* Check for scratch area overflow */
3677                 m_redzone_verify(m);
3678                 /* Free the aux data and tags if there is any */
3679                 m_tag_delete_chain(m, NULL);
3680
3681                 m_do_tx_compl_callback(m, NULL);
3682         }
3683
3684         if (m->m_flags & M_EXT) {
3685                 u_int16_t refcnt;
3686                 u_int32_t composite;
3687                 m_ext_free_func_t m_free_func;
3688
3689                 if (MBUF_IS_PAIRED(m) && m_free_paired(m))
3690                         return (n);
3691
3692                 refcnt = m_decref(m);
3693                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3694                 m_free_func = m_get_ext_free(m);
3695
3696                 if (refcnt == MEXT_MINREF(m) && !composite) {
3697                         if (m_free_func == NULL) {
3698                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3699                         } else if (m_free_func == m_bigfree) {
3700                                 mcache_free(m_cache(MC_BIGCL),
3701                                     m->m_ext.ext_buf);
3702                         } else if (m_free_func == m_16kfree) {
3703                                 mcache_free(m_cache(MC_16KCL),
3704                                     m->m_ext.ext_buf);
3705                         } else {
3706                                 (*m_free_func)(m->m_ext.ext_buf,
3707                                     m->m_ext.ext_size, m_get_ext_arg(m));
3708                         }
3709                         mcache_free(ref_cache, m_get_rfa(m));
3710                         m_set_ext(m, NULL, NULL, NULL);
3711                 } else if (refcnt == MEXT_MINREF(m) && composite) {
3712                         VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
3713                         VERIFY(m->m_type != MT_FREE);
3714
3715                         mtype_stat_dec(m->m_type);
3716                         mtype_stat_inc(MT_FREE);
3717
3718                         m->m_type = MT_FREE;
3719                         m->m_flags = M_EXT;
3720                         m->m_len = 0;
3721                         m->m_next = m->m_nextpkt = NULL;
3722
3723                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3724
3725                         /* "Free" into the intermediate cache */
3726                         if (m_free_func == NULL) {
3727                                 mcache_free(m_cache(MC_MBUF_CL), m);
3728                         } else if (m_free_func == m_bigfree) {
3729                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3730                         } else {
3731                                 VERIFY(m_free_func == m_16kfree);
3732                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3733                         }
3734                         return (n);
3735                 }
3736         }
3737
3738         if (m->m_type != MT_FREE) {
3739                 mtype_stat_dec(m->m_type);
3740                 mtype_stat_inc(MT_FREE);
3741         }
3742
3743         m->m_type = MT_FREE;
3744         m->m_flags = m->m_len = 0;
3745         m->m_next = m->m_nextpkt = NULL;
3746
3747         mcache_free(m_cache(MC_MBUF), m);
3748
3749         return (n);
3750 }
3751
3752 __private_extern__ struct mbuf *
3753 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3754     void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3755     int wait, int pair)
3756 {
3757         struct ext_ref *rfa = NULL;
3758
3759         /*
3760          * If pairing is requested and an existing mbuf is provided, reject
3761          * it if it's already been paired to another cluster.  Otherwise,
3762          * allocate a new one or free any existing below.
3763          */
3764         if ((m != NULL && MBUF_IS_PAIRED(m)) ||
3765             (m == NULL && (m = _M_GETHDR(wait, type)) == NULL))
3766                 return (NULL);
3767
3768         if (m->m_flags & M_EXT) {
3769                 u_int16_t refcnt;
3770                 u_int32_t composite;
3771                 m_ext_free_func_t m_free_func;
3772
3773                 refcnt = m_decref(m);
3774                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3775                 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
3776                 m_free_func = m_get_ext_free(m);
3777                 if (refcnt == MEXT_MINREF(m) && !composite) {
3778                         if (m_free_func == NULL) {
3779                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3780                         } else if (m_free_func == m_bigfree) {
3781                                 mcache_free(m_cache(MC_BIGCL),
3782                                     m->m_ext.ext_buf);
3783                         } else if (m_free_func == m_16kfree) {
3784                                 mcache_free(m_cache(MC_16KCL),
3785                                     m->m_ext.ext_buf);
3786                         } else {
3787                                 (*m_free_func)(m->m_ext.ext_buf,
3788                                     m->m_ext.ext_size, m_get_ext_arg(m));
3789                         }
3790                         /* Re-use the reference structure */
3791                         rfa = m_get_rfa(m);
3792                 } else if (refcnt == MEXT_MINREF(m) && composite) {
3793                         VERIFY(m->m_type != MT_FREE);
3794
3795                         mtype_stat_dec(m->m_type);
3796                         mtype_stat_inc(MT_FREE);
3797
3798                         m->m_type = MT_FREE;
3799                         m->m_flags = M_EXT;
3800                         m->m_len = 0;
3801                         m->m_next = m->m_nextpkt = NULL;
3802
3803                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3804
3805                         /* "Free" into the intermediate cache */
3806                         if (m_free_func == NULL) {
3807                                 mcache_free(m_cache(MC_MBUF_CL), m);
3808                         } else if (m_free_func == m_bigfree) {
3809                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3810                         } else {
3811                                 VERIFY(m_free_func == m_16kfree);
3812                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3813                         }
3814                         /*
3815                          * Allocate a new mbuf, since we didn't divorce
3816                          * the composite mbuf + cluster pair above.
3817                          */
3818                         if ((m = _M_GETHDR(wait, type)) == NULL)
3819                                 return (NULL);
3820                 }
3821         }
3822
3823         if (rfa == NULL &&
3824             (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3825                 m_free(m);
3826                 return (NULL);
3827         }
3828
3829         if (!pair) {
3830                 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
3831                     0, 1, 0, 0, 0, NULL);
3832         } else {
3833                 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
3834                     1, 1, 1, EXTF_PAIRED, 0, m);
3835         }
3836
3837         return (m);
3838 }
3839
3840 /*
3841  * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3842  * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3843  */
3844 struct mbuf *
3845 m_getcl(int wait, int type, int flags)
3846 {
3847         struct mbuf *m;
3848         int mcflags = MSLEEPF(wait);
3849         int hdr = (flags & M_PKTHDR);
3850
3851         /* Is this due to a non-blocking retry?  If so, then try harder */
3852         if (mcflags & MCR_NOSLEEP)
3853                 mcflags |= MCR_TRYHARD;
3854
3855         m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3856         if (m != NULL) {
3857                 u_int16_t flag;
3858                 struct ext_ref *rfa;
3859                 void *cl;
3860
3861                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3862                 cl = m->m_ext.ext_buf;
3863                 rfa = m_get_rfa(m);
3864
3865                 ASSERT(cl != NULL && rfa != NULL);
3866                 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3867
3868                 flag = MEXT_FLAGS(m);
3869
3870                 MBUF_INIT(m, hdr, type);
3871                 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3872
3873                 mtype_stat_inc(type);
3874                 mtype_stat_dec(MT_FREE);
3875 #if CONFIG_MACF_NET
3876                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3877                         m_freem(m);
3878                         return (NULL);
3879                 }
3880 #endif /* MAC_NET */
3881         }
3882         return (m);
3883 }
3884
3885 /* m_mclget() add an mbuf cluster to a normal mbuf */
3886 struct mbuf *
3887 m_mclget(struct mbuf *m, int wait)
3888 {
3889         struct ext_ref *rfa;
3890
3891         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3892                 return (m);
3893
3894         m->m_ext.ext_buf = m_mclalloc(wait);
3895         if (m->m_ext.ext_buf != NULL) {
3896                 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3897         } else {
3898                 mcache_free(ref_cache, rfa);
3899         }
3900         return (m);
3901 }
3902
3903 /* Allocate an mbuf cluster */
3904 caddr_t
3905 m_mclalloc(int wait)
3906 {
3907         int mcflags = MSLEEPF(wait);
3908
3909         /* Is this due to a non-blocking retry?  If so, then try harder */
3910         if (mcflags & MCR_NOSLEEP)
3911                 mcflags |= MCR_TRYHARD;
3912
3913         return (mcache_alloc(m_cache(MC_CL), mcflags));
3914 }
3915
3916 /* Free an mbuf cluster */
3917 void
3918 m_mclfree(caddr_t p)
3919 {
3920         mcache_free(m_cache(MC_CL), p);
3921 }
3922
3923 /*
3924  * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3925  * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3926  */
3927 int
3928 m_mclhasreference(struct mbuf *m)
3929 {
3930         if (!(m->m_flags & M_EXT))
3931                 return (0);
3932
3933         ASSERT(m_get_rfa(m) != NULL);
3934
3935         return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3936 }
3937
3938 __private_extern__ caddr_t
3939 m_bigalloc(int wait)
3940 {
3941         int mcflags = MSLEEPF(wait);
3942
3943         /* Is this due to a non-blocking retry?  If so, then try harder */
3944         if (mcflags & MCR_NOSLEEP)
3945                 mcflags |= MCR_TRYHARD;
3946
3947         return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3948 }
3949
3950 __private_extern__ void
3951 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3952 {
3953         mcache_free(m_cache(MC_BIGCL), p);
3954 }
3955
3956 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3957 __private_extern__ struct mbuf *
3958 m_mbigget(struct mbuf *m, int wait)
3959 {
3960         struct ext_ref *rfa;
3961
3962         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3963                 return (m);
3964
3965         m->m_ext.ext_buf =  m_bigalloc(wait);
3966         if (m->m_ext.ext_buf != NULL) {
3967                 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3968         } else {
3969                 mcache_free(ref_cache, rfa);
3970         }
3971         return (m);
3972 }
3973
3974 __private_extern__ caddr_t
3975 m_16kalloc(int wait)
3976 {
3977         int mcflags = MSLEEPF(wait);
3978
3979         /* Is this due to a non-blocking retry?  If so, then try harder */
3980         if (mcflags & MCR_NOSLEEP)
3981                 mcflags |= MCR_TRYHARD;
3982
3983         return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3984 }
3985
3986 __private_extern__ void
3987 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3988 {
3989         mcache_free(m_cache(MC_16KCL), p);
3990 }
3991
3992 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3993 __private_extern__ struct mbuf *
3994 m_m16kget(struct mbuf *m, int wait)
3995 {
3996         struct ext_ref *rfa;
3997
3998         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3999                 return (m);
4000
4001         m->m_ext.ext_buf =  m_16kalloc(wait);
4002         if (m->m_ext.ext_buf != NULL) {
4003                 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4004         } else {
4005                 mcache_free(ref_cache, rfa);
4006         }
4007         return (m);
4008 }
4009
4010 /*
4011  * "Move" mbuf pkthdr from "from" to "to".
4012  * "from" must have M_PKTHDR set, and "to" must be empty.
4013  */
4014 void
4015 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
4016 {
4017         VERIFY(from->m_flags & M_PKTHDR);
4018
4019         /* Check for scratch area overflow */
4020         m_redzone_verify(from);
4021
4022         if (to->m_flags & M_PKTHDR) {
4023                 /* Check for scratch area overflow */
4024                 m_redzone_verify(to);
4025                 /* We will be taking over the tags of 'to' */
4026                 m_tag_delete_chain(to, NULL);
4027         }
4028         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
4029         m_classifier_init(from, 0);             /* purge classifier info */
4030         m_tag_init(from, 1);                    /* purge all tags from src */
4031         m_scratch_init(from);                   /* clear src scratch area */
4032         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4033         if ((to->m_flags & M_EXT) == 0)
4034                 to->m_data = to->m_pktdat;
4035         m_redzone_init(to);                     /* setup red zone on dst */
4036 }
4037
4038 /*
4039  * Duplicate "from"'s mbuf pkthdr in "to".
4040  * "from" must have M_PKTHDR set, and "to" must be empty.
4041  * In particular, this does a deep copy of the packet tags.
4042  */
4043 static int
4044 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
4045 {
4046         VERIFY(from->m_flags & M_PKTHDR);
4047
4048         /* Check for scratch area overflow */
4049         m_redzone_verify(from);
4050
4051         if (to->m_flags & M_PKTHDR) {
4052                 /* Check for scratch area overflow */
4053                 m_redzone_verify(to);
4054                 /* We will be taking over the tags of 'to' */
4055                 m_tag_delete_chain(to, NULL);
4056         }
4057         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4058         if ((to->m_flags & M_EXT) == 0)
4059                 to->m_data = to->m_pktdat;
4060         to->m_pkthdr = from->m_pkthdr;
4061         m_redzone_init(to);                     /* setup red zone on dst */
4062         m_tag_init(to, 0);                      /* preserve dst static tags */
4063         return (m_tag_copy_chain(to, from, how));
4064 }
4065
4066 void
4067 m_copy_pftag(struct mbuf *to, struct mbuf *from)
4068 {
4069         memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4070 #if PF_ECN
4071         m_pftag(to)->pftag_hdr = NULL;
4072         m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
4073 #endif /* PF_ECN */
4074 }
4075
4076 void
4077 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4078 {
4079         VERIFY(m->m_flags & M_PKTHDR);
4080
4081         m->m_pkthdr.pkt_proto = 0;
4082         m->m_pkthdr.pkt_flowsrc = 0;
4083         m->m_pkthdr.pkt_flowid = 0;
4084         m->m_pkthdr.pkt_flags &= pktf_mask;     /* caller-defined mask */
4085         /* preserve service class and interface info for loopback packets */
4086         if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
4087                 (void) m_set_service_class(m, MBUF_SC_BE);
4088         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4089                 m->m_pkthdr.pkt_ifainfo = 0;
4090         /*
4091          * Preserve timestamp if requested
4092          */
4093         if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID))
4094                 m->m_pkthdr.pkt_timestamp = 0;
4095 }
4096
4097 void
4098 m_copy_classifier(struct mbuf *to, struct mbuf *from)
4099 {
4100         VERIFY(to->m_flags & M_PKTHDR);
4101         VERIFY(from->m_flags & M_PKTHDR);
4102
4103         to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
4104         to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
4105         to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
4106         to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
4107         (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
4108         to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
4109 }
4110
4111 /*
4112  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4113  * if wantall is not set, return whatever number were available.  Set up the
4114  * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4115  * are chained on the m_nextpkt field.  Any packets requested beyond this
4116  * are chained onto the last packet header's m_next field.  The size of
4117  * the cluster is controlled by the parameter bufsize.
4118  */
4119 __private_extern__ struct mbuf *
4120 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
4121     int wait, int wantall, size_t bufsize)
4122 {
4123         struct mbuf *m;
4124         struct mbuf **np, *top;
4125         unsigned int pnum, needed = *num_needed;
4126         mcache_obj_t *mp_list = NULL;
4127         int mcflags = MSLEEPF(wait);
4128         u_int16_t flag;
4129         struct ext_ref *rfa;
4130         mcache_t *cp;
4131         void *cl;
4132
4133         ASSERT(bufsize == m_maxsize(MC_CL) ||
4134             bufsize == m_maxsize(MC_BIGCL) ||
4135             bufsize == m_maxsize(MC_16KCL));
4136
4137         /*
4138          * Caller must first check for njcl because this
4139          * routine is internal and not exposed/used via KPI.
4140          */
4141         VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
4142
4143         top = NULL;
4144         np = &top;
4145         pnum = 0;
4146
4147         /*
4148          * The caller doesn't want all the requested buffers; only some.
4149          * Try hard to get what we can, but don't block.  This effectively
4150          * overrides MCR_SLEEP, since this thread will not go to sleep
4151          * if we can't get all the buffers.
4152          */
4153         if (!wantall || (mcflags & MCR_NOSLEEP))
4154                 mcflags |= MCR_TRYHARD;
4155
4156         /* Allocate the composite mbuf + cluster elements from the cache */
4157         if (bufsize == m_maxsize(MC_CL))
4158                 cp = m_cache(MC_MBUF_CL);
4159         else if (bufsize == m_maxsize(MC_BIGCL))
4160                 cp = m_cache(MC_MBUF_BIGCL);
4161         else
4162                 cp = m_cache(MC_MBUF_16KCL);
4163         needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4164
4165         for (pnum = 0; pnum < needed; pnum++) {
4166                 m = (struct mbuf *)mp_list;
4167                 mp_list = mp_list->obj_next;
4168
4169                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4170                 cl = m->m_ext.ext_buf;
4171                 rfa = m_get_rfa(m);
4172
4173                 ASSERT(cl != NULL && rfa != NULL);
4174                 VERIFY(MBUF_IS_COMPOSITE(m));
4175
4176                 flag = MEXT_FLAGS(m);
4177
4178                 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4179                 if (bufsize == m_maxsize(MC_16KCL)) {
4180                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4181                 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4182                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4183                 } else {
4184                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4185                 }
4186
4187                 if (num_with_pkthdrs > 0) {
4188                         --num_with_pkthdrs;
4189 #if CONFIG_MACF_NET
4190                         if (mac_mbuf_label_init(m, wait) != 0) {
4191                                 m_freem(m);
4192                                 break;
4193                         }
4194 #endif /* MAC_NET */
4195                 }
4196
4197                 *np = m;
4198                 if (num_with_pkthdrs > 0)
4199                         np = &m->m_nextpkt;
4200                 else
4201                         np = &m->m_next;
4202         }
4203         ASSERT(pnum != *num_needed || mp_list == NULL);
4204         if (mp_list != NULL)
4205                 mcache_free_ext(cp, mp_list);
4206
4207         if (pnum > 0) {
4208                 mtype_stat_add(MT_DATA, pnum);
4209                 mtype_stat_sub(MT_FREE, pnum);
4210         }
4211
4212         if (wantall && (pnum != *num_needed)) {
4213                 if (top != NULL)
4214                         m_freem_list(top);
4215                 return (NULL);
4216         }
4217
4218         if (pnum > *num_needed) {
4219                 printf("%s: File a radar related to <rdar://10146739>. \
4220                         needed = %u, pnum = %u, num_needed = %u \n",
4221                         __func__, needed, pnum, *num_needed);
4222         }
4223
4224         *num_needed = pnum;
4225         return (top);
4226 }
4227
/*
 * Return a list of mbufs linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not null and the
 * value pointed to is not zero, this specifies the maximum number of
 * segments for a chain of mbufs.  If maxsegments is null or the value
 * pointed to is zero, the caller does not have any restriction on the
 * number of segments.  The actual number of segments of an mbuf chain is
 * returned in the value pointed to by maxsegments.
 */
4240 __private_extern__ struct mbuf *
4241 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4242     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
4243 {
4244         struct mbuf **np, *top, *first = NULL;
4245         size_t bufsize, r_bufsize;
4246         unsigned int num = 0;
4247         unsigned int nsegs = 0;
4248         unsigned int needed, resid;
4249         int mcflags = MSLEEPF(wait);
4250         mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
4251         mcache_t *cp = NULL, *rcp = NULL;
4252
4253         if (*numlist == 0)
4254                 return (NULL);
4255
4256         top = NULL;
4257         np = &top;
4258
4259         if (wantsize == 0) {
4260                 if (packetlen <= MINCLSIZE) {
4261                         bufsize = packetlen;
4262                 } else if (packetlen > m_maxsize(MC_CL)) {
4263                         /* Use 4KB if jumbo cluster pool isn't available */
4264                         if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
4265                                 bufsize = m_maxsize(MC_BIGCL);
4266                         else
4267                                 bufsize = m_maxsize(MC_16KCL);
4268                 } else {
4269                         bufsize = m_maxsize(MC_CL);
4270                 }
4271         } else if (wantsize == m_maxsize(MC_CL) ||
4272             wantsize == m_maxsize(MC_BIGCL) ||
4273             (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
4274                 bufsize = wantsize;
4275         } else {
4276                 return (NULL);
4277         }
4278
4279         if (bufsize <= MHLEN) {
4280                 nsegs = 1;
4281         } else if (bufsize <= MINCLSIZE) {
4282                 if (maxsegments != NULL && *maxsegments == 1) {
4283                         bufsize = m_maxsize(MC_CL);
4284                         nsegs = 1;
4285                 } else {
4286                         nsegs = 2;
4287                 }
4288         } else if (bufsize == m_maxsize(MC_16KCL)) {
4289                 VERIFY(njcl > 0);
4290                 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4291         } else if (bufsize == m_maxsize(MC_BIGCL)) {
4292                 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4293         } else {
4294                 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4295         }
4296         if (maxsegments != NULL) {
4297                 if (*maxsegments && nsegs > *maxsegments) {
4298                         *maxsegments = nsegs;
4299                         return (NULL);
4300                 }
4301                 *maxsegments = nsegs;
4302         }
4303
4304         /*
4305          * The caller doesn't want all the requested buffers; only some.
4306          * Try hard to get what we can, but don't block.  This effectively
4307          * overrides MCR_SLEEP, since this thread will not go to sleep
4308          * if we can't get all the buffers.
4309          */
4310         if (!wantall || (mcflags & MCR_NOSLEEP))
4311                 mcflags |= MCR_TRYHARD;
4312
4313         /*
4314          * Simple case where all elements in the lists/chains are mbufs.
4315          * Unless bufsize is greater than MHLEN, each segment chain is made
4316          * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
4317          * of 2 mbufs; the second one is used for the residual data, i.e.
4318          * the remaining data that cannot fit into the first mbuf.
4319          */
4320         if (bufsize <= MINCLSIZE) {
4321                 /* Allocate the elements in one shot from the mbuf cache */
4322                 ASSERT(bufsize <= MHLEN || nsegs == 2);
4323                 cp = m_cache(MC_MBUF);
4324                 needed = mcache_alloc_ext(cp, &mp_list,
4325                     (*numlist) * nsegs, mcflags);
4326
4327                 /*
4328                  * The number of elements must be even if we are to use an
4329                  * mbuf (instead of a cluster) to store the residual data.
4330                  * If we couldn't allocate the requested number of mbufs,
4331                  * trim the number down (if it's odd) in order to avoid
4332                  * creating a partial segment chain.
4333                  */
4334                 if (bufsize > MHLEN && (needed & 0x1))
4335                         needed--;
4336
4337                 while (num < needed) {
4338                         struct mbuf *m;
4339
4340                         m = (struct mbuf *)mp_list;
4341                         mp_list = mp_list->obj_next;
4342                         ASSERT(m != NULL);
4343
4344                         MBUF_INIT(m, 1, MT_DATA);
4345 #if CONFIG_MACF_NET
4346                         if (mac_init_mbuf(m, wait) != 0) {
4347                                 m_free(m);
4348                                 break;
4349                         }
4350 #endif /* MAC_NET */
4351                         num++;
4352                         if (bufsize > MHLEN) {
4353                                 /* A second mbuf for this segment chain */
4354                                 m->m_next = (struct mbuf *)mp_list;
4355                                 mp_list = mp_list->obj_next;
4356                                 ASSERT(m->m_next != NULL);
4357
4358                                 MBUF_INIT(m->m_next, 0, MT_DATA);
4359                                 num++;
4360                         }
4361                         *np = m;
4362                         np = &m->m_nextpkt;
4363                 }
4364                 ASSERT(num != *numlist || mp_list == NULL);
4365
4366                 if (num > 0) {
4367                         mtype_stat_add(MT_DATA, num);
4368                         mtype_stat_sub(MT_FREE, num);
4369                 }
4370                 num /= nsegs;
4371
4372                 /* We've got them all; return to caller */
4373                 if (num == *numlist)
4374                         return (top);
4375
4376                 goto fail;
4377         }
4378
4379         /*
4380          * Complex cases where elements are made up of one or more composite
4381          * mbufs + cluster, depending on packetlen.  Each N-segment chain can
4382          * be illustrated as follows:
4383          *
4384          * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4385          *
4386          * Every composite mbuf + cluster element comes from the intermediate
4387          * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
4388          * the last composite element will come from the MC_MBUF_CL cache,
4389          * unless the residual data is larger than 2KB where we use the
4390          * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
4391          * data is defined as extra data beyond the first element that cannot
4392          * fit into the previous element, i.e. there is no residual data if
4393          * the chain only has 1 segment.
4394          */
4395         r_bufsize = bufsize;
4396         resid = packetlen > bufsize ? packetlen % bufsize : 0;
4397         if (resid > 0) {
4398                 /* There is residual data; figure out the cluster size */
4399                 if (wantsize == 0 && packetlen > MINCLSIZE) {
4400                         /*
4401                          * Caller didn't request that all of the segments
4402                          * in the chain use the same cluster size; use the
4403                          * smaller of the cluster sizes.
4404                          */
4405                         if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4406                                 r_bufsize = m_maxsize(MC_16KCL);
4407                         else if (resid > m_maxsize(MC_CL))
4408                                 r_bufsize = m_maxsize(MC_BIGCL);
4409                         else
4410                                 r_bufsize = m_maxsize(MC_CL);
4411                 } else {
4412                         /* Use the same cluster size as the other segments */
4413                         resid = 0;
4414                 }
4415         }
4416
4417         needed = *numlist;
4418         if (resid > 0) {
4419                 /*
4420                  * Attempt to allocate composite mbuf + cluster elements for
4421                  * the residual data in each chain; record the number of such
4422                  * elements that can be allocated so that we know how many
4423                  * segment chains we can afford to create.
4424                  */
4425                 if (r_bufsize <= m_maxsize(MC_CL))
4426                         rcp = m_cache(MC_MBUF_CL);
4427                 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4428                         rcp = m_cache(MC_MBUF_BIGCL);
4429                 else
4430                         rcp = m_cache(MC_MBUF_16KCL);
4431                 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4432
4433                 if (needed == 0)
4434                         goto fail;
4435
4436                 /* This is temporarily reduced for calculation */
4437                 ASSERT(nsegs > 1);
4438                 nsegs--;
4439         }
4440
4441         /*
4442          * Attempt to allocate the rest of the composite mbuf + cluster
4443          * elements for the number of segment chains that we need.
4444          */
4445         if (bufsize <= m_maxsize(MC_CL))
4446                 cp = m_cache(MC_MBUF_CL);
4447         else if (bufsize <= m_maxsize(MC_BIGCL))
4448                 cp = m_cache(MC_MBUF_BIGCL);
4449         else
4450                 cp = m_cache(MC_MBUF_16KCL);
4451         needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4452
4453         /* Round it down to avoid creating a partial segment chain */
4454         needed = (needed / nsegs) * nsegs;
4455         if (needed == 0)
4456                 goto fail;
4457
4458         if (resid > 0) {
4459                 /*
4460                  * We're about to construct the chain(s); take into account
4461                  * the number of segments we have created above to hold the
4462                  * residual data for each chain, as well as restore the
4463                  * original count of segments per chain.
4464                  */
4465                 ASSERT(nsegs > 0);
4466                 needed += needed / nsegs;
4467                 nsegs++;
4468         }
4469
4470         for (;;) {
4471                 struct mbuf *m;
4472                 u_int16_t flag;
4473                 struct ext_ref *rfa;
4474                 void *cl;
4475                 int pkthdr;
4476                 m_ext_free_func_t m_free_func;
4477
4478                 ++num;
4479                 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4480                         m = (struct mbuf *)mp_list;
4481                         mp_list = mp_list->obj_next;
4482                 } else {
4483                         m = (struct mbuf *)rmp_list;
4484                         rmp_list = rmp_list->obj_next;
4485                 }
4486                 m_free_func = m_get_ext_free(m);
4487                 ASSERT(m != NULL);
4488                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4489                 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
4490                     m_free_func == m_16kfree);
4491
4492                 cl = m->m_ext.ext_buf;
4493                 rfa = m_get_rfa(m);
4494
4495                 ASSERT(cl != NULL && rfa != NULL);
4496                 VERIFY(MBUF_IS_COMPOSITE(m));
4497
4498                 flag = MEXT_FLAGS(m);
4499
4500                 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4501                 if (pkthdr)
4502                         first = m;
4503                 MBUF_INIT(m, pkthdr, MT_DATA);
4504                 if (m_free_func == m_16kfree) {
4505                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4506                 } else if (m_free_func == m_bigfree) {
4507                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4508                 } else {
4509                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4510                 }
4511 #if CONFIG_MACF_NET
4512                 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4513                         --num;
4514                         m_freem(m);
4515                         break;
4516                 }
4517 #endif /* MAC_NET */
4518
4519                 *np = m;
4520                 if ((num % nsegs) == 0)
4521                         np = &first->m_nextpkt;
4522                 else
4523                         np = &m->m_next;
4524
4525                 if (num == needed)
4526                         break;
4527         }
4528
4529         if (num > 0) {
4530                 mtype_stat_add(MT_DATA, num);
4531                 mtype_stat_sub(MT_FREE, num);
4532         }
4533
4534         num /= nsegs;
4535
4536         /* We've got them all; return to caller */
4537         if (num == *numlist) {
4538                 ASSERT(mp_list == NULL && rmp_list == NULL);
4539                 return (top);
4540         }
4541
4542 fail:
4543         /* Free up what's left of the above */
4544         if (mp_list != NULL)
4545                 mcache_free_ext(cp, mp_list);
4546         if (rmp_list != NULL)
4547                 mcache_free_ext(rcp, rmp_list);
4548         if (wantall && top != NULL) {
4549                 m_freem(top);
4550                 return (NULL);
4551         }
4552         *numlist = num;
4553         return (top);
4554 }
4555
4556 /*
4557  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4558  * packets on receive ring.
4559  */
4560 __private_extern__ struct mbuf *
4561 m_getpacket_how(int wait)
4562 {
4563         unsigned int num_needed = 1;
4564
4565         return (m_getpackets_internal(&num_needed, 1, wait, 1,
4566             m_maxsize(MC_CL)));
4567 }
4568
4569 /*
4570  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4571  * packets on receive ring.
4572  */
4573 struct mbuf *
4574 m_getpacket(void)
4575 {
4576         unsigned int num_needed = 1;
4577
4578         return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4579             m_maxsize(MC_CL)));
4580 }
4581
4582 /*
4583  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4584  * if this can't be met, return whatever number were available.  Set up the
4585  * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4586  * are chained on the m_nextpkt field.  Any packets requested beyond this are
4587  * chained onto the last packet header's m_next field.
4588  */
4589 struct mbuf *
4590 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4591 {
4592         unsigned int n = num_needed;
4593
4594         return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4595             m_maxsize(MC_CL)));
4596 }
4597
4598 /*
4599  * Return a list of mbuf hdrs set up as packet hdrs chained together
4600  * on the m_nextpkt field
4601  */
4602 struct mbuf *
4603 m_getpackethdrs(int num_needed, int how)
4604 {
4605         struct mbuf *m;
4606         struct mbuf **np, *top;
4607
4608         top = NULL;
4609         np = &top;
4610
4611         while (num_needed--) {
4612                 m = _M_RETRYHDR(how, MT_DATA);
4613                 if (m == NULL)
4614                         break;
4615
4616                 *np = m;
4617                 np = &m->m_nextpkt;
4618         }
4619
4620         return (top);
4621 }
4622
4623 /*
4624  * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
4625  * for mbufs packets freed.  Used by the drivers.
4626  */
4627 int
4628 m_freem_list(struct mbuf *m)
4629 {
4630         struct mbuf *nextpkt;
4631         mcache_obj_t *mp_list = NULL;
4632         mcache_obj_t *mcl_list = NULL;
4633         mcache_obj_t *mbc_list = NULL;
4634         mcache_obj_t *m16k_list = NULL;
4635         mcache_obj_t *m_mcl_list = NULL;
4636         mcache_obj_t *m_mbc_list = NULL;
4637         mcache_obj_t *m_m16k_list = NULL;
4638         mcache_obj_t *ref_list = NULL;
4639         int pktcount = 0;
4640         int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4641
4642         while (m != NULL) {
4643                 pktcount++;
4644
4645                 nextpkt = m->m_nextpkt;
4646                 m->m_nextpkt = NULL;
4647
4648                 while (m != NULL) {
4649                         struct mbuf *next = m->m_next;
4650                         mcache_obj_t *o, *rfa;
4651                         u_int32_t composite;
4652                         u_int16_t refcnt;
4653                         m_ext_free_func_t m_free_func;
4654
4655                         if (m->m_type == MT_FREE)
4656                                 panic("m_free: freeing an already freed mbuf");
4657
4658                         if (m->m_flags & M_PKTHDR) {
4659                                 /* Check for scratch area overflow */
4660                                 m_redzone_verify(m);
4661                                 /* Free the aux data and tags if there is any */
4662                                 m_tag_delete_chain(m, NULL);
4663                         }
4664
4665                         if (!(m->m_flags & M_EXT)) {
4666                                 mt_free++;
4667                                 goto simple_free;
4668                         }
4669
4670                         if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4671                                 m = next;
4672                                 continue;
4673                         }
4674
4675                         mt_free++;
4676
4677                         o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4678                         refcnt = m_decref(m);
4679                         composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4680                         m_free_func = m_get_ext_free(m);
4681                         if (refcnt == MEXT_MINREF(m) && !composite) {
4682                                 if (m_free_func == NULL) {
4683                                         o->obj_next = mcl_list;
4684                                         mcl_list = o;
4685                                 } else if (m_free_func == m_bigfree) {
4686                                         o->obj_next = mbc_list;
4687                                         mbc_list = o;
4688                                 } else if (m_free_func == m_16kfree) {
4689                                         o->obj_next = m16k_list;
4690                                         m16k_list = o;
4691                                 } else {
4692                                         (*(m_free_func))((caddr_t)o,
4693                                             m->m_ext.ext_size,
4694                                             m_get_ext_arg(m));
4695                                 }
4696                                 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
4697                                 rfa->obj_next = ref_list;
4698                                 ref_list = rfa;
4699                                 m_set_ext(m, NULL, NULL, NULL);
4700                         } else if (refcnt == MEXT_MINREF(m) && composite) {
4701                                 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4702                                 VERIFY(m->m_type != MT_FREE);
4703                                 /*
4704                                  * Amortize the costs of atomic operations
4705                                  * by doing them at the end, if possible.
4706                                  */
4707                                 if (m->m_type == MT_DATA)
4708                                         mt_data++;
4709                                 else if (m->m_type == MT_HEADER)
4710                                         mt_header++;
4711                                 else if (m->m_type == MT_SONAME)
4712                                         mt_soname++;
4713                                 else if (m->m_type == MT_TAG)
4714                                         mt_tag++;
4715                                 else
4716                                         mtype_stat_dec(m->m_type);
4717
4718                                 m->m_type = MT_FREE;
4719                                 m->m_flags = M_EXT;
4720                                 m->m_len = 0;
4721                                 m->m_next = m->m_nextpkt = NULL;
4722
4723                                 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4724
4725                                 /* "Free" into the intermediate cache */
4726                                 o = (mcache_obj_t *)m;
4727                                 if (m_free_func == NULL) {
4728                                         o->obj_next = m_mcl_list;
4729                                         m_mcl_list = o;
4730                                 } else if (m_free_func == m_bigfree) {
4731                                         o->obj_next = m_mbc_list;
4732                                         m_mbc_list = o;
4733                                 } else {
4734                                         VERIFY(m_free_func == m_16kfree);
4735                                         o->obj_next = m_m16k_list;
4736                                         m_m16k_list = o;
4737                                 }
4738                                 m = next;
4739                                 continue;
4740                         }
4741 simple_free:
4742                         /*
4743                          * Amortize the costs of atomic operations
4744                          * by doing them at the end, if possible.
4745                          */
4746                         if (m->m_type == MT_DATA)
4747                                 mt_data++;
4748                         else if (m->m_type == MT_HEADER)
4749                                 mt_header++;
4750                         else if (m->m_type == MT_SONAME)
4751                                 mt_soname++;
4752                         else if (m->m_type == MT_TAG)
4753                                 mt_tag++;
4754                         else if (m->m_type != MT_FREE)
4755                                 mtype_stat_dec(m->m_type);
4756
4757                         m->m_type = MT_FREE;
4758                         m->m_flags = m->m_len = 0;
4759                         m->m_next = m->m_nextpkt = NULL;
4760
4761                         ((mcache_obj_t *)m)->obj_next = mp_list;
4762                         mp_list = (mcache_obj_t *)m;
4763
4764                         m = next;
4765                 }
4766
4767                 m = nextpkt;
4768         }
4769
4770         if (mt_free > 0)
4771                 mtype_stat_add(MT_FREE, mt_free);
4772         if (mt_data > 0)
4773                 mtype_stat_sub(MT_DATA, mt_data);
4774         if (mt_header > 0)
4775                 mtype_stat_sub(MT_HEADER, mt_header);
4776         if (mt_soname > 0)
4777                 mtype_stat_sub(MT_SONAME, mt_soname);
4778         if (mt_tag > 0)
4779                 mtype_stat_sub(MT_TAG, mt_tag);
4780
4781         if (mp_list != NULL)
4782                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4783         if (mcl_list != NULL)
4784                 mcache_free_ext(m_cache(MC_CL), mcl_list);
4785         if (mbc_list != NULL)
4786                 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4787         if (m16k_list != NULL)
4788                 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4789         if (m_mcl_list != NULL)
4790                 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4791         if (m_mbc_list != NULL)
4792                 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4793         if (m_m16k_list != NULL)
4794                 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4795         if (ref_list != NULL)
4796                 mcache_free_ext(ref_cache, ref_list);
4797
4798         return (pktcount);
4799 }
4800
/*
 * Free a chain of mbufs.  m_free() is called on each mbuf in turn and the
 * pointer it returns is used as the next mbuf to free; the walk stops
 * when that pointer is NULL.  A NULL argument is a no-op.
 */
void
m_freem(struct mbuf *m)
{
        struct mbuf *cur;

        for (cur = m; cur != NULL; cur = m_free(cur))
                continue;
}
4807
4808 /*
4809  * Mbuffer utility routines.
4810  */
4811
4812 /*
4813  * Compute the amount of space available before the current start
4814  * of data in an mbuf.
4815  */
4816 int
4817 m_leadingspace(struct mbuf *m)
4818 {
4819         if (m->m_flags & M_EXT) {
4820                 if (MCLHASREFERENCE(m))
4821                         return (0);
4822                 return (m->m_data - m->m_ext.ext_buf);
4823         }
4824         if (m->m_flags & M_PKTHDR)
4825                 return (m->m_data - m->m_pktdat);
4826         return (m->m_data - m->m_dat);
4827 }
4828
4829 /*
4830  * Compute the amount of space available after the end of data in an mbuf.
4831  */
4832 int
4833 m_trailingspace(struct mbuf *m)
4834 {
4835         if (m->m_flags & M_EXT) {
4836                 if (MCLHASREFERENCE(m))
4837                         return (0);
4838                 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4839                     (m->m_data + m->m_len));
4840         }
4841         return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4842 }
4843
4844 /*
4845  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4846  * copy junk along.  Does not adjust packet header length.
4847  */
4848 struct mbuf *
4849 m_prepend(struct mbuf *m, int len, int how)
4850 {
4851         struct mbuf *mn;
4852
4853         _MGET(mn, how, m->m_type);
4854         if (mn == NULL) {
4855                 m_freem(m);
4856                 return (NULL);
4857         }
4858         if (m->m_flags & M_PKTHDR) {
4859                 M_COPY_PKTHDR(mn, m);
4860                 m->m_flags &= ~M_PKTHDR;
4861         }
4862         mn->m_next = m;
4863         m = mn;
4864         if (m->m_flags & M_PKTHDR) {
4865                 VERIFY(len <= MHLEN);
4866                 MH_ALIGN(m, len);
4867         } else {
4868                 VERIFY(len <= MLEN);
4869                 M_ALIGN(m, len);
4870         }
4871         m->m_len = len;
4872         return (m);
4873 }
4874
4875 /*
4876  * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4877  * chain, copy junk along, and adjust length.
4878  */
4879 struct mbuf *
4880 m_prepend_2(struct mbuf *m, int len, int how, int align)
4881 {
4882         if (M_LEADINGSPACE(m) >= len &&
4883             (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
4884                 m->m_data -= len;
4885                 m->m_len += len;
4886         } else {
4887                 m = m_prepend(m, len, how);
4888         }
4889         if ((m) && (m->m_flags & M_PKTHDR))
4890                 m->m_pkthdr.len += len;
4891         return (m);
4892 }
4893
4894 /*
4895  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4896  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
4897  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4898  */
4899 int MCFail;
4900
4901 struct mbuf *
4902 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4903 {
4904         struct mbuf *n, *mhdr = NULL, **np;
4905         int off = off0;
4906         struct mbuf *top;
4907         int copyhdr = 0;
4908
4909         if (off < 0 || len < 0)
4910                 panic("m_copym: invalid offset %d or len %d", off, len);
4911
4912         VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4913             mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4914
4915         if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4916             mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4917                 mhdr = m;
4918                 copyhdr = 1;
4919         }
4920
4921         while (off >= m->m_len) {
4922                 if (m->m_next == NULL)
4923                         panic("m_copym: invalid mbuf chain");
4924                 off -= m->m_len;
4925                 m = m->m_next;
4926         }
4927         np = &top;
4928         top = NULL;
4929
4930         while (len > 0) {
4931                 if (m == NULL) {
4932                         if (len != M_COPYALL)
4933                                 panic("m_copym: len != M_COPYALL");
4934                         break;
4935                 }
4936
4937                 if (copyhdr)
4938                         n = _M_RETRYHDR(wait, m->m_type);
4939                 else
4940                         n = _M_RETRY(wait, m->m_type);
4941                 *np = n;
4942
4943                 if (n == NULL)
4944                         goto nospace;
4945
4946                 if (copyhdr != 0) {
4947                         if ((mode == M_COPYM_MOVE_HDR) ||
4948                             (mode == M_COPYM_MUST_MOVE_HDR)) {
4949                                 M_COPY_PKTHDR(n, mhdr);
4950                         } else if ((mode == M_COPYM_COPY_HDR) ||
4951                             (mode == M_COPYM_MUST_COPY_HDR)) {
4952                                 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4953                                         goto nospace;
4954                         }
4955                         if (len == M_COPYALL)
4956                                 n->m_pkthdr.len -= off0;
4957                         else
4958                                 n->m_pkthdr.len = len;
4959                         copyhdr = 0;
4960                         /*
4961                          * There is data to copy from the packet header mbuf
4962                          * if it is empty or it is before the starting offset
4963                          */
4964                         if (mhdr != m) {
4965                                 np = &n->m_next;
4966                                 continue;
4967                         }
4968                 }
4969                 n->m_len = MIN(len, (m->m_len - off));
4970                 if (m->m_flags & M_EXT) {
4971                         n->m_ext = m->m_ext;
4972                         m_incref(m);
4973                         n->m_data = m->m_data + off;
4974                         n->m_flags |= M_EXT;
4975                 } else {
4976                         /*
4977                          * Limit to the capacity of the destination
4978                          */
4979                         if (n->m_flags & M_PKTHDR)
4980                                 n->m_len = MIN(n->m_len, MHLEN);
4981                         else
4982                                 n->m_len = MIN(n->m_len, MLEN);
4983
4984                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4985                                 panic("%s n %p copy overflow",
4986                                         __func__, n);
4987
4988                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4989                             (unsigned)n->m_len);
4990                 }
4991                 if (len != M_COPYALL)
4992                         len -= n->m_len;
4993                 off = 0;
4994                 m = m->m_next;
4995                 np = &n->m_next;
4996         }
4997
4998         if (top == NULL)
4999                 MCFail++;
5000
5001         return (top);
5002 nospace:
5003
5004         m_freem(top);
5005         MCFail++;
5006         return (NULL);
5007 }
5008
5009
5010 struct mbuf *
5011 m_copym(struct mbuf *m, int off0, int len, int wait)
5012 {
5013         return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
5014 }
5015
5016 /*
5017  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
5018  * within this routine also, the last mbuf and offset accessed are passed
5019  * out and can be passed back in to avoid having to rescan the entire mbuf
5020  * list (normally hung off of the socket)
5021  */
5022 struct mbuf *
5023 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
5024     struct mbuf **m_lastm, int *m_off, uint32_t mode)
5025 {
5026         struct mbuf *m = m0, *n, **np = NULL;
5027         int off = off0, len = len0;
5028         struct mbuf *top = NULL;
5029         int mcflags = MSLEEPF(wait);
5030         int copyhdr = 0;
5031         int type = 0;
5032         mcache_obj_t *list = NULL;
5033         int needed = 0;
5034
5035         if (off == 0 && (m->m_flags & M_PKTHDR))
5036                 copyhdr = 1;
5037
5038         if (m_lastm != NULL && *m_lastm != NULL) {
5039                 m = *m_lastm;
5040                 off = *m_off;
5041         } else {
5042                 while (off >= m->m_len) {
5043                         off -= m->m_len;
5044                         m = m->m_next;
5045                 }
5046         }
5047
5048         n = m;
5049         while (len > 0) {
5050                 needed++;
5051                 ASSERT(n != NULL);
5052                 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
5053                 n = n->m_next;
5054         }
5055         needed++;
5056         len = len0;
5057
5058         /*
5059          * If the caller doesn't want to be put to sleep, mark it with
5060          * MCR_TRYHARD so that we may reclaim buffers from other places
5061          * before giving up.
5062          */
5063         if (mcflags & MCR_NOSLEEP)
5064                 mcflags |= MCR_TRYHARD;
5065
5066         if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
5067             mcflags) != needed)
5068                 goto nospace;
5069
5070         needed = 0;
5071         while (len > 0) {
5072                 n = (struct mbuf *)list;
5073                 list = list->obj_next;
5074                 ASSERT(n != NULL && m != NULL);
5075
5076                 type = (top == NULL) ? MT_HEADER : m->m_type;
5077                 MBUF_INIT(n, (top == NULL), type);
5078 #if CONFIG_MACF_NET
5079                 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
5080                         mtype_stat_inc(MT_HEADER);
5081                         mtype_stat_dec(MT_FREE);
5082                         m_free(n);
5083                         goto nospace;
5084                 }
5085 #endif /* MAC_NET */
5086
5087                 if (top == NULL) {
5088                         top = n;
5089                         np = &top->m_next;
5090                         continue;
5091                 } else {
5092                         needed++;
5093                         *np = n;
5094                 }
5095
5096                 if (copyhdr) {
5097                         if ((mode == M_COPYM_MOVE_HDR) ||
5098                             (mode == M_COPYM_MUST_MOVE_HDR)) {
5099                                 M_COPY_PKTHDR(n, m);
5100                         } else if ((mode == M_COPYM_COPY_HDR) ||
5101                             (mode == M_COPYM_MUST_COPY_HDR)) {
5102                                 if (m_dup_pkthdr(n, m, wait) == 0)
5103                                         goto nospace;
5104                         }
5105                         n->m_pkthdr.len = len;
5106                         copyhdr = 0;
5107                 }
5108                 n->m_len = MIN(len, (m->m_len - off));
5109
5110                 if (m->m_flags & M_EXT) {
5111                         n->m_ext = m->m_ext;
5112                         m_incref(m);
5113                         n->m_data = m->m_data + off;
5114                         n->m_flags |= M_EXT;
5115                 } else {
5116                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
5117                                 panic("%s n %p copy overflow",
5118                                         __func__, n);
5119
5120                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5121                             (unsigned)n->m_len);
5122                 }
5123                 len -= n->m_len;
5124
5125                 if (len == 0) {
5126                         if (m_lastm != NULL && m_off != NULL) {
5127                                 if ((off + n->m_len) == m->m_len) {
5128                                         *m_lastm = m->m_next;
5129                                         *m_off  = 0;
5130                                 } else {
5131                                         *m_lastm = m;
5132                                         *m_off  = off + n->m_len;
5133                                 }
5134                         }
5135                         break;
5136                 }
5137                 off = 0;
5138                 m = m->m_next;
5139                 np = &n->m_next;
5140         }
5141
5142         mtype_stat_inc(MT_HEADER);
5143         mtype_stat_add(type, needed);
5144         mtype_stat_sub(MT_FREE, needed + 1);
5145
5146         ASSERT(list == NULL);
5147         return (top);
5148
5149 nospace:
5150         if (list != NULL)
5151                 mcache_free_ext(m_cache(MC_MBUF), list);
5152         if (top != NULL)
5153                 m_freem(top);
5154         MCFail++;
5155         return (NULL);
5156 }
5157
5158 /*
5159  * Copy data from an mbuf chain starting "off" bytes from the beginning,
5160  * continuing for "len" bytes, into the indicated buffer.
5161  */
5162 void
5163 m_copydata(struct mbuf *m, int off, int len, void *vp)
5164 {
5165         int off0 = off, len0 = len;
5166         struct mbuf *m0 = m;
5167         unsigned count;
5168         char *cp = vp;
5169
5170         if (__improbable(off < 0 || len < 0)) {
5171                 panic("%s: invalid offset %d or len %d", __func__, off, len);
5172                 /* NOTREACHED */
5173         }
5174
5175         while (off > 0) {
5176                 if (__improbable(m == NULL)) {
5177                         panic("%s: invalid mbuf chain %p [off %d, len %d]",
5178                             __func__, m0, off0, len0);
5179                         /* NOTREACHED */
5180                 }
5181                 if (off < m->m_len)
5182                         break;
5183                 off -= m->m_len;
5184                 m = m->m_next;
5185         }
5186         while (len > 0) {
5187                 if (__improbable(m == NULL)) {
5188                         panic("%s: invalid mbuf chain %p [off %d, len %d]",
5189                             __func__, m0, off0, len0);
5190                         /* NOTREACHED */
5191                 }
5192                 count = MIN(m->m_len - off, len);
5193                 bcopy(MTOD(m, caddr_t) + off, cp, count);
5194                 len -= count;
5195                 cp += count;
5196                 off = 0;
5197                 m = m->m_next;
5198         }
5199 }
5200
5201 /*
5202  * Concatenate mbuf chain n to m.  Both chains must be of the same type
5203  * (e.g. MT_DATA).  Any m_pkthdr is not updated.
5204  */
5205 void
5206 m_cat(struct mbuf *m, struct mbuf *n)
5207 {
5208         while (m->m_next)
5209                 m = m->m_next;
5210         while (n) {
5211                 if ((m->m_flags & M_EXT) ||
5212                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5213                         /* just join the two chains */
5214                         m->m_next = n;
5215                         return;
5216                 }
5217                 /* splat the data from one into the other */
5218                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5219                     (u_int)n->m_len);
5220                 m->m_len += n->m_len;
5221                 n = m_free(n);
5222         }
5223 }
5224
5225 void
5226 m_adj(struct mbuf *mp, int req_len)
5227 {
5228         int len = req_len;
5229         struct mbuf *m;
5230         int count;
5231
5232         if ((m = mp) == NULL)
5233                 return;
5234         if (len >= 0) {
5235                 /*
5236                  * Trim from head.
5237                  */
5238                 while (m != NULL && len > 0) {
5239                         if (m->m_len <= len) {
5240                                 len -= m->m_len;
5241                                 m->m_len = 0;
5242                                 m = m->m_next;
5243                         } else {
5244                                 m->m_len -= len;
5245                                 m->m_data += len;
5246                                 len = 0;
5247                         }
5248                 }
5249                 m = mp;
5250                 if (m->m_flags & M_PKTHDR)
5251                         m->m_pkthdr.len -= (req_len - len);
5252         } else {
5253                 /*
5254                  * Trim from tail.  Scan the mbuf chain,
5255                  * calculating its length and finding the last mbuf.
5256                  * If the adjustment only affects this mbuf, then just
5257                  * adjust and return.  Otherwise, rescan and truncate
5258                  * after the remaining size.
5259                  */
5260                 len = -len;
5261                 count = 0;
5262                 for (;;) {
5263                         count += m->m_len;
5264                         if (m->m_next == (struct mbuf *)0)
5265                                 break;
5266                         m = m->m_next;
5267                 }
5268                 if (m->m_len >= len) {
5269                         m->m_len -= len;
5270                         m = mp;
5271                         if (m->m_flags & M_PKTHDR)
5272                                 m->m_pkthdr.len -= len;
5273                         return;
5274                 }
5275                 count -= len;
5276                 if (count < 0)
5277                         count = 0;
5278                 /*
5279                  * Correct length for chain is "count".
5280                  * Find the mbuf with last data, adjust its length,
5281                  * and toss data from remaining mbufs on chain.
5282                  */
5283                 m = mp;
5284                 if (m->m_flags & M_PKTHDR)
5285                         m->m_pkthdr.len = count;
5286                 for (; m; m = m->m_next) {
5287                         if (m->m_len >= count) {
5288                                 m->m_len = count;
5289                                 break;
5290                         }
5291                         count -= m->m_len;
5292                 }
5293                 while ((m = m->m_next))
5294                         m->m_len = 0;
5295         }
5296 }
5297
5298 /*
5299  * Rearange an mbuf chain so that len bytes are contiguous
5300  * and in the data area of an mbuf (so that mtod and dtom
5301  * will work for a structure of size len).  Returns the resulting
5302  * mbuf chain on success, frees it and returns null on failure.
5303  * If there is room, it will add up to max_protohdr-len extra bytes to the
5304  * contiguous region in an attempt to avoid being called next time.
5305  */
5306 int MPFail;
5307
5308 struct mbuf *
5309 m_pullup(struct mbuf *n, int len)
5310 {
5311         struct mbuf *m;
5312         int count;
5313         int space;
5314
5315         /* check invalid arguments */
5316         if (n == NULL) {
5317                  panic("%s: n == NULL", __func__);
5318         }
5319         if (len < 0) {
5320                 os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
5321                     __func__, len);
5322                 goto bad;
5323         }
5324
5325         /*
5326          * If first mbuf has no cluster, and has room for len bytes
5327          * without shifting current data, pullup into it,
5328          * otherwise allocate a new mbuf to prepend to the chain.
5329          */
5330         if ((n->m_flags & M_EXT) == 0 &&
5331             n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
5332                 if (n->m_len >= len)
5333                         return (n);
5334                 m = n;
5335                 n = n->m_next;
5336                 len -= m->m_len;
5337         } else {
5338                 if (len > MHLEN)
5339                         goto bad;
5340                 _MGET(m, M_DONTWAIT, n->m_type);
5341                 if (m == 0)
5342                         goto bad;
5343                 m->m_len = 0;
5344                 if (n->m_flags & M_PKTHDR) {
5345                         M_COPY_PKTHDR(m, n);
5346                         n->m_flags &= ~M_PKTHDR;
5347                 }
5348         }
5349         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5350         do {
5351                 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5352                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5353                     (unsigned)count);
5354                 len -= count;
5355                 m->m_len += count;
5356                 n->m_len -= count;
5357                 space -= count;
5358                 if (n->m_len)
5359                         n->m_data += count;
5360                 else
5361                         n = m_free(n);
5362         } while (len > 0 && n);
5363         if (len > 0) {
5364                 (void) m_free(m);
5365                 goto bad;
5366         }
5367         m->m_next = n;
5368         return (m);
5369 bad:
5370         m_freem(n);
5371         MPFail++;
5372         return (0);
5373 }
5374
5375 /*
5376  * Like m_pullup(), except a new mbuf is always allocated, and we allow
5377  * the amount of empty space before the data in the new mbuf to be specified
5378  * (in the event that the caller expects to prepend later).
5379  */
5380 __private_extern__ int MSFail = 0;
5381
5382 __private_extern__ struct mbuf *
5383 m_copyup(struct mbuf *n, int len, int dstoff)
5384 {
5385         struct mbuf *m;
5386         int count, space;
5387
5388         if (len > (MHLEN - dstoff))
5389                 goto bad;
5390         MGET(m, M_DONTWAIT, n->m_type);
5391         if (m == NULL)
5392                 goto bad;
5393         m->m_len = 0;
5394         if (n->m_flags & M_PKTHDR) {
5395                 m_copy_pkthdr(m, n);
5396                 n->m_flags &= ~M_PKTHDR;
5397         }
5398         m->m_data += dstoff;
5399         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5400         do {
5401                 count = min(min(max(len, max_protohdr), space), n->m_len);
5402                 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5403                     (unsigned)count);
5404                 len -= count;
5405                 m->m_len += count;
5406                 n->m_len -= count;
5407                 space -= count;
5408                 if (n->m_len)
5409                         n->m_data += count;
5410                 else
5411                         n = m_free(n);
5412         } while (len > 0 && n);
5413         if (len > 0) {
5414                 (void) m_free(m);
5415                 goto bad;
5416         }
5417         m->m_next = n;
5418         return (m);
5419 bad:
5420         m_freem(n);
5421         MSFail++;
5422         return (NULL);
5423 }
5424
/*
 * Split an mbuf chain into two pieces and hand back the second one, i.e.
 * everything except the first len0 bytes.  On failure NULL is returned and
 * an attempt is made to leave the chain in its original state.
 *
 * This is m_split0() with packet header copying enabled.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
        struct mbuf *tail;

        tail = m_split0(m0, len0, wait, 1);
        return (tail);
}
5435
5436 static struct mbuf *
5437 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5438 {
5439         struct mbuf *m, *n;
5440         unsigned len = len0, remain;
5441
5442         for (m = m0; m && len > m->m_len; m = m->m_next)
5443                 len -= m->m_len;
5444         if (m == NULL)
5445                 return (NULL);
5446         remain = m->m_len - len;
5447         if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5448                 _MGETHDR(n, wait, m0->m_type);
5449                 if (n == NULL)
5450                         return (NULL);
5451                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5452                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5453                 m0->m_pkthdr.len = len0;
5454                 if (m->m_flags & M_EXT)
5455                         goto extpacket;
5456                 if (remain > MHLEN) {
5457                         /* m can't be the lead packet */
5458                         MH_ALIGN(n, 0);
5459                         n->m_next = m_split(m, len, wait);
5460                         if (n->m_next == NULL) {
5461                                 (void) m_free(n);
5462                                 return (NULL);
5463                         } else
5464                                 return (n);
5465                 } else
5466                         MH_ALIGN(n, remain);
5467         } else if (remain == 0) {
5468                 n = m->m_next;
5469                 m->m_next = NULL;
5470                 return (n);
5471         } else {
5472                 _MGET(n, wait, m->m_type);
5473                 if (n == NULL)
5474                         return (NULL);
5475                 M_ALIGN(n, remain);
5476         }
5477 extpacket:
5478         if (m->m_flags & M_EXT) {
5479                 n->m_flags |= M_EXT;
5480                 n->m_ext = m->m_ext;
5481                 m_incref(m);
5482                 n->m_data = m->m_data + len;
5483         } else {
5484                 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5485         }
5486         n->m_len = remain;
5487         m->m_len = len;
5488         n->m_next = m->m_next;
5489         m->m_next = NULL;
5490         return (n);
5491 }
5492
5493 /*
5494  * Routine to copy from device local memory into mbufs.
5495  */
5496 struct mbuf *
5497 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5498     void (*copy)(const void *, void *, size_t))
5499 {
5500         struct mbuf *m;
5501         struct mbuf *top = NULL, **mp = &top;
5502         int off = off0, len;
5503         char *cp;
5504         char *epkt;
5505
5506         cp = buf;
5507         epkt = cp + totlen;
5508         if (off) {
5509                 /*
5510                  * If 'off' is non-zero, packet is trailer-encapsulated,
5511                  * so we have to skip the type and length fields.
5512                  */
5513                 cp += off + 2 * sizeof (u_int16_t);
5514                 totlen -= 2 * sizeof (u_int16_t);
5515         }
5516         _MGETHDR(m, M_DONTWAIT, MT_DATA);
5517         if (m == NULL)
5518                 return (NULL);
5519         m->m_pkthdr.rcvif = ifp;
5520         m->m_pkthdr.len = totlen;
5521         m->m_len = MHLEN;
5522
5523         while (totlen > 0) {
5524                 if (top != NULL) {
5525                         _MGET(m, M_DONTWAIT, MT_DATA);
5526                         if (m == NULL) {
5527                                 m_freem(top);
5528                                 return (NULL);
5529                         }
5530                         m->m_len = MLEN;
5531                 }
5532                 len = MIN(totlen, epkt - cp);
5533                 if (len >= MINCLSIZE) {
5534                         MCLGET(m, M_DONTWAIT);
5535                         if (m->m_flags & M_EXT) {
5536                                 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5537                         } else {
5538                                 /* give up when it's out of cluster mbufs */
5539                                 if (top != NULL)
5540                                         m_freem(top);
5541                                 m_freem(m);
5542                                 return (NULL);
5543                         }
5544                 } else {
5545                         /*
5546                          * Place initial small packet/header at end of mbuf.
5547                          */
5548                         if (len < m->m_len) {
5549                                 if (top == NULL &&
5550                                     len + max_linkhdr <= m->m_len)
5551                                         m->m_data += max_linkhdr;
5552                                 m->m_len = len;
5553                         } else {
5554                                 len = m->m_len;
5555                         }
5556                 }
5557                 if (copy)
5558                         copy(cp, MTOD(m, caddr_t), (unsigned)len);
5559                 else
5560                         bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5561                 cp += len;
5562                 *mp = m;
5563                 mp = &m->m_next;
5564                 totlen -= len;
5565                 if (cp == epkt)
5566                         cp = buf;
5567         }
5568         return (top);
5569 }
5570
5571 #ifndef MBUF_GROWTH_NORMAL_THRESH
5572 #define MBUF_GROWTH_NORMAL_THRESH 25
5573 #endif
5574
5575 /*
5576  * Cluster freelist allocation check.
5577  */
5578 static int
5579 m_howmany(int num, size_t bufsize)
5580 {
5581         int i = 0, j = 0;
5582         u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5583         u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5584         u_int32_t sumclusters, freeclusters;
5585         u_int32_t percent_pool, percent_kmem;
5586         u_int32_t mb_growth, mb_growth_thresh;
5587
5588         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5589             bufsize == m_maxsize(MC_16KCL));
5590
5591         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5592
5593         /* Numbers in 2K cluster units */
5594         m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5595         m_clusters = m_total(MC_CL);
5596         m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5597         m_16kclusters = m_total(MC_16KCL);
5598         sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5599
5600         m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5601         m_clfree = m_infree(MC_CL);
5602         m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5603         m_16kclfree = m_infree(MC_16KCL);
5604         freeclusters = m_mbfree + m_clfree + m_bigclfree;
5605
5606         /* Bail if we've maxed out the mbuf memory map */
5607         if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5608             (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5609             (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5610                 return (0);
5611         }
5612
5613         if (bufsize == m_maxsize(MC_BIGCL)) {
5614                 /* Under minimum */
5615                 if (m_bigclusters < m_minlimit(MC_BIGCL))
5616                         return (m_minlimit(MC_BIGCL) - m_bigclusters);
5617
5618                 percent_pool =
5619                     ((sumclusters - freeclusters) * 100) / sumclusters;
5620                 percent_kmem = (sumclusters * 100) / nclusters;
5621
5622                 /*
5623                  * If a light/normal user, grow conservatively (75%)
5624                  * If a heavy user, grow aggressively (50%)
5625                  */
5626                 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5627                         mb_growth = MB_GROWTH_NORMAL;
5628                 else
5629                         mb_growth = MB_GROWTH_AGGRESSIVE;
5630
5631                 if (percent_kmem < 5) {
5632                         /* For initial allocations */
5633                         i = num;
5634                 } else {
5635                         /* Return if >= MBIGCL_LOWAT clusters available */
5636                         if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5637                             m_total(MC_BIGCL) >=
5638                             MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5639                                 return (0);
5640
5641                         /* Ensure at least num clusters are accessible */
5642                         if (num >= m_infree(MC_BIGCL))
5643                                 i = num - m_infree(MC_BIGCL);
5644                         if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5645                                 j = num - (m_total(MC_BIGCL) -
5646                                     m_minlimit(MC_BIGCL));
5647
5648                         i = MAX(i, j);
5649
5650                         /*
5651                          * Grow pool if percent_pool > 75 (normal growth)
5652                          * or percent_pool > 50 (aggressive growth).
5653                          */
5654                         mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5655                         if (percent_pool > mb_growth_thresh)
5656                                 j = ((sumclusters + num) >> mb_growth) -
5657                                     freeclusters;
5658                         i = MAX(i, j);
5659                 }
5660
5661                 /* Check to ensure we didn't go over limits */
5662                 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5663                         i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5664                 if ((i << 1) + sumclusters >= nclusters)
5665                         i = (nclusters - sumclusters) >> 1;
5666                 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5667                 VERIFY(sumclusters + (i << 1) <= nclusters);
5668
5669         } else { /* 16K CL */
5670                 VERIFY(njcl > 0);
5671                 /* Ensure at least num clusters are available */
5672                 if (num >= m_16kclfree)
5673                         i = num - m_16kclfree;
5674
5675                 /* Always grow 16KCL pool aggressively */
5676                 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5677                         j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5678                 i = MAX(i, j);
5679
5680                 /* Check to ensure we don't go over limit */
5681                 if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL))
5682                         i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5683         }
5684         return (i);
5685 }
5686 /*
5687  * Return the number of bytes in the mbuf chain, m.
5688  */
5689 unsigned int
5690 m_length(struct mbuf *m)
5691 {
5692         struct mbuf *m0;
5693         unsigned int pktlen;
5694
5695         if (m->m_flags & M_PKTHDR)
5696                 return (m->m_pkthdr.len);
5697
5698         pktlen = 0;
5699         for (m0 = m; m0 != NULL; m0 = m0->m_next)
5700                 pktlen += m0->m_len;
5701         return (pktlen);
5702 }
5703
5704 /*
5705  * Copy data from a buffer back into the indicated mbuf chain,
5706  * starting "off" bytes from the beginning, extending the mbuf
5707  * chain if necessary.
5708  */
5709 void
5710 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5711 {
5712 #if DEBUG
5713         struct mbuf *origm = m0;
5714         int error;
5715 #endif /* DEBUG */
5716
5717         if (m0 == NULL)
5718                 return;
5719
5720 #if DEBUG
5721         error =
5722 #endif /* DEBUG */
5723         m_copyback0(&m0, off, len, cp,
5724             M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5725
5726 #if DEBUG
5727         if (error != 0 || (m0 != NULL && origm != m0))
5728                 panic("m_copyback");
5729 #endif /* DEBUG */
5730 }
5731
5732 struct mbuf *
5733 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5734 {
5735         int error;
5736
5737         /* don't support chain expansion */
5738         VERIFY(off + len <= m_length(m0));
5739
5740         error = m_copyback0(&m0, off, len, cp,
5741             M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5742         if (error) {
5743                 /*
5744                  * no way to recover from partial success.
5745                  * just free the chain.
5746                  */
5747                 m_freem(m0);
5748                 return (NULL);
5749         }
5750         return (m0);
5751 }
5752
5753 /*
5754  * m_makewritable: ensure the specified range writable.
5755  */
5756 int
5757 m_makewritable(struct mbuf **mp, int off, int len, int how)
5758 {
5759         int error;
5760 #if DEBUG
5761         struct mbuf *n;
5762         int origlen, reslen;
5763
5764         origlen = m_length(*mp);
5765 #endif /* DEBUG */
5766
5767 #if 0 /* M_COPYALL is large enough */
5768         if (len == M_COPYALL)
5769                 len = m_length(*mp) - off; /* XXX */
5770 #endif
5771
5772         error = m_copyback0(mp, off, len, NULL,
5773             M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5774
5775 #if DEBUG
5776         reslen = 0;
5777         for (n = *mp; n; n = n->m_next)
5778                 reslen += n->m_len;
5779         if (origlen != reslen)
5780                 panic("m_makewritable: length changed");
5781         if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5782                 panic("m_makewritable: inconsist");
5783 #endif /* DEBUG */
5784
5785         return (error);
5786 }
5787
/*
 * Common worker behind m_copyback(), m_copyback_cow() and m_makewritable().
 *
 * Walks the chain *mp0 to byte offset "off" and then processes "len" bytes
 * from there.  What is done is selected by "flags":
 *
 *   M_COPYBACK0_COPYBACK  copy the bytes from "vp" into the chain
 *                         (vp must not be NULL).
 *   M_COPYBACK0_PRESERVE  when an mbuf is replaced, carry its old data
 *                         over into the replacement (vp must be NULL).
 *   M_COPYBACK0_EXTEND    grow the chain when it is too short; without
 *                         this flag processing stops at the end of the
 *                         chain.
 *   M_COPYBACK0_COW       the caller expects mbufs for which
 *                         m_mclhasreference() is true; under DIAGNOSTIC,
 *                         meeting one without this flag panics.
 *
 * M_COPYBACK0_EXTEND and M_COPYBACK0_COW may not be combined.  "how" is
 * handed to the mbuf allocators.  *mp0 may be changed when the first mbuf
 * has to be replaced.
 *
 * Returns 0, or ENOBUFS when an mbuf needed to replace a referenced mbuf
 * cannot be allocated.  Note that 0 is also returned when the chain could
 * not be extended far enough.
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
        int mlen;
        struct mbuf *m, *n;
        struct mbuf **mp;
        int totlen = 0;
        const char *cp = vp;

        VERIFY(mp0 != NULL);
        VERIFY(*mp0 != NULL);
        VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
        VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

        /*
         * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
         * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
         */

        VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

        /*
         * Phase 1: skip whole mbufs until the one containing offset "off".
         * "mp" tracks the link that points at "m"; "totlen" counts the
         * bytes passed over so far.
         */
        mp = mp0;
        m = *mp;
        while (off > (mlen = m->m_len)) {
                off -= mlen;
                totlen += mlen;
                if (m->m_next == NULL) {
                        int tspace;
                        /*
                         * Also entered by "goto extend" from the copy loop
                         * below, when the chain ends before "len" is used up.
                         */
extend:
                        if (!(flags & M_COPYBACK0_EXTEND))
                                goto out;

                        /*
                         * try to make some space at the end of "m".
                         */

                        mlen = m->m_len;
                        if (off + len >= MINCLSIZE &&
                            !(m->m_flags & M_EXT) && m->m_len == 0) {
                                MCLGET(m, how);
                        }
                        tspace = M_TRAILINGSPACE(m);
                        if (tspace > 0) {
                                tspace = MIN(tspace, off + len);
                                VERIFY(tspace > 0);
                                /* zero the gap in front of the write offset */
                                bzero(mtod(m, char *) + m->m_len,
                                    MIN(off, tspace));
                                m->m_len += tspace;
                                /*
                                 * Take back the accounting done for "m" so
                                 * that the loop condition re-examines it with
                                 * its new length.
                                 */
                                off += mlen;
                                totlen -= mlen;
                                continue;
                        }

                        /*
                         * need to allocate an mbuf.
                         */

                        if (off + len >= MINCLSIZE) {
                                n = m_getcl(how, m->m_type, 0);
                        } else {
                                n = _M_GET(how, m->m_type);
                        }
                        if (n == NULL) {
                                goto out;
                        }
                        /* m_len must be 0 for M_TRAILINGSPACE() below */
                        n->m_len = 0;
                        n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
                        bzero(mtod(n, char *), MIN(n->m_len, off));
                        m->m_next = n;
                }
                mp = &m->m_next;
                m = m->m_next;
        }
        /*
         * Phase 2: process "len" bytes, starting "off" bytes into "m".
         */
        while (len > 0) {
                /* bytes of "m" that lie at or after "off" */
                mlen = m->m_len - off;
                if (mlen != 0 && m_mclhasreference(m)) {
                        char *datap;
                        int eatlen;

                        /*
                         * this mbuf is read-only.
                         * allocate a new writable mbuf and try again.
                         */

#if DIAGNOSTIC
                        if (!(flags & M_COPYBACK0_COW))
                                panic("m_copyback0: read-only");
#endif /* DIAGNOSTIC */

                        /*
                         * if we're going to write into the middle of
                         * a mbuf, split it first.
                         */
                        if (off > 0 && len < mlen) {
                                n = m_split0(m, off, how, 0);
                                if (n == NULL)
                                        goto enobufs;
                                /* relink the tail and restart on it */
                                m->m_next = n;
                                mp = &m->m_next;
                                m = n;
                                off = 0;
                                continue;
                        }

                        /*
                         * XXX TODO coalesce into the trailingspace of
                         * the previous mbuf when possible.
                         */

                        /*
                         * allocate a new mbuf.  copy packet header if needed.
                         */
                        n = _M_GET(how, m->m_type);
                        if (n == NULL)
                                goto enobufs;
                        if (off == 0 && (m->m_flags & M_PKTHDR)) {
                                M_COPY_PKTHDR(n, m);
                                n->m_len = MHLEN;
                        } else {
                                /*
                                 * A cluster is attempted with M_DONTWAIT (not
                                 * "how"); when that fails the plain mbuf is
                                 * used with MLEN bytes.
                                 */
                                if (len >= MINCLSIZE)
                                        MCLGET(n, M_DONTWAIT);
                                n->m_len =
                                    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
                        }
                        if (n->m_len > len)
                                n->m_len = len;

                        /*
                         * free the region which has been overwritten.
                         * copying data from old mbufs if requested.
                         */
                        if (flags & M_COPYBACK0_PRESERVE)
                                datap = mtod(n, char *);
                        else
                                datap = NULL;
                        /* number of old bytes that "n" takes the place of */
                        eatlen = n->m_len;
                        VERIFY(off == 0 || eatlen >= mlen);
                        if (off > 0) {
                                /*
                                 * Keep the first "off" bytes in "m" and hang
                                 * "n" behind it; "n" replaces the rest of "m".
                                 */
                                VERIFY(len >= mlen);
                                m->m_len = off;
                                m->m_next = n;
                                if (datap) {
                                        m_copydata(m, off, mlen, datap);
                                        datap += mlen;
                                }
                                eatlen -= mlen;
                                mp = &m->m_next;
                                m = m->m_next;
                        }
                        /*
                         * Consume following referenced mbufs of the same
                         * type, freeing each one that becomes empty.
                         */
                        while (m != NULL && m_mclhasreference(m) &&
                            n->m_type == m->m_type && eatlen > 0) {
                                mlen = MIN(eatlen, m->m_len);
                                if (datap) {
                                        m_copydata(m, 0, mlen, datap);
                                        datap += mlen;
                                }
                                m->m_data += mlen;
                                m->m_len -= mlen;
                                eatlen -= mlen;
                                if (m->m_len == 0)
                                        *mp = m = m_free(m);
                        }
                        /* shrink "n" by whatever could not be consumed */
                        if (eatlen > 0)
                                n->m_len -= eatlen;
                        /* splice "n" in and process it on the next pass */
                        n->m_next = m;
                        *mp = m = n;
                        continue;
                }
                mlen = MIN(mlen, len);
                if (flags & M_COPYBACK0_COPYBACK) {
                        bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
                        cp += mlen;
                }
                len -= mlen;
                /* account for the skipped prefix of this mbuf as well */
                mlen += off;
                off = 0;
                totlen += mlen;
                if (len == 0)
                        break;
                if (m->m_next == NULL) {
                        goto extend;
                }
                mp = &m->m_next;
                m = m->m_next;
        }
out:
        /* a packet header length is only ever raised here, never lowered */
        if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
                VERIFY(flags & M_COPYBACK0_EXTEND);
                m->m_pkthdr.len = totlen;
        }

        return (0);

enobufs:
        return (ENOBUFS);
}
5985
5986 uint64_t
5987 mcl_to_paddr(char *addr)
5988 {
5989         vm_offset_t base_phys;
5990
5991         if (!MBUF_IN_MAP(addr))
5992                 return (0);
5993         base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5994
5995         if (base_phys == 0)
5996                 return (0);
5997         return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5998 }
5999
6000 /*
6001  * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
6002  * And really copy the thing.  That way, we don't "precompute" checksums
6003  * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
6004  * small packets, don't dup into a cluster.  That way received  packets
6005  * don't take up too much room in the sockbuf (cf. sbspace()).
6006  */
6007 int MDFail;
6008
6009 struct mbuf *
6010 m_dup(struct mbuf *m, int how)
6011 {
6012         struct mbuf *n, **np;
6013         struct mbuf *top;
6014         int copyhdr = 0;
6015
6016         np = &top;
6017         top = NULL;
6018         if (m->m_flags & M_PKTHDR)
6019                 copyhdr = 1;
6020
6021         /*
6022          * Quick check: if we have one mbuf and its data fits in an
6023          *  mbuf with packet header, just copy and go.
6024          */
6025         if (m->m_next == NULL) {
6026                 /* Then just move the data into an mbuf and be done... */
6027                 if (copyhdr) {
6028                         if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
6029                                 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
6030                                         return (NULL);
6031                                 n->m_len = m->m_len;
6032                                 m_dup_pkthdr(n, m, how);
6033                                 bcopy(m->m_data, n->m_data, m->m_len);
6034                                 return (n);
6035                         }
6036                 } else if (m->m_len <= MLEN) {
6037                         if ((n = _M_GET(how, m->m_type)) == NULL)
6038                                 return (NULL);
6039                         bcopy(m->m_data, n->m_data, m->m_len);
6040                         n->m_len = m->m_len;
6041                         return (n);
6042                 }
6043         }
6044         while (m != NULL) {
6045 #if BLUE_DEBUG
6046                 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
6047                     m->m_data);
6048 #endif
6049                 if (copyhdr)
6050                         n = _M_GETHDR(how, m->m_type);
6051                 else
6052                         n = _M_GET(how, m->m_type);
6053                 if (n == NULL)
6054                         goto nospace;
6055                 if (m->m_flags & M_EXT) {
6056                         if (m->m_len <= m_maxsize(MC_CL))
6057                                 MCLGET(n, how);
6058                         else if (m->m_len <= m_maxsize(MC_BIGCL))
6059                                 n = m_mbigget(n, how);
6060                         else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
6061                                 n = m_m16kget(n, how);
6062                         if (!(n->m_flags & M_EXT)) {
6063                                 (void) m_free(n);
6064                                 goto nospace;
6065                         }
6066                 }
6067                 *np = n;
6068                 if (copyhdr) {
6069                         /* Don't use M_COPY_PKTHDR: preserve m_data */
6070                         m_dup_pkthdr(n, m, how);
6071                         copyhdr = 0;
6072                         if (!(n->m_flags & M_EXT))
6073                                 n->m_data = n->m_pktdat;
6074                 }
6075                 n->m_len = m->m_len;
6076                 /*
6077                  * Get the dup on the same bdry as the original
6078                  * Assume that the two mbufs have the same offset to data area
6079                  * (up to word boundaries)
6080                  */
6081                 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
6082                 m = m->m_next;
6083                 np = &n->m_next;
6084 #if BLUE_DEBUG
6085                 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
6086                     n->m_data);
6087 #endif
6088         }
6089
6090         if (top == NULL)
6091                 MDFail++;
6092         return (top);
6093
6094 nospace:
6095         m_freem(top);
6096         MDFail++;
6097         return (NULL);
6098 }
6099
/*
 * Evaluates to non-zero when `m' has M_EXT set and the byte range
 * [m_data, m_data + m_len) extends beyond the end of the page in which
 * m_data lies: more than PAGE_SIZE bytes when m_data is page aligned,
 * otherwise past the next page boundary.  Note that `m' is evaluated
 * more than once.
 */
#define MBUF_MULTIPAGES(m)                                              \
        (((m)->m_flags & M_EXT) &&                                      \
        (IS_P2ALIGNED((m)->m_data, PAGE_SIZE) ?                         \
        ((m)->m_len > PAGE_SIZE) :                                      \
        (P2ROUNDUP((m)->m_data, PAGE_SIZE) <                            \
        ((uintptr_t)(m)->m_data + (m)->m_len))))
6106
6107 static struct mbuf *
6108 m_expand(struct mbuf *m, struct mbuf **last)
6109 {
6110         struct mbuf *top = NULL;
6111         struct mbuf **nm = &top;
6112         uintptr_t data0, data;
6113         unsigned int len0, len;
6114
6115         VERIFY(MBUF_MULTIPAGES(m));
6116         VERIFY(m->m_next == NULL);
6117         data0 = (uintptr_t)m->m_data;
6118         len0 = m->m_len;
6119         *last = top;
6120
6121         for (;;) {
6122                 struct mbuf *n;
6123
6124                 data = data0;
6125                 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE)
6126                         len = PAGE_SIZE;
6127                 else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
6128                     P2ROUNDUP(data, PAGE_SIZE) < (data + len0))
6129                         len = P2ROUNDUP(data, PAGE_SIZE) - data;
6130                 else
6131                         len = len0;
6132
6133                 VERIFY(len > 0);
6134                 VERIFY(m->m_flags & M_EXT);
6135                 m->m_data = (void *)data;
6136                 m->m_len = len;
6137
6138                 *nm = *last = m;
6139                 nm = &m->m_next;
6140                 m->m_next = NULL;
6141
6142                 data0 += len;
6143                 len0 -= len;
6144                 if (len0 == 0)
6145                         break;
6146
6147                 n = _M_RETRY(M_DONTWAIT, MT_DATA);
6148                 if (n == NULL) {
6149                         m_freem(top);
6150                         top = *last = NULL;
6151                         break;
6152                 }
6153
6154                 n->m_ext = m->m_ext;
6155                 m_incref(m);
6156                 n->m_flags |= M_EXT;
6157                 m = n;
6158         }
6159         return (top);
6160 }
6161
6162 struct mbuf *
6163 m_normalize(struct mbuf *m)
6164 {
6165         struct mbuf *top = NULL;
6166         struct mbuf **nm = &top;
6167         boolean_t expanded = FALSE;
6168
6169         while (m != NULL) {
6170                 struct mbuf *n;
6171
6172                 n = m->m_next;
6173                 m->m_next = NULL;
6174
6175                 /* Does the data cross one or more page boundaries? */
6176                 if (MBUF_MULTIPAGES(m)) {
6177                         struct mbuf *last;
6178                         if ((m = m_expand(m, &last)) == NULL) {
6179                                 m_freem(n);
6180                                 m_freem(top);
6181                                 top = NULL;
6182                                 break;
6183                         }
6184                         *nm = m;
6185                         nm = &last->m_next;
6186                         expanded = TRUE;
6187                 } else {
6188                         *nm = m;
6189                         nm = &m->m_next;
6190                 }
6191                 m = n;
6192         }
6193         if (expanded)
6194                 atomic_add_32(&mb_normalized, 1);
6195         return (top);
6196 }
6197
6198 /*
6199  * Append the specified data to the indicated mbuf chain,
6200  * Extend the mbuf chain if the new data does not fit in
6201  * existing space.
6202  *
6203  * Return 1 if able to complete the job; otherwise 0.
6204  */
6205 int
6206 m_append(struct mbuf *m0, int len, caddr_t cp)
6207 {
6208         struct mbuf *m, *n;
6209         int remainder, space;
6210
6211         for (m = m0; m->m_next != NULL; m = m->m_next)
6212                 ;
6213         remainder = len;
6214         space = M_TRAILINGSPACE(m);
6215         if (space > 0) {
6216                 /*
6217                  * Copy into available space.
6218                  */
6219                 if (space > remainder)
6220                         space = remainder;
6221                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6222                 m->m_len += space;
6223                 cp += space;
6224                 remainder -= space;
6225         }
6226         while (remainder > 0) {
6227                 /*
6228                  * Allocate a new mbuf; could check space
6229                  * and allocate a cluster instead.
6230                  */
6231                 n = m_get(M_WAITOK, m->m_type);
6232                 if (n == NULL)
6233                         break;
6234                 n->m_len = min(MLEN, remainder);
6235                 bcopy(cp, mtod(n, caddr_t), n->m_len);
6236                 cp += n->m_len;
6237                 remainder -= n->m_len;
6238                 m->m_next = n;
6239                 m = n;
6240         }
6241         if (m0->m_flags & M_PKTHDR)
6242                 m0->m_pkthdr.len += len - remainder;
6243         return (remainder == 0);
6244 }
6245
6246 struct mbuf *
6247 m_last(struct mbuf *m)
6248 {
6249         while (m->m_next != NULL)
6250                 m = m->m_next;
6251         return (m);
6252 }
6253
6254 unsigned int
6255 m_fixhdr(struct mbuf *m0)
6256 {
6257         u_int len;
6258
6259         VERIFY(m0->m_flags & M_PKTHDR);
6260
6261         len = m_length2(m0, NULL);
6262         m0->m_pkthdr.len = len;
6263         return (len);
6264 }
6265
6266 unsigned int
6267 m_length2(struct mbuf *m0, struct mbuf **last)
6268 {
6269         struct mbuf *m;
6270         u_int len;
6271
6272         len = 0;
6273         for (m = m0; m != NULL; m = m->m_next) {
6274                 len += m->m_len;
6275                 if (m->m_next == NULL)
6276                         break;
6277         }
6278         if (last != NULL)
6279                 *last = m;
6280         return (len);
6281 }
6282
6283 /*
6284  * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6285  * and clusters.  If allocation fails and this cannot be completed, NULL will
6286  * be returned, but the passed in chain will be unchanged.  Upon success,
6287  * the original chain will be freed, and the new chain will be returned.
6288  *
6289  * If a non-packet header is passed in, the original mbuf (chain?) will
6290  * be returned unharmed.
6291  *
6292  * If offset is specfied, the first mbuf in the chain will have a leading
6293  * space of the amount stated by the "off" parameter.
6294  *
6295  * This routine requires that the m_pkthdr.header field of the original
6296  * mbuf chain is cleared by the caller.
6297  */
6298 struct mbuf *
6299 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
6300 {
6301         struct mbuf *m_new = NULL, *m_final = NULL;
6302         int progress = 0, length, pktlen;
6303
6304         if (!(m0->m_flags & M_PKTHDR))
6305                 return (m0);
6306
6307         VERIFY(off < MHLEN);
6308         m_fixhdr(m0); /* Needed sanity check */
6309
6310         pktlen = m0->m_pkthdr.len + off;
6311         if (pktlen > MHLEN)
6312                 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
6313         else
6314                 m_final = m_gethdr(how, MT_DATA);
6315
6316         if (m_final == NULL)
6317                 goto nospace;
6318
6319         if (off > 0) {
6320                 pktlen -= off;
6321                 m_final->m_data += off;
6322         }
6323
6324         /*
6325          * Caller must have handled the contents pointed to by this
6326          * pointer before coming here, as otherwise it will point to
6327          * the original mbuf which will get freed upon success.
6328          */
6329         VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6330
6331         if (m_dup_pkthdr(m_final, m0, how) == 0)
6332                 goto nospace;
6333
6334         m_new = m_final;
6335
6336         while (progress < pktlen) {
6337                 length = pktlen - progress;
6338                 if (length > MCLBYTES)
6339                         length = MCLBYTES;
6340                 length -= ((m_new == m_final) ? off : 0);
6341                 if (length < 0)
6342                         goto nospace;
6343
6344                 if (m_new == NULL) {
6345                         if (length > MLEN)
6346                                 m_new = m_getcl(how, MT_DATA, 0);
6347                         else
6348                                 m_new = m_get(how, MT_DATA);
6349                         if (m_new == NULL)
6350                                 goto nospace;
6351                 }
6352
6353                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6354                 progress += length;
6355                 m_new->m_len = length;
6356                 if (m_new != m_final)
6357                         m_cat(m_final, m_new);
6358                 m_new = NULL;
6359         }
6360         m_freem(m0);
6361         m0 = m_final;
6362         return (m0);
6363 nospace:
6364         if (m_final)
6365                 m_freem(m_final);
6366         return (NULL);
6367 }
6368
/*
 * Defragment `m0' with no leading space reserved; see
 * m_defrag_offset() for the return value and ownership rules.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
        struct mbuf *result;

        result = m_defrag_offset(m0, 0, how);
        return (result);
}
6374
6375 void
6376 m_mchtype(struct mbuf *m, int t)
6377 {
6378         mtype_stat_inc(t);
6379         mtype_stat_dec(m->m_type);
6380         (m)->m_type = t;
6381 }
6382
6383 void *
6384 m_mtod(struct mbuf *m)
6385 {
6386         return (MTOD(m, void *));
6387 }
6388
6389 struct mbuf *
6390 m_dtom(void *x)
6391 {
6392         return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6393 }
6394
/*
 * Function form of the _MCHECK() macro: runs that check on `m'.
 */
void
m_mcheck(struct mbuf *m)
{
        _MCHECK(m);
        return;
}
6400
6401 /*
6402  * Return a pointer to mbuf/offset of location in mbuf chain.
6403  */
6404 struct mbuf *
6405 m_getptr(struct mbuf *m, int loc, int *off)
6406 {
6407
6408         while (loc >= 0) {
6409                 /* Normal end of search. */
6410                 if (m->m_len > loc) {
6411                         *off = loc;
6412                         return (m);
6413                 } else {
6414                         loc -= m->m_len;
6415                         if (m->m_next == NULL) {
6416                                 if (loc == 0) {
6417                                         /* Point at the end of valid data. */
6418                                         *off = m->m_len;
6419                                         return (m);
6420                                 }
6421                                 return (NULL);
6422                         }
6423                         m = m->m_next;
6424                 }
6425         }
6426         return (NULL);
6427 }
6428
6429 /*
6430  * Inform the corresponding mcache(s) that there's a waiter below.
6431  */
6432 static void
6433 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6434 {
6435         mcache_waiter_inc(m_cache(class));
6436         if (comp) {
6437                 if (class == MC_CL) {
6438                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6439                 } else if (class == MC_BIGCL) {
6440                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6441                 } else if (class == MC_16KCL) {
6442                         mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6443                 } else {
6444                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6445                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6446                 }
6447         }
6448 }
6449
6450 /*
6451  * Inform the corresponding mcache(s) that there's no more waiter below.
6452  */
6453 static void
6454 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6455 {
6456         mcache_waiter_dec(m_cache(class));
6457         if (comp) {
6458                 if (class == MC_CL) {
6459                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6460                 } else if (class == MC_BIGCL) {
6461                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6462                 } else if (class == MC_16KCL) {
6463                         mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6464                 } else {
6465                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6466                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6467                 }
6468         }
6469 }
6470
6471 /*
6472  * Called during slab (blocking and non-blocking) allocation.  If there
6473  * is at least one waiter, and the time since the first waiter is blocked
6474  * is greater than the watchdog timeout, panic the system.
6475  */
6476 static void
6477 mbuf_watchdog(void)
6478 {
6479         struct timeval now;
6480         unsigned int since;
6481
6482         if (mb_waiters == 0 || !mb_watchdog)
6483                 return;
6484
6485         microuptime(&now);
6486         since = now.tv_sec - mb_wdtstart.tv_sec;
6487         if (since >= MB_WDT_MAXTIME) {
6488                 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6489                     mb_waiters, since, mbuf_dump());
6490                 /* NOTREACHED */
6491         }
6492 }
6493
6494 /*
6495  * Called during blocking allocation.  Returns TRUE if one or more objects
6496  * are available at the per-CPU caches layer and that allocation should be
6497  * retried at that level.
6498  */
6499 static boolean_t
6500 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6501 {
6502         boolean_t mcache_retry = FALSE;
6503
6504         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6505
6506         /* Check if there's anything at the cache layer */
6507         if (mbuf_cached_above(class, wait)) {
6508                 mcache_retry = TRUE;
6509                 goto done;
6510         }
6511
6512         /* Nothing?  Then try hard to get it from somewhere */
6513         m_reclaim(class, num, (wait & MCR_COMP));
6514
6515         /* We tried hard and got something? */
6516         if (m_infree(class) > 0) {
6517                 mbstat.m_wait++;
6518                 goto done;
6519         } else if (mbuf_cached_above(class, wait)) {
6520                 mbstat.m_wait++;
6521                 mcache_retry = TRUE;
6522                 goto done;
6523         } else if (wait & MCR_TRYHARD) {
6524                 mcache_retry = TRUE;
6525                 goto done;
6526         }
6527
6528         /*
6529          * There's really nothing for us right now; inform the
6530          * cache(s) that there is a waiter below and go to sleep.
6531          */
6532         mbuf_waiter_inc(class, (wait & MCR_COMP));
6533
6534         VERIFY(!(wait & MCR_NOSLEEP));
6535
6536         /*
6537          * If this is the first waiter, arm the watchdog timer.  Otherwise
6538          * check if we need to panic the system due to watchdog timeout.
6539          */
6540         if (mb_waiters == 0)
6541                 microuptime(&mb_wdtstart);
6542         else
6543                 mbuf_watchdog();
6544
6545         mb_waiters++;
6546         m_region_expand(class) += m_total(class) + num;
6547         /* wake up the worker thread */
6548         if (mbuf_worker_ready &&
6549             mbuf_worker_needs_wakeup) {
6550                 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
6551                 mbuf_worker_needs_wakeup = FALSE;
6552         }
6553
6554         (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6555
6556         /* We are now up; stop getting notified until next round */
6557         mbuf_waiter_dec(class, (wait & MCR_COMP));
6558
6559         /* We waited and got something */
6560         if (m_infree(class) > 0) {
6561                 mbstat.m_wait++;
6562                 goto done;
6563         } else if (mbuf_cached_above(class, wait)) {
6564                 mbstat.m_wait++;
6565                 mcache_retry = TRUE;
6566         }
6567 done:
6568         return (mcache_retry);
6569 }
6570
6571 __attribute__((noreturn))
6572 static void
6573 mbuf_worker_thread(void)
6574 {
6575         int mbuf_expand;
6576
6577         while (1) {
6578                 lck_mtx_lock(mbuf_mlock);
6579                 mbuf_worker_run_cnt++;
6580                 mbuf_expand = 0;
6581                 if (m_region_expand(MC_CL) > 0) {
6582                         int n;
6583                         mb_expand_cl_cnt++;
6584                         /* Adjust to current number of cluster in use */
6585                         n = m_region_expand(MC_CL) -
6586                             (m_total(MC_CL) - m_infree(MC_CL));
6587                         if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6588                                 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6589                         if (n > 0) {
6590                                 mb_expand_cl_total += n;
6591                         }
6592                         m_region_expand(MC_CL) = 0;
6593
6594                         if (n > 0)
6595                                 freelist_populate(MC_CL, n, M_WAIT);
6596                 }
6597                 if (m_region_expand(MC_BIGCL) > 0) {
6598                         int n;
6599                         mb_expand_bigcl_cnt++;
6600                         /* Adjust to current number of 4 KB cluster in use */
6601                         n = m_region_expand(MC_BIGCL) -
6602                             (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6603                         if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6604                                 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6605                         if (n > 0) {
6606                                 mb_expand_bigcl_total += n;
6607                         }
6608                         m_region_expand(MC_BIGCL) = 0;
6609
6610                         if (n > 0)
6611                                 freelist_populate(MC_BIGCL, n, M_WAIT);
6612                 }
6613                 if (m_region_expand(MC_16KCL) > 0) {
6614                         int n;
6615                         mb_expand_16kcl_cnt++;
6616                         /* Adjust to current number of 16 KB cluster in use */
6617                         n = m_region_expand(MC_16KCL) -
6618                             (m_total(MC_16KCL) - m_infree(MC_16KCL));
6619                         if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6620                                 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6621                         if (n > 0) {
6622                                 mb_expand_16kcl_total += n;
6623                         }
6624                         m_region_expand(MC_16KCL) = 0;
6625
6626                         if (n > 0)
6627                                 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6628                 }
6629
6630                 /*
6631                  * Because we can run out of memory before filling the mbuf
6632                  * map, we should not allocate more clusters than they are
6633                  * mbufs -- otherwise we could have a large number of useless
6634                  * clusters allocated.
6635                  */
6636                 while (m_total(MC_MBUF) <
6637                     (m_total(MC_BIGCL) + m_total(MC_CL) + m_total(MC_16KCL))) {
6638                         mb_expand_cnt++;
6639                         if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6640                                 break;
6641                 }
6642
6643                 mbuf_worker_needs_wakeup = TRUE;
6644                 /*
6645                  * If there's a deadlock and we're not sending / receiving
6646                  * packets, net_uptime() won't be updated.  Update it here
6647                  * so we are sure it's correct.
6648                  */
6649                 net_update_uptime();
6650                 mbuf_worker_last_runtime = net_uptime();
6651                 assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
6652                     THREAD_UNINT);
6653                 lck_mtx_unlock(mbuf_mlock);
6654                 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6655         }
6656 }
6657
6658 __attribute__((noreturn))
6659 static void
6660 mbuf_worker_thread_init(void)
6661 {
6662         mbuf_worker_ready++;
6663         mbuf_worker_thread();
6664 }
6665
6666 static mcl_slab_t *
6667 slab_get(void *buf)
6668 {
6669         mcl_slabg_t *slg;
6670         unsigned int ix, k;
6671
6672         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6673
6674         VERIFY(MBUF_IN_MAP(buf));
6675         ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
6676         VERIFY(ix < maxslabgrp);
6677
6678         if ((slg = slabstbl[ix]) == NULL) {
6679                 /*
6680                  * In the current implementation, we never shrink the slabs
6681                  * table; if we attempt to reallocate a cluster group when
6682                  * it's already allocated, panic since this is a sign of a
6683                  * memory corruption (slabstbl[ix] got nullified).
6684                  */
6685                 ++slabgrp;
6686                 VERIFY(ix < slabgrp);
6687                 /*
6688                  * Slabs expansion can only be done single threaded; when
6689                  * we get here, it must be as a result of m_clalloc() which
6690                  * is serialized and therefore mb_clalloc_busy must be set.
6691                  */
6692                 VERIFY(mb_clalloc_busy);
6693                 lck_mtx_unlock(mbuf_mlock);
6694
6695                 /* This is a new buffer; create the slabs group for it */
6696                 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6697                     M_WAITOK | M_ZERO);
6698                 MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB,
6699                     M_TEMP, M_WAITOK | M_ZERO);
6700                 VERIFY(slg != NULL && slg->slg_slab != NULL);
6701
6702                 lck_mtx_lock(mbuf_mlock);
6703                 /*
6704                  * No other thread could have gone into m_clalloc() after
6705                  * we dropped the lock above, so verify that it's true.
6706                  */
6707                 VERIFY(mb_clalloc_busy);
6708
6709                 slabstbl[ix] = slg;
6710
6711                 /* Chain each slab in the group to its forward neighbor */
6712                 for (k = 1; k < NSLABSPMB; k++)
6713                         slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6714                 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6715
6716                 /* And chain the last slab in the previous group to this */
6717                 if (ix > 0) {
6718                         VERIFY(slabstbl[ix - 1]->
6719                             slg_slab[NSLABSPMB - 1].sl_next == NULL);
6720                         slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6721                             &slg->slg_slab[0];
6722                 }
6723         }
6724
6725         ix = MTOPG(buf) % NSLABSPMB;
6726         VERIFY(ix < NSLABSPMB);
6727
6728         return (&slg->slg_slab[ix]);
6729 }
6730
6731 static void
6732 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6733     void *base, void *head, unsigned int len, int refcnt, int chunks)
6734 {
6735         sp->sl_class = class;
6736         sp->sl_flags = flags;
6737         sp->sl_base = base;
6738         sp->sl_head = head;
6739         sp->sl_len = len;
6740         sp->sl_refcnt = refcnt;
6741         sp->sl_chunks = chunks;
6742         slab_detach(sp);
6743 }
6744
6745 static void
6746 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6747 {
6748         VERIFY(slab_is_detached(sp));
6749         m_slab_cnt(class)++;
6750         TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6751         sp->sl_flags &= ~SLF_DETACHED;
6752
6753         /*
6754          * If a buffer spans multiple contiguous pages then mark them as
6755          * detached too
6756          */
6757         if (class == MC_16KCL) {
6758                 int k;
6759                 for (k = 1; k < NSLABSP16KB; k++) {
6760                         sp = sp->sl_next;
6761                         /* Next slab must already be present */
6762                         VERIFY(sp != NULL && slab_is_detached(sp));
6763                         sp->sl_flags &= ~SLF_DETACHED;
6764                 }
6765         }
6766 }
6767
6768 static void
6769 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6770 {
6771         int k;
6772         VERIFY(!slab_is_detached(sp));
6773         VERIFY(m_slab_cnt(class) > 0);
6774         m_slab_cnt(class)--;
6775         TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6776         slab_detach(sp);
6777         if (class == MC_16KCL) {
6778                 for (k = 1; k < NSLABSP16KB; k++) {
6779                         sp = sp->sl_next;
6780                         /* Next slab must already be present */
6781                         VERIFY(sp != NULL);
6782                         VERIFY(!slab_is_detached(sp));
6783                         slab_detach(sp);
6784                 }
6785         }
6786 }
6787
6788 static boolean_t
6789 slab_inrange(mcl_slab_t *sp, void *buf)
6790 {
6791         return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6792             (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6793 }
6794
6795 #undef panic
6796
6797 static void
6798 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6799 {
6800         int i;
6801         unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6802         uintptr_t buf = (uintptr_t)sp->sl_base;
6803
6804         for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6805                 void *next = ((mcache_obj_t *)buf)->obj_next;
6806                 if (next != addr)
6807                         continue;
6808                 if (!mclverify) {
6809                         if (next != NULL && !MBUF_IN_MAP(next)) {
6810                                 mcache_t *cp = m_cache(sp->sl_class);
6811                                 panic("%s: %s buffer %p in slab %p modified "
6812                                     "after free at offset 0: %p out of range "
6813                                     "[%p-%p)\n", __func__, cp->mc_name,
6814                                     (void *)buf, sp, next, mbutl, embutl);
6815                                 /* NOTREACHED */
6816                         }
6817                 } else {
6818                         mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6819                             (mcache_obj_t *)buf);
6820                         mcl_audit_verify_nextptr(next, mca);
6821                 }
6822         }
6823 }
6824
6825 static void
6826 slab_detach(mcl_slab_t *sp)
6827 {
6828         sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6829         sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6830         sp->sl_flags |= SLF_DETACHED;
6831 }
6832
6833 static boolean_t
6834 slab_is_detached(mcl_slab_t *sp)
6835 {
6836         return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6837             (intptr_t)sp->sl_link.tqe_prev == -1 &&
6838             (sp->sl_flags & SLF_DETACHED));
6839 }
6840
6841 static void
6842 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6843     mcache_obj_t **con_list, size_t con_size, unsigned int num)
6844 {
6845         mcache_audit_t *mca, *mca_tail;
6846         mcache_obj_t *con = NULL;
6847         boolean_t save_contents = (con_list != NULL);
6848         unsigned int i, ix;
6849
6850         ASSERT(num <= NMBPG);
6851         ASSERT(con_list == NULL || con_size != 0);
6852
6853         ix = MTOPG(buf);
6854         VERIFY(ix < maxclaudit);
6855
6856         /* Make sure we haven't been here before */
6857         for (i = 0; i < num; i++)
6858                 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6859
6860         mca = mca_tail = *mca_list;
6861         if (save_contents)
6862                 con = *con_list;
6863
6864         for (i = 0; i < num; i++) {
6865                 mcache_audit_t *next;
6866
6867                 next = mca->mca_next;
6868                 bzero(mca, sizeof (*mca));
6869                 mca->mca_next = next;
6870                 mclaudit[ix].cl_audit[i] = mca;
6871
6872                 /* Attach the contents buffer if requested */
6873                 if (save_contents) {
6874                         mcl_saved_contents_t *msc =
6875                             (mcl_saved_contents_t *)(void *)con;
6876
6877                         VERIFY(msc != NULL);
6878                         VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6879                         VERIFY(con_size == sizeof (*msc));
6880                         mca->mca_contents_size = con_size;
6881                         mca->mca_contents = msc;
6882                         con = con->obj_next;
6883                         bzero(mca->mca_contents, mca->mca_contents_size);
6884                 }
6885
6886                 mca_tail = mca;
6887                 mca = mca->mca_next;
6888         }
6889
6890         if (save_contents)
6891                 *con_list = con;
6892
6893         *mca_list = mca_tail->mca_next;
6894         mca_tail->mca_next = NULL;
6895 }
6896
6897 static void
6898 mcl_audit_free(void *buf, unsigned int num)
6899 {
6900         unsigned int i, ix;
6901         mcache_audit_t *mca, *mca_list;
6902
6903         ix = MTOPG(buf);
6904         VERIFY(ix < maxclaudit);
6905
6906         if (mclaudit[ix].cl_audit[0] != NULL) {
6907                 mca_list = mclaudit[ix].cl_audit[0];
6908                 for (i = 0; i < num; i++) {
6909                         mca = mclaudit[ix].cl_audit[i];
6910                         mclaudit[ix].cl_audit[i] = NULL;
6911                         if (mca->mca_contents)
6912                                 mcache_free(mcl_audit_con_cache,
6913                                     mca->mca_contents);
6914                 }
6915                 mcache_free_ext(mcache_audit_cache,
6916                     (mcache_obj_t *)mca_list);
6917         }
6918 }
6919
6920 /*
6921  * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6922  * the corresponding audit structure for that buffer.
6923  */
6924 static mcache_audit_t *
6925 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
6926 {
6927         mcache_audit_t *mca = NULL;
6928         int ix = MTOPG(mobj), m_idx = 0;
6929         unsigned char *page_addr;
6930
6931         VERIFY(ix < maxclaudit);
6932         VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
6933
6934         page_addr = PGTOM(ix);
6935
6936         switch (class) {
6937         case MC_MBUF:
6938                 /*
6939                  * For the mbuf case, find the index of the page
6940                  * used by the mbuf and use that index to locate the
6941                  * base address of the page.  Then find out the
6942                  * mbuf index relative to the page base and use
6943                  * it to locate the audit structure.
6944                  */
6945                 m_idx = MBPAGEIDX(page_addr, mobj);
6946                 VERIFY(m_idx < (int)NMBPG);
6947                 mca = mclaudit[ix].cl_audit[m_idx];
6948                 break;
6949
6950         case MC_CL:
6951                 /*
6952                  * Same thing as above, but for 2KB clusters in a page.
6953                  */
6954                 m_idx = CLPAGEIDX(page_addr, mobj);
6955                 VERIFY(m_idx < (int)NCLPG);
6956                 mca = mclaudit[ix].cl_audit[m_idx];
6957                 break;
6958
6959         case MC_BIGCL:
6960                 m_idx = BCLPAGEIDX(page_addr, mobj);
6961                 VERIFY(m_idx < (int)NBCLPG);
6962                 mca = mclaudit[ix].cl_audit[m_idx];
6963                 break;
6964         case MC_16KCL:
6965                 /*
6966                  * Same as above, but only return the first element.
6967                  */
6968                 mca = mclaudit[ix].cl_audit[0];
6969                 break;
6970
6971         default:
6972                 VERIFY(0);
6973                 /* NOTREACHED */
6974         }
6975
6976         return (mca);
6977 }
6978
6979 static void
6980 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6981     boolean_t alloc)
6982 {
6983         struct mbuf *m = addr;
6984         mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6985
6986         VERIFY(mca->mca_contents != NULL &&
6987             mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6988
6989         if (mclverify)
6990                 mcl_audit_verify_nextptr(next, mca);
6991
6992         if (!alloc) {
6993                 /* Save constructed mbuf fields */
6994                 mcl_audit_save_mbuf(m, mca);
6995                 if (mclverify) {
6996                         mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6997                             m_maxsize(MC_MBUF));
6998                 }
6999                 ((mcache_obj_t *)m)->obj_next = next;
7000                 return;
7001         }
7002
7003         /* Check if the buffer has been corrupted while in freelist */
7004         if (mclverify) {
7005                 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
7006         }
7007         /* Restore constructed mbuf fields */
7008         mcl_audit_restore_mbuf(m, mca, composite);
7009 }
7010
7011 static void
7012 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
7013 {
7014         struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
7015
7016         if (composite) {
7017                 struct mbuf *next = m->m_next;
7018                 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
7019                     MBUF_IS_COMPOSITE(ms));
7020                 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7021                 /*
7022                  * We could have hand-picked the mbuf fields and restore
7023                  * them individually, but that will be a maintenance
7024                  * headache.  Instead, restore everything that was saved;
7025                  * the mbuf layer will recheck and reinitialize anyway.
7026                  */
7027                 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
7028                 m->m_next = next;
7029         } else {
7030                 /*
7031                  * For a regular mbuf (no cluster attached) there's nothing
7032                  * to restore other than the type field, which is expected
7033                  * to be MT_FREE.
7034                  */
7035                 m->m_type = ms->m_type;
7036         }
7037         _MCHECK(m);
7038 }
7039
7040 static void
7041 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
7042 {
7043         VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7044         _MCHECK(m);
7045         bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
7046 }
7047
7048 static void
7049 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
7050     boolean_t save_next)
7051 {
7052         mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
7053
7054         if (!alloc) {
7055                 if (mclverify) {
7056                         mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
7057                 }
7058                 if (save_next) {
7059                         mcl_audit_verify_nextptr(next, mca);
7060                         ((mcache_obj_t *)addr)->obj_next = next;
7061                 }
7062         } else if (mclverify) {
7063                 /* Check if the buffer has been corrupted while in freelist */
7064                 mcl_audit_verify_nextptr(next, mca);
7065                 mcache_audit_free_verify_set(mca, addr, 0, size);
7066         }
7067 }
7068
7069 static void
7070 mcl_audit_scratch(mcache_audit_t *mca)
7071 {
7072         void *stack[MCACHE_STACK_DEPTH + 1];
7073         mcl_scratch_audit_t *msa;
7074         struct timeval now;
7075
7076         VERIFY(mca->mca_contents != NULL);
7077         msa = MCA_SAVED_SCRATCH_PTR(mca);
7078
7079         msa->msa_pthread = msa->msa_thread;
7080         msa->msa_thread = current_thread();
7081         bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
7082         msa->msa_pdepth = msa->msa_depth;
7083         bzero(stack, sizeof (stack));
7084         msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
7085         bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
7086
7087         msa->msa_ptstamp = msa->msa_tstamp;
7088         microuptime(&now);
7089         /* tstamp is in ms relative to base_ts */
7090         msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
7091         if ((now.tv_sec - mb_start.tv_sec) > 0)
7092                 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
7093 }
7094
7095 static void
7096 mcl_audit_mcheck_panic(struct mbuf *m)
7097 {
7098         mcache_audit_t *mca;
7099
7100         MRANGE(m);
7101         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7102
7103         panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
7104             m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
7105         /* NOTREACHED */
7106 }
7107
7108 static void
7109 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
7110 {
7111         if (next != NULL && !MBUF_IN_MAP(next) &&
7112             (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
7113                 panic("mcl_audit: buffer %p modified after free at offset 0: "
7114                     "%p out of range [%p-%p)\n%s\n",
7115                     mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
7116                 /* NOTREACHED */
7117         }
7118 }
7119
7120 /* This function turns on mbuf leak detection */
7121 static void
7122 mleak_activate(void)
7123 {
7124         mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
7125         PE_parse_boot_argn("mleak_sample_factor",
7126             &mleak_table.mleak_sample_factor,
7127             sizeof (mleak_table.mleak_sample_factor));
7128
7129         if (mleak_table.mleak_sample_factor == 0)
7130                 mclfindleak = 0;
7131
7132         if (mclfindleak == 0)
7133                 return;
7134
7135         vm_size_t alloc_size =
7136             mleak_alloc_buckets * sizeof (struct mallocation);
7137         vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
7138
7139         MALLOC(mleak_allocations, struct mallocation *, alloc_size,
7140             M_TEMP, M_WAITOK | M_ZERO);
7141         VERIFY(mleak_allocations != NULL);
7142
7143         MALLOC(mleak_traces, struct mtrace *, trace_size,
7144             M_TEMP, M_WAITOK | M_ZERO);
7145         VERIFY(mleak_traces != NULL);
7146
7147         MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
7148             M_TEMP, M_WAITOK | M_ZERO);
7149         VERIFY(mleak_stat != NULL);
7150         mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
7151 #ifdef __LP64__
7152         mleak_stat->ml_isaddr64 = 1;
7153 #endif /* __LP64__ */
7154 }
7155
7156 static void
7157 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7158 {
7159         int temp;
7160
7161         if (mclfindleak == 0)
7162                 return;
7163
7164         if (!alloc)
7165                 return (mleak_free(addr));
7166
7167         temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
7168
7169         if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
7170                 uintptr_t bt[MLEAK_STACK_DEPTH];
7171                 int logged = backtrace(bt, MLEAK_STACK_DEPTH);
7172                 mleak_log(bt, addr, logged, num);
7173         }
7174 }
7175
7176 /*
7177  * This function records the allocation in the mleak_allocations table
7178  * and the backtrace in the mleak_traces table; if allocation slot is in use,
7179  * replace old allocation with new one if the trace slot is in use, return
7180  * (or increment refcount if same trace).
7181  */
7182 static boolean_t
7183 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
7184 {
7185         struct mallocation *allocation;
7186         struct mtrace *trace;
7187         uint32_t trace_index;
7188
7189         /* Quit if someone else modifying the tables */
7190         if (!lck_mtx_try_lock_spin(mleak_lock)) {
7191                 mleak_table.total_conflicts++;
7192                 return (FALSE);
7193         }
7194
7195         allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7196             mleak_alloc_buckets)];
7197         trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7198         trace = &mleak_traces[trace_index];
7199
7200         VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
7201         VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
7202
7203         allocation->hitcount++;
7204         trace->hitcount++;
7205
7206         /*
7207          * If the allocation bucket we want is occupied
7208          * and the occupier has the same trace, just bail.
7209          */
7210         if (allocation->element != NULL &&
7211             trace_index == allocation->trace_index) {
7212                 mleak_table.alloc_collisions++;
7213                 lck_mtx_unlock(mleak_lock);
7214                 return (TRUE);
7215         }
7216
7217         /*
7218          * Store the backtrace in the traces array;
7219          * Size of zero = trace bucket is free.
7220          */
7221         if (trace->allocs > 0 &&
7222             bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
7223                 /* Different, unique trace, but the same hash! Bail out. */
7224                 trace->collisions++;
7225                 mleak_table.trace_collisions++;
7226                 lck_mtx_unlock(mleak_lock);
7227                 return (TRUE);
7228         } else if (trace->allocs > 0) {
7229                 /* Same trace, already added, so increment refcount */
7230                 trace->allocs++;
7231         } else {
7232                 /* Found an unused trace bucket, so record the trace here */
7233                 if (trace->depth != 0) {
7234                         /* this slot previously used but not currently in use */
7235                         mleak_table.trace_overwrites++;
7236                 }
7237                 mleak_table.trace_recorded++;
7238                 trace->allocs = 1;
7239                 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
7240                 trace->depth = depth;
7241                 trace->collisions = 0;
7242         }
7243
7244         /* Step 2: Store the allocation record in the allocations array */
7245         if (allocation->element != NULL) {
7246                 /*
7247                  * Replace an existing allocation.  No need to preserve
7248                  * because only a subset of the allocations are being
7249                  * recorded anyway.
7250                  */
7251                 mleak_table.alloc_collisions++;
7252         } else if (allocation->trace_index != 0) {
7253                 mleak_table.alloc_overwrites++;
7254         }
7255         allocation->element = addr;
7256         allocation->trace_index = trace_index;
7257         allocation->count = num;
7258         mleak_table.alloc_recorded++;
7259         mleak_table.outstanding_allocs++;
7260
7261         lck_mtx_unlock(mleak_lock);
7262         return (TRUE);
7263 }
7264
7265 static void
7266 mleak_free(mcache_obj_t *addr)
7267 {
7268         while (addr != NULL) {
7269                 struct mallocation *allocation = &mleak_allocations
7270                     [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
7271
7272                 if (allocation->element == addr &&
7273                     allocation->trace_index < mleak_trace_buckets) {
7274                         lck_mtx_lock_spin(mleak_lock);
7275                         if (allocation->element == addr &&
7276                             allocation->trace_index < mleak_trace_buckets) {
7277                                 struct mtrace *trace;
7278                                 trace = &mleak_traces[allocation->trace_index];
7279                                 /* allocs = 0 means trace bucket is unused */
7280                                 if (trace->allocs > 0)
7281                                         trace->allocs--;
7282                                 if (trace->allocs == 0)
7283                                         trace->depth = 0;
7284                                 /* NULL element means alloc bucket is unused */
7285                                 allocation->element = NULL;
7286                                 mleak_table.outstanding_allocs--;
7287                         }
7288                         lck_mtx_unlock(mleak_lock);
7289                 }
7290                 addr = addr->obj_next;
7291         }
7292 }
7293
7294 static void
7295 mleak_sort_traces()
7296 {
7297         int i, j, k;
7298         struct mtrace *swap;
7299
7300         for(i = 0; i < MLEAK_NUM_TRACES; i++)
7301                 mleak_top_trace[i] = NULL;
7302
7303         for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
7304         {
7305                 if (mleak_traces[i].allocs <= 0)
7306                         continue;
7307
7308                 mleak_top_trace[j] = &mleak_traces[i];
7309                 for (k = j; k > 0; k--) {
7310                         if (mleak_top_trace[k]->allocs <=
7311                             mleak_top_trace[k-1]->allocs)
7312                                 break;
7313
7314                         swap = mleak_top_trace[k-1];
7315                         mleak_top_trace[k-1] = mleak_top_trace[k];
7316                         mleak_top_trace[k] = swap;
7317                 }
7318                 j++;
7319         }
7320
7321         j--;
7322         for(; i < mleak_trace_buckets; i++) {
7323                 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
7324                         continue;
7325
7326                 mleak_top_trace[j] = &mleak_traces[i];
7327
7328                 for (k = j; k > 0; k--) {
7329                         if (mleak_top_trace[k]->allocs <=
7330                             mleak_top_trace[k-1]->allocs)
7331                                 break;
7332
7333                         swap = mleak_top_trace[k-1];
7334                         mleak_top_trace[k-1] = mleak_top_trace[k];
7335                         mleak_top_trace[k] = swap;
7336                 }
7337         }
7338 }
7339
7340 static void
7341 mleak_update_stats()
7342 {
7343         mleak_trace_stat_t *mltr;
7344         int i;
7345
7346         VERIFY(mleak_stat != NULL);
7347 #ifdef __LP64__
7348         VERIFY(mleak_stat->ml_isaddr64);
7349 #else
7350         VERIFY(!mleak_stat->ml_isaddr64);
7351 #endif /* !__LP64__ */
7352         VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7353
7354         mleak_sort_traces();
7355
7356         mltr = &mleak_stat->ml_trace[0];
7357         bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
7358         for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7359                 int j;
7360
7361                 if (mleak_top_trace[i] == NULL ||
7362                     mleak_top_trace[i]->allocs == 0)
7363                         continue;
7364
7365                 mltr->mltr_collisions   = mleak_top_trace[i]->collisions;
7366                 mltr->mltr_hitcount     = mleak_top_trace[i]->hitcount;
7367                 mltr->mltr_allocs       = mleak_top_trace[i]->allocs;
7368                 mltr->mltr_depth        = mleak_top_trace[i]->depth;
7369
7370                 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7371                 for (j = 0; j < mltr->mltr_depth; j++)
7372                         mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7373
7374                 mltr++;
7375         }
7376 }
7377
7378 static struct mbtypes {
7379         int             mt_type;
7380         const char      *mt_name;
7381 } mbtypes[] = {
7382         { MT_DATA,      "data" },
7383         { MT_OOBDATA,   "oob data" },
7384         { MT_CONTROL,   "ancillary data" },
7385         { MT_HEADER,    "packet headers" },
7386         { MT_SOCKET,    "socket structures" },
7387         { MT_PCB,       "protocol control blocks" },
7388         { MT_RTABLE,    "routing table entries" },
7389         { MT_HTABLE,    "IMP host table entries" },
7390         { MT_ATABLE,    "address resolution tables" },
7391         { MT_FTABLE,    "fragment reassembly queue headers" },
7392         { MT_SONAME,    "socket names and addresses" },
7393         { MT_SOOPTS,    "socket options" },
7394         { MT_RIGHTS,    "access rights" },
7395         { MT_IFADDR,    "interface addresses" },
7396         { MT_TAG,       "packet tags" },
7397         { 0,            NULL }
7398 };
7399
/*
 * Bookkeeping after each snprintf() in mbuf_dump().  Expects the
 * caller's `k' to hold the value just returned by snprintf(), `c' to be
 * the write cursor, `clen' the room left, and a `done' label to exist.
 * Charges k against clen, jumps to `done' once less than one byte of
 * room remains, and otherwise advances the cursor by k.
 *
 * The body is wrapped in do { } while (0) so that "MBUF_DUMP_BUF_CHK();"
 * forms exactly one statement and remains correct inside an unbraced
 * if/else.
 */
#define MBUF_DUMP_BUF_CHK() do {        \
        clen -= k;                      \
        if (clen < 1)                   \
                goto done;              \
        c += k;                         \
} while (0)
7406
7407 static char *
7408 mbuf_dump(void)
7409 {
7410         unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
7411             totreturned = 0;
7412         u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
7413         u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
7414         u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
7415         int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
7416         uint8_t seen[256];
7417         struct mbtypes *mp;
7418         mb_class_stat_t *sp;
7419         mleak_trace_stat_t *mltr;
7420         char *c = mbuf_dump_buf;
7421         int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
7422
7423         mbuf_dump_buf[0] = '\0';
7424
7425         /* synchronize all statistics in the mbuf table */
7426         mbuf_stat_sync();
7427         mbuf_mtypes_sync(TRUE);
7428
7429         sp = &mb_stat->mbs_class[0];
7430         for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7431                 u_int32_t mem;
7432
7433                 if (m_class(i) == MC_MBUF) {
7434                         m_mbufs = sp->mbcl_active;
7435                 } else if (m_class(i) == MC_CL) {
7436                         m_clfree = sp->mbcl_total - sp->mbcl_active;
7437                 } else if (m_class(i) == MC_BIGCL) {
7438                         m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7439                 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7440                         m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7441                         m_16kclusters = sp->mbcl_total;
7442                 } else if (m_class(i) == MC_MBUF_CL) {
7443                         m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7444                 } else if (m_class(i) == MC_MBUF_BIGCL) {
7445                         m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7446                 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7447                         m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7448                 }
7449
7450                 mem = sp->mbcl_ctotal * sp->mbcl_size;
7451                 totmem += mem;
7452                 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7453                     sp->mbcl_size;
7454                 totreturned += sp->mbcl_release_cnt;
7455
7456         }
7457
7458         /* adjust free counts to include composite caches */
7459         m_clfree += m_mbufclfree;
7460         m_bigclfree += m_mbufbigclfree;
7461         m_16kclfree += m_mbuf16kclfree;
7462
7463         totmbufs = 0;
7464         for (mp = mbtypes; mp->mt_name != NULL; mp++)
7465                 totmbufs += mbstat.m_mtypes[mp->mt_type];
7466         if (totmbufs > m_mbufs)
7467                 totmbufs = m_mbufs;
7468         k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7469         MBUF_DUMP_BUF_CHK();
7470
7471         bzero(&seen, sizeof (seen));
7472         for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7473                 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7474                         seen[mp->mt_type] = 1;
7475                         k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7476                             mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7477                         MBUF_DUMP_BUF_CHK();
7478                 }
7479         }
7480         seen[MT_FREE] = 1;
7481         for (i = 0; i < nmbtypes; i++)
7482                 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7483                         k = snprintf(c, clen, "\t%u mbufs allocated to "
7484                             "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7485                         MBUF_DUMP_BUF_CHK();
7486                 }
7487         if ((m_mbufs - totmbufs) > 0) {
7488                 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7489                     m_mbufs - totmbufs);
7490                 MBUF_DUMP_BUF_CHK();
7491         }
7492         k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7493             "%u/%u mbuf 4KB clusters in use\n",
7494             (unsigned int)(mbstat.m_clusters - m_clfree),
7495             (unsigned int)mbstat.m_clusters,
7496             (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7497             (unsigned int)mbstat.m_bigclusters);
7498         MBUF_DUMP_BUF_CHK();
7499
7500         if (njcl > 0) {
7501                 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7502                     m_16kclusters - m_16kclfree, m_16kclusters,
7503                     njclbytes / 1024);
7504                 MBUF_DUMP_BUF_CHK();
7505         }
7506         totused = totmem - totfree;
7507         if (totmem == 0) {
7508                 totpct = 0;
7509         } else if (totused < (ULONG_MAX / 100)) {
7510                 totpct = (totused * 100) / totmem;
7511         } else {
7512                 u_long totmem1 = totmem / 100;
7513                 u_long totused1 = totused / 100;
7514                 totpct = (totused1 * 100) / totmem1;
7515         }
7516         k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7517             "in use)\n", totmem / 1024, totpct);
7518         MBUF_DUMP_BUF_CHK();
7519         k = snprintf(c, clen, "%lu KB returned to the system\n",
7520             totreturned / 1024);
7521         MBUF_DUMP_BUF_CHK();
7522
7523         net_update_uptime();
7524         k = snprintf(c, clen,
7525             "VM allocation failures: contiguous %u, normal %u, one page %u\n",
7526             mb_kmem_contig_failed, mb_kmem_failed, mb_kmem_one_failed);
7527         MBUF_DUMP_BUF_CHK();
7528         if (mb_kmem_contig_failed_ts || mb_kmem_failed_ts ||
7529             mb_kmem_one_failed_ts) {
7530                 k = snprintf(c, clen,
7531                     "VM allocation failure timestamps: contiguous %llu "
7532                     "(size %llu), normal %llu (size %llu), one page %llu "
7533                     "(now %llu)\n",
7534                     mb_kmem_contig_failed_ts, mb_kmem_contig_failed_size,
7535                     mb_kmem_failed_ts, mb_kmem_failed_size,
7536                     mb_kmem_one_failed_ts, net_uptime());
7537                 MBUF_DUMP_BUF_CHK();
7538                 k = snprintf(c, clen,
7539                     "VM return codes: ");
7540                 MBUF_DUMP_BUF_CHK();
7541                 for (i = 0;
7542                      i < sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]);
7543                      i++) {
7544                         k = snprintf(c, clen, "%s: %u ", mb_kmem_stats_labels[i],
7545                             mb_kmem_stats[i]);
7546                         MBUF_DUMP_BUF_CHK();
7547                 }
7548                 k = snprintf(c, clen, "\n");
7549                 MBUF_DUMP_BUF_CHK();
7550         }
7551         k = snprintf(c, clen,
7552             "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
7553             "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
7554             mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
7555             mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
7556             mb_expand_16kcl_total);
7557         MBUF_DUMP_BUF_CHK();
7558         if (mbuf_worker_last_runtime != 0) {
7559                 k = snprintf(c, clen, "worker thread last run time: "
7560                     "%llu (%llu seconds ago)\n",
7561                     mbuf_worker_last_runtime,
7562                     net_uptime() - mbuf_worker_last_runtime);
7563                 MBUF_DUMP_BUF_CHK();
7564         }
7565
7566         k = snprintf(c, clen, "\nlargest allocation failure backtraces:\n");
7567         MBUF_DUMP_BUF_CHK();
7568
7569         for (j = 0; j < MTRACELARGE_NUM_TRACES; j++) {
7570                 struct mtracelarge *trace = &mtracelarge_table[j];
7571                 if (trace->size == 0 || trace->depth == 0)
7572                         continue;
7573                 k = snprintf(c, clen, "size %llu: < ", trace->size);
7574                 MBUF_DUMP_BUF_CHK();
7575                 for (i = 0; i < trace->depth; i++) {
7576                         if (mleak_stat->ml_isaddr64) {
7577                                 k = snprintf(c, clen, "0x%0llx ",
7578                                     (uint64_t)VM_KERNEL_UNSLIDE(
7579                                             trace->addr[i]));
7580                         } else {
7581                                 k = snprintf(c, clen,
7582                                     "0x%08x ",
7583                                     (uint32_t)VM_KERNEL_UNSLIDE(
7584                                             trace->addr[i]));
7585                         }
7586                         MBUF_DUMP_BUF_CHK();
7587                 }
7588                 k = snprintf(c, clen, ">\n");
7589                 MBUF_DUMP_BUF_CHK();
7590         }
7591
7592         /* mbuf leak detection statistics */
7593         mleak_update_stats();
7594
7595         k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7596         MBUF_DUMP_BUF_CHK();
7597         k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7598             mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7599             mleak_table.mleak_sample_factor);
7600         MBUF_DUMP_BUF_CHK();
7601         k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7602             mleak_table.outstanding_allocs);
7603         MBUF_DUMP_BUF_CHK();
7604         k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7605             mleak_table.alloc_recorded, mleak_table.trace_recorded);
7606         MBUF_DUMP_BUF_CHK();
7607         k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7608             mleak_table.alloc_collisions, mleak_table.trace_collisions);
7609         MBUF_DUMP_BUF_CHK();
7610         k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7611             mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7612         MBUF_DUMP_BUF_CHK();
7613         k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7614             mleak_table.total_conflicts);
7615         MBUF_DUMP_BUF_CHK();
7616
7617         k = snprintf(c, clen, "top %d outstanding traces:\n",
7618             mleak_stat->ml_cnt);
7619         MBUF_DUMP_BUF_CHK();
7620         for (i = 0; i < mleak_stat->ml_cnt; i++) {
7621                 mltr = &mleak_stat->ml_trace[i];
7622                 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7623                     "%llu hit(s), %llu collision(s)\n", (i + 1),
7624                     mltr->mltr_allocs, mltr->mltr_hitcount,
7625                     mltr->mltr_collisions);
7626                 MBUF_DUMP_BUF_CHK();
7627         }
7628
7629         if (mleak_stat->ml_isaddr64)
7630                 k = snprintf(c, clen, MB_LEAK_HDR_64);
7631         else
7632                 k = snprintf(c, clen, MB_LEAK_HDR_32);
7633         MBUF_DUMP_BUF_CHK();
7634
7635         for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7636                 k = snprintf(c, clen, "%2d: ", (i + 1));
7637                 MBUF_DUMP_BUF_CHK();
7638                 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7639                         mltr = &mleak_stat->ml_trace[j];
7640                         if (i < mltr->mltr_depth) {
7641                                 if (mleak_stat->ml_isaddr64) {
7642                                         k = snprintf(c, clen, "0x%0llx  ",
7643                                             (uint64_t)VM_KERNEL_UNSLIDE(
7644                                                 mltr->mltr_addr[i]));
7645                                 } else {
7646                                         k = snprintf(c, clen,
7647                                             "0x%08x  ",
7648                                             (uint32_t)VM_KERNEL_UNSLIDE(
7649                                                 mltr->mltr_addr[i]));
7650                                 }
7651                         } else {
7652                                 if (mleak_stat->ml_isaddr64)
7653                                         k = snprintf(c, clen,
7654                                             MB_LEAK_SPACING_64);
7655                                 else
7656                                         k = snprintf(c, clen,
7657                                             MB_LEAK_SPACING_32);
7658                         }
7659                         MBUF_DUMP_BUF_CHK();
7660                 }
7661                 k = snprintf(c, clen, "\n");
7662                 MBUF_DUMP_BUF_CHK();
7663         }
7664 done:
7665         return (mbuf_dump_buf);
7666 }
7667
7668 #undef MBUF_DUMP_BUF_CHK
7669
7670 /*
7671  * Convert between a regular and a packet header mbuf.  Caller is responsible
7672  * for setting or clearing M_PKTHDR; this routine does the rest of the work.
7673  */
7674 int
7675 m_reinit(struct mbuf *m, int hdr)
7676 {
7677         int ret = 0;
7678
7679         if (hdr) {
7680                 VERIFY(!(m->m_flags & M_PKTHDR));
7681                 if (!(m->m_flags & M_EXT) &&
7682                     (m->m_data != m->m_dat || m->m_len > 0)) {
7683                         /*
7684                          * If there's no external cluster attached and the
7685                          * mbuf appears to contain user data, we cannot
7686                          * safely convert this to a packet header mbuf,
7687                          * as the packet header structure might overlap
7688                          * with the data.
7689                          */
7690                         printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7691                             "m_data %llx (expected %llx), "
7692                             "m_len %d (expected 0)\n",
7693                             __func__,
7694                             (uint64_t)VM_KERNEL_ADDRPERM(m),
7695                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7696                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7697                         ret = EBUSY;
7698                 } else {
7699                         VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7700                         m->m_flags |= M_PKTHDR;
7701                         MBUF_INIT_PKTHDR(m);
7702                 }
7703         } else {
7704                 /* Check for scratch area overflow */
7705                 m_redzone_verify(m);
7706                 /* Free the aux data and tags if there is any */
7707                 m_tag_delete_chain(m, NULL);
7708                 m->m_flags &= ~M_PKTHDR;
7709         }
7710
7711         return (ret);
7712 }
7713
7714 int
7715 m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
7716 {
7717         ASSERT(m->m_flags & M_EXT);
7718         return (atomic_test_set_32(&MEXT_PRIV(m), o, n));
7719 }
7720
7721 uint32_t
7722 m_ext_get_prop(struct mbuf *m)
7723 {
7724         ASSERT(m->m_flags & M_EXT);
7725         return (MEXT_PRIV(m));
7726 }
7727
/*
 * Report whether the mbuf is active.  An mbuf that is not paired is
 * always reported as active (1); a paired mbuf is active only while its
 * MEXT_PREF count exceeds MEXT_MINREF.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m))
		return (1);

	return (MEXT_PREF(m) > MEXT_MINREF(m));
}
7733
7734 void
7735 m_ext_paired_activate(struct mbuf *m)
7736 {
7737         struct ext_ref *rfa;
7738         int hdr, type;
7739         caddr_t extbuf;
7740         m_ext_free_func_t extfree;
7741         u_int extsize;
7742
7743         VERIFY(MBUF_IS_PAIRED(m));
7744         VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
7745         VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
7746
7747         hdr = (m->m_flags & M_PKTHDR);
7748         type = m->m_type;
7749         extbuf = m->m_ext.ext_buf;
7750         extfree = m_get_ext_free(m);
7751         extsize = m->m_ext.ext_size;
7752         rfa = m_get_rfa(m);
7753
7754         VERIFY(extbuf != NULL && rfa != NULL);
7755
7756         /*
7757          * Safe to reinitialize packet header tags, since it's
7758          * already taken care of at m_free() time.  Similar to
7759          * what's done in m_clattach() for the cluster.  Bump
7760          * up MEXT_PREF to indicate activation.
7761          */
7762         MBUF_INIT(m, hdr, type);
7763         MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
7764             1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
7765 }
7766
7767 void
7768 m_scratch_init(struct mbuf *m)
7769 {
7770         struct pkthdr *pkt = &m->m_pkthdr;
7771
7772         VERIFY(m->m_flags & M_PKTHDR);
7773
7774         /* See comments in <rdar://problem/14040693> */
7775         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7776                 panic_plain("Invalid attempt to modify guarded module-private "
7777                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7778                 /* NOTREACHED */
7779         }
7780
7781         bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7782 }
7783
7784 /*
7785  * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7786  * xnu that intend on utilizing the module-private area should directly
7787  * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
7788  * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7789  * to handing it off to another module, respectively.
7790  */
7791 u_int32_t
7792 m_scratch_get(struct mbuf *m, u_int8_t **p)
7793 {
7794         struct pkthdr *pkt = &m->m_pkthdr;
7795
7796         VERIFY(m->m_flags & M_PKTHDR);
7797
7798         /* See comments in <rdar://problem/14040693> */
7799         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7800                 panic_plain("Invalid attempt to access guarded module-private "
7801                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7802                 /* NOTREACHED */
7803         }
7804
7805         if (mcltrace) {
7806                 mcache_audit_t *mca;
7807
7808                 lck_mtx_lock(mbuf_mlock);
7809                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7810                 if (mca->mca_uflags & MB_SCVALID)
7811                         mcl_audit_scratch(mca);
7812                 lck_mtx_unlock(mbuf_mlock);
7813         }
7814
7815         *p = (u_int8_t *)&pkt->pkt_mpriv;
7816         return (sizeof (pkt->pkt_mpriv));
7817 }
7818
7819 static void
7820 m_redzone_init(struct mbuf *m)
7821 {
7822         VERIFY(m->m_flags & M_PKTHDR);
7823         /*
7824          * Each mbuf has a unique red zone pattern, which is a XOR
7825          * of the red zone cookie and the address of the mbuf.
7826          */
7827         m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7828 }
7829
7830 static void
7831 m_redzone_verify(struct mbuf *m)
7832 {
7833         u_int32_t mb_redzone;
7834
7835         VERIFY(m->m_flags & M_PKTHDR);
7836
7837         mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7838         if (m->m_pkthdr.redzone != mb_redzone) {
7839                 panic("mbuf %p redzone violation with value 0x%x "
7840                     "(instead of 0x%x, using cookie 0x%x)\n",
7841                     m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7842                 /* NOTREACHED */
7843         }
7844 }
7845
7846 __private_extern__ inline void
7847 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
7848     caddr_t ext_arg)
7849 {
7850         VERIFY(m->m_flags & M_EXT);
7851         if (rfa != NULL) {
7852                 m->m_ext.ext_refflags =
7853                     (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
7854                 if (ext_free != NULL) {
7855                         rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
7856                             mb_obscure_extfree;
7857                         m->m_ext.ext_free = (m_ext_free_func_t)
7858                             (((uintptr_t)ext_free) ^ rfa->ext_token);
7859                         if (ext_arg != NULL) {
7860                                 m->m_ext.ext_arg =
7861                                     (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
7862                         } else {
7863                                 m->m_ext.ext_arg = NULL;
7864                         }
7865                 } else {
7866                         rfa->ext_token = 0;
7867                         m->m_ext.ext_free = NULL;
7868                         m->m_ext.ext_arg = NULL;
7869                 }
7870         } else {
7871                 /*
7872                  * If we are going to loose the cookie in ext_token by
7873                  * resetting the rfa, we should use the global cookie
7874                  * to obscure the ext_free and ext_arg pointers.
7875                  */
7876                 if (ext_free != NULL) {
7877                         m->m_ext.ext_free =
7878                             (m_ext_free_func_t)((uintptr_t)ext_free ^
7879                             mb_obscure_extfree);
7880                         if (ext_arg != NULL) {
7881                                 m->m_ext.ext_arg =
7882                                     (caddr_t)((uintptr_t)ext_arg ^
7883                                     mb_obscure_extfree);
7884                         } else {
7885                                 m->m_ext.ext_arg = NULL;
7886                         }
7887                 } else {
7888                         m->m_ext.ext_free = NULL;
7889                         m->m_ext.ext_arg = NULL;
7890                 }
7891                 m->m_ext.ext_refflags = NULL;
7892         }
7893 }
7894
7895 __private_extern__ inline struct ext_ref *
7896 m_get_rfa(struct mbuf *m)
7897 {
7898         if (m->m_ext.ext_refflags == NULL)
7899                 return (NULL);
7900         else
7901                 return ((struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref));
7902 }
7903
7904 __private_extern__ inline m_ext_free_func_t
7905 m_get_ext_free(struct mbuf *m)
7906 {
7907         struct ext_ref *rfa;
7908         if (m->m_ext.ext_free == NULL)
7909                 return (NULL);
7910
7911         rfa = m_get_rfa(m);
7912         if (rfa == NULL)
7913                 return ((m_ext_free_func_t)((uintptr_t)m->m_ext.ext_free ^ mb_obscure_extfree));
7914         else
7915                 return ((m_ext_free_func_t)(((uintptr_t)m->m_ext.ext_free)
7916                     ^ rfa->ext_token));
7917 }
7918
7919 __private_extern__ inline caddr_t
7920 m_get_ext_arg(struct mbuf *m)
7921 {
7922         struct ext_ref *rfa;
7923         if (m->m_ext.ext_arg == NULL)
7924                 return (NULL);
7925
7926         rfa = m_get_rfa(m);
7927         if (rfa == NULL) {
7928                 return ((caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree));
7929         } else {
7930                 return ((caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
7931                     rfa->ext_token));
7932         }
7933 }
7934
7935 /*
7936  * Send a report of mbuf usage if the usage is at least 6% of max limit
7937  * or if there has been at least 3% increase since the last report.
7938  *
7939  * The values 6% and 3% are chosen so that we can do simple arithmetic
7940  * with shift operations.
7941  */
7942 static boolean_t
7943 mbuf_report_usage(mbuf_class_t cl)
7944 {
7945         /* if a report is already in progress, nothing to do */
7946         if (mb_peak_newreport)
7947                 return (TRUE);
7948
7949         if (m_total(cl) > m_peak(cl) &&
7950             m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7951             (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7952                 return (TRUE);
7953         return (FALSE);
7954 }
7955
7956 __private_extern__ void
7957 mbuf_report_peak_usage(void)
7958 {
7959         int i = 0;
7960         u_int64_t uptime;
7961         struct nstat_sysinfo_data ns_data;
7962         uint32_t memreleased = 0;
7963         static uint32_t prevmemreleased;
7964
7965         uptime = net_uptime();
7966         lck_mtx_lock(mbuf_mlock);
7967
7968         /* Generate an initial report after 1 week of uptime */
7969         if (!mb_peak_firstreport &&
7970             uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7971                 mb_peak_newreport = TRUE;
7972                 mb_peak_firstreport = TRUE;
7973         }
7974
7975         if (!mb_peak_newreport) {
7976                 lck_mtx_unlock(mbuf_mlock);
7977                 return;
7978         }
7979
7980         /*
7981          * Since a report is being generated before 1 week,
7982          * we do not need to force another one later
7983          */
7984         if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7985                 mb_peak_firstreport = TRUE;
7986
7987         for (i = 0; i < NELEM(mbuf_table); i++) {
7988                 m_peak(m_class(i)) = m_total(m_class(i));
7989                 memreleased += m_release_cnt(i);
7990         }
7991         memreleased = memreleased - prevmemreleased;
7992         prevmemreleased = memreleased;
7993         mb_peak_newreport = FALSE;
7994         lck_mtx_unlock(mbuf_mlock);
7995
7996         bzero(&ns_data, sizeof(ns_data));
7997         ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7998         ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7999         ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
8000         ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
8001         ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
8002         ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
8003         ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
8004         ns_data.u.mb_stats.draincnt = mbstat.m_drain;
8005         ns_data.u.mb_stats.memreleased = memreleased;
8006         ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
8007
8008         nstat_sysinfo_send_data(&ns_data);
8009
8010         /*
8011          * Reset the floor whenever we report a new
8012          * peak to track the trend (increase peek usage
8013          * is not a leak if mbufs get released
8014          * between reports and the floor stays low)
8015          */
8016         total_sbmb_cnt_floor = total_sbmb_cnt_peak;
8017 }
8018
8019 /*
8020  * Called by the VM when there's memory pressure.
8021  */
8022 __private_extern__ void
8023 m_drain(void)
8024 {
8025         mbuf_class_t mc;
8026         mcl_slab_t *sp, *sp_tmp, *nsp;
8027         unsigned int num, k, interval, released = 0;
8028         unsigned long total_mem = 0, use_mem = 0;
8029         boolean_t ret, purge_caches = FALSE;
8030         ppnum_t offset;
8031         mcache_obj_t *obj;
8032         unsigned long per;
8033         static uint64_t last_drain = 0;
8034         static unsigned char scratch[32];
8035         static ppnum_t scratch_pa = 0;
8036
8037         if (mb_drain_maxint == 0 || mb_waiters)
8038                 return;
8039         if (scratch_pa == 0) {
8040                 bzero(scratch, sizeof(scratch));
8041                 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
8042                 VERIFY(scratch_pa);
8043         } else if (mclverify) {
8044                 /*
8045                  * Panic if a driver wrote to our scratch memory.
8046                  */
8047                 for (k = 0; k < sizeof(scratch); k++)
8048                         if (scratch[k])
8049                                 panic("suspect DMA to freed address");
8050         }
8051         /*
8052          * Don't free memory too often as that could cause excessive
8053          * waiting times for mbufs.  Purge caches if we were asked to drain
8054          * in the last 5 minutes.
8055          */
8056         lck_mtx_lock(mbuf_mlock);
8057         if (last_drain == 0) {
8058                 last_drain = net_uptime();
8059                 lck_mtx_unlock(mbuf_mlock);
8060                 return;
8061         }
8062         interval = net_uptime() - last_drain;
8063         if (interval <= mb_drain_maxint) {
8064                 lck_mtx_unlock(mbuf_mlock);
8065                 return;
8066         }
8067         if (interval <= mb_drain_maxint * 5)
8068                 purge_caches = TRUE;
8069         last_drain = net_uptime();
8070         /*
8071          * Don't free any memory if we're using 60% or more.
8072          */
8073         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8074                 total_mem += m_total(mc) * m_maxsize(mc);
8075                 use_mem += m_active(mc) * m_maxsize(mc);
8076         }
8077         per = (use_mem * 100) / total_mem;
8078         if (per >= 60) {
8079                 lck_mtx_unlock(mbuf_mlock);
8080                 return;
8081         }
8082         /*
8083          * Purge all the caches.  This effectively disables
8084          * caching for a few seconds, but the mbuf worker thread will
8085          * re-enable them again.
8086          */
8087         if (purge_caches == TRUE)
8088                 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8089                         if (m_total(mc) < m_avgtotal(mc))
8090                                 continue;
8091                         lck_mtx_unlock(mbuf_mlock);
8092                         ret = mcache_purge_cache(m_cache(mc), FALSE);
8093                         lck_mtx_lock(mbuf_mlock);
8094                         if (ret == TRUE)
8095                                 m_purge_cnt(mc)++;
8096                 }
8097         /*
8098          * Move the objects from the composite class freelist to
8099          * the rudimentary slabs list, but keep at least 10% of the average
8100          * total in the freelist.
8101          */
8102         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8103                 while (m_cobjlist(mc) &&
8104                     m_total(mc) < m_avgtotal(mc) &&
8105                     m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8106                         obj = m_cobjlist(mc);
8107                         m_cobjlist(mc) = obj->obj_next;
8108                         obj->obj_next = NULL;
8109                         num = cslab_free(mc, obj, 1);
8110                         VERIFY(num == 1);
8111                         m_free_cnt(mc)++;
8112                         m_infree(mc)--;
8113                         /* cslab_free() handles m_total */
8114                 }
8115         }
8116         /*
8117          * Free the buffers present in the slab list up to 10% of the total
8118          * average per class.
8119          *
8120          * We walk the list backwards in an attempt to reduce fragmentation.
8121          */
8122         for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8123                 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8124                         /*
8125                          * Process only unused slabs occupying memory.
8126                          */
8127                         if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8128                             sp->sl_base == NULL)
8129                                 continue;
8130                         if (m_total(mc) < m_avgtotal(mc) ||
8131                             m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
8132                                 break;
8133                         slab_remove(sp, mc);
8134                         switch (mc) {
8135                         case MC_MBUF:
8136                                 m_infree(mc) -= NMBPG;
8137                                 m_total(mc) -= NMBPG;
8138                                 if (mclaudit != NULL)
8139                                         mcl_audit_free(sp->sl_base, NMBPG);
8140                                 break;
8141                         case MC_CL:
8142                                 m_infree(mc) -= NCLPG;
8143                                 m_total(mc) -= NCLPG;
8144                                 if (mclaudit != NULL)
8145                                         mcl_audit_free(sp->sl_base, NMBPG);
8146                                 break;
8147                         case MC_BIGCL:
8148                         {
8149                                 m_infree(mc) -= NBCLPG;
8150                                 m_total(mc) -= NBCLPG;
8151                                 if (mclaudit != NULL)
8152                                         mcl_audit_free(sp->sl_base, NMBPG);
8153                                 break;
8154                         }
8155                         case MC_16KCL:
8156                                 m_infree(mc)--;
8157                                 m_total(mc)--;
8158                                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8159                                         nsp = nsp->sl_next;
8160                                         VERIFY(nsp->sl_refcnt == 0 &&
8161                                             nsp->sl_base != NULL &&
8162                                             nsp->sl_len == 0);
8163                                         slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8164                                             0);
8165                                         nsp->sl_flags = 0;
8166                                 }
8167                                 if (mclaudit != NULL) {
8168                                         if (sp->sl_len == PAGE_SIZE) {
8169                                                 mcl_audit_free(sp->sl_base,
8170                                                     NMBPG);
8171                                         } else {
8172                                                 mcl_audit_free(sp->sl_base, 1);
8173                                         }
8174                                 }
8175                                 break;
8176                         default:
8177                                 /*
8178                                  * The composite classes have their own
8179                                  * freelist (m_cobjlist), so we only
8180                                  * process rudimentary classes here.
8181                                  */
8182                                 VERIFY(0);
8183                         }
8184                         m_release_cnt(mc) += m_size(mc);
8185                         released += m_size(mc);
8186                         VERIFY(sp->sl_base != NULL &&
8187                             sp->sl_len >= PAGE_SIZE);
8188                         offset = MTOPG(sp->sl_base);
8189                         /*
8190                          * Make sure the IOMapper points to a valid, but
8191                          * bogus, address.  This should prevent further DMA
8192                          * accesses to freed memory.
8193                          */
8194                         IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8195                         mcl_paddr[offset] = 0;
8196                         kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8197                             sp->sl_len);
8198                         slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8199                         sp->sl_flags = 0;
8200                 }
8201         }
8202         mbstat.m_drain++;
8203         mbstat.m_bigclusters = m_total(MC_BIGCL);
8204         mbstat.m_clusters = m_total(MC_CL);
8205         mbstat.m_mbufs = m_total(MC_MBUF);
8206         mbuf_stat_sync();
8207         mbuf_mtypes_sync(TRUE);
8208         lck_mtx_unlock(mbuf_mlock);
8209 }
8210
8211 static int
8212 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8213 {
8214 #pragma unused(arg1, arg2)
8215         int val = 0, err;
8216
8217         err = sysctl_handle_int(oidp, &val, 0, req);
8218         if (err != 0 || req->newptr == USER_ADDR_NULL)
8219                 return (err);
8220         if (val) {
8221                 lck_mtx_lock(mbuf_mlock);
8222                 printf("%s\n", mbuf_dump());
8223                 lck_mtx_unlock(mbuf_mlock);
8224                 m_drain();
8225         }
8226
8227         return (err);
8228 }
8229
8230 #if DEBUG || DEVELOPMENT
8231
8232 static int mbtest_val;
8233 static int mbtest_running;
8234
8235 static void mbtest_thread(__unused void *arg)
8236 {
8237         int i;
8238         int scale_down = 1;
8239         int iterations = 250;
8240         int allocations = nmbclusters;
8241         iterations = iterations / scale_down;
8242         allocations = allocations / scale_down;
8243         printf("%s thread starting\n", __func__);
8244         for (i = 0; i < iterations; i++) {
8245                 unsigned int needed = allocations;
8246                 struct mbuf *m1, *m2, *m3;
8247
8248                 if (njcl > 0) {
8249                         needed = allocations;
8250                         m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES);
8251                         m_freem_list(m3);
8252                 }
8253
8254                 needed = allocations;
8255                 m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES);
8256                 m_freem_list(m2);
8257
8258                 m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES);
8259                 m_freem_list(m1);
8260         }
8261
8262         printf("%s thread ending\n", __func__);
8263
8264         OSDecrementAtomic(&mbtest_running);
8265         wakeup_one((caddr_t)&mbtest_running);
8266 }
8267
8268 static void sysctl_mbtest(void)
8269 {
8270         /* We launch three threads - wait for all of them */
8271         OSIncrementAtomic(&mbtest_running);
8272         OSIncrementAtomic(&mbtest_running);
8273         OSIncrementAtomic(&mbtest_running);
8274
8275         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8276         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8277         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8278
8279         while (mbtest_running) {
8280                 msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
8281         }
8282 }
8283
8284 static int
8285 mbtest SYSCTL_HANDLER_ARGS
8286 {
8287 #pragma unused(arg1, arg2)
8288         int error = 0, val, oldval = mbtest_val;
8289
8290         val = oldval;
8291         error = sysctl_handle_int(oidp, &val, 0, req);
8292         if (error || !req->newptr)
8293                 return (error);
8294
8295         if (val != oldval)
8296                 sysctl_mbtest();
8297
8298         mbtest_val = val;
8299
8300         return (error);
8301 }
8302 #endif
8303
8304
8305 static void
8306 mtracelarge_register(size_t size)
8307 {
8308         int i;
8309         struct mtracelarge *trace;
8310         uintptr_t bt[MLEAK_STACK_DEPTH];
8311         unsigned int depth;
8312
8313         depth = backtrace(bt, MLEAK_STACK_DEPTH);
8314         /* Check if this entry is already on the list. */
8315         for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8316                 trace = &mtracelarge_table[i];
8317                 if (trace->size == size && trace->depth == depth &&
8318                     memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
8319                         return;
8320                 }
8321
8322         }
8323         for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8324                 trace = &mtracelarge_table[i];
8325                 if (size > trace->size) {
8326                         trace->depth = depth;
8327                         memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
8328                         trace->size = size;
8329                         break;
8330                 }
8331         }
8332 }
8333
8334 SYSCTL_DECL(_kern_ipc);
8335 #if DEBUG || DEVELOPMENT
8336 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest,
8337     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &mbtest_val, 0, &mbtest, "I",
8338     "Toggle to test mbufs");
8339 #endif
8340 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
8341     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8342     0, 0, mbstat_sysctl, "S,mbstat", "");
8343 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
8344     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8345     0, 0, mb_stat_sysctl, "S,mb_stat", "");
8346 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
8347     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8348     0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
8349 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
8350     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8351     0, 0, mleak_table_sysctl, "S,mleak_table", "");
8352 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
8353     CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
8354 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
8355     CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
8356 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
8357     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
8358 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
8359     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
8360     m_drain_force_sysctl, "I",
8361     "Forces the mbuf garbage collection to run");
8362 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
8363     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
8364     "Minimum time interval between garbage collection");