/*
 * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>

#include <dev/random/randomdev.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>
#include <kern/zalloc.h>

#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <security/mac_framework.h>

#include <sys/mcache.h>
#include <net/ntstat.h>
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents a mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 *	object represents a mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transaction.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *		|	^
 *		|	+-----------------------+
 *		v				|
 *	mcache_alloc/mcache_alloc_ext()	  mbuf_slab_audit()
 *		|				^
 *		v				|
 *	   [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	   mbuf_slab_alloc()			|
 *		|				|
 *		v				|
 *	+---------> [freelist] -------> (found?) -------+
 *	|		|
 *	|		v
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *		|	^
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_alloc/mcache_alloc_ext()	  mbuf_cslab_audit()
 *		|				^
 *		v				|
 *	   [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	   mbuf_cslab_alloc()			|
 *		|				|
 *		v				|
 *	   [freelist] -------> (found?) -------+
 *		|				|
 *		v				|
 *	(rudimentary object)			|
 *	mcache_alloc/mcache_alloc_ext() ------>>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against DEADBEEF (free) pattern before returning them to caller.
 * As part of this step, the routine will also record the transaction and
 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 * also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *		|	^
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	   mbuf_slab_audit()			|
 *		|				|
 *		v				|
 *	   [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	   mbuf_slab_free()			|
 *		|				|
 *		v				|
 *	   [freelist] ----------->>------------+
 *	(objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *		|	^
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	   mbuf_cslab_audit()			|
 *		|				|
 *		v				|
 *	   [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	   mbuf_cslab_free()			|
 *		|				|
 *		v				|
 *	   [freelist] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	(rudimentary object)			|
 *	mcache_free/mcache_free_ext() ------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relevant to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOBG(addr)			+-------------+
 *	      |			+----->	| cl_audit[1] | -----> mcache_audit_t
 *	b = BGTOM(i)		|	+-------------+
 *	      |			|	|     ...     |
 *	x = MCLIDX(b, addr)	|	+-------------+
 *	      |			|	| cl_audit[7] |
 *	      +-----------------+	+-------------+
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a page
 * can be turned into NMBPBG number of mbufs, we preserve enough space for the
 * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For 16KB cluster, only one entry from the first
 * page is allocated and used for the entire object.
 */
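
/*
 * Illustrative sketch (added for clarity, not from the original sources):
 * with auditing enabled, the index arithmetic pictured above maps an
 * object address to its audit record roughly as follows, using the
 * MTOBG()/BGTOM()/MCLIDX() macros defined further below:
 *
 *	int i = MTOBG(addr);			// 4KB cluster index
 *	union mbigcluster *b = BGTOM(i);	// base of that cluster
 *	int x = MCLIDX(b, addr);		// mbuf slot within the cluster
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[x];
 */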
/* TODO: should be in header file */
/* kernel translater */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

decl_lck_mtx_data(static, mbuf_mlock_data);
static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;

/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized; /* number of packets "normalized" */
#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */

typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4KB) cluster */
	MC_16KCL,	/* Jumbo (16KB) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)

/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back to class's slab list, if
 * it's not already done.
 *
 * Compartmentalizing of the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 *
 * Each slab controls a page of memory.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */

/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */
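
/*
 * Worked example (added for clarity, assuming MBSHIFT == 20 and
 * PGSHIFT == 12): NSLABSPMB evaluates to (1 << 20) >> 12 == 256, i.e.
 * each slab group tracks 256 page-sized slabs covering 1MB of kernel
 * memory, matching the "256 slabs/grp" note above.
 */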
typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;

/*
 * Number of slabs needed to control a 16KB cluster object.
 */
#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)

/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
} mcl_audit_t;

typedef struct {
	struct thread	*msa_thread;	/* thread doing transaction */
	struct thread	*msa_pthread;	/* previous transaction thread */
	uint32_t	msa_tstamp;	/* transaction timestamp (ms) */
	uint32_t	msa_ptstamp;	/* prev transaction timestamp (ms) */
	uint16_t	msa_depth;	/* pc stack depth */
	uint16_t	msa_pdepth;	/* previous transaction pc stack */
	void		*msa_stack[MCACHE_STACK_DEPTH];
	void		*msa_pstack[MCACHE_STACK_DEPTH];
} mcl_scratch_audit_t;
/*
 * Size of data from the beginning of an mbuf that covers m_hdr,
 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
 * a shadow mbuf structure of this size inside each audit structure,
 * and the contents of the real mbuf gets copied into it when the mbuf
 * is freed.  This allows us to pattern-fill the mbuf for integrity
 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
 * cluster cache case).  Note that we don't save the contents of
 * clusters when they are freed; we simply pattern-fill them.
 */
typedef struct {
	u_int8_t		sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
	mcl_scratch_audit_t	sc_scratch __attribute__((aligned(8)));
} mcl_saved_contents_t;

#define	AUDIT_CONTENTS_SIZE	(sizeof (mcl_saved_contents_t))

#define	MCA_SAVED_MBUF_PTR(_mca)					\
	((struct mbuf *)(void *)((mcl_saved_contents_t *)		\
	(_mca)->mca_contents)->sc_mbuf)
#define	MCA_SAVED_MBUF_SIZE						\
	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
#define	MCA_SAVED_SCRATCH_PTR(_mca)					\
	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)

/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */
/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static unsigned int maxclaudit;	/* max # of entries in audit table */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mbigcluster *mbutl;	/* first mapped cluster address */
union mbigcluster *embutl;	/* ending virtual address of mclusters */
int _max_linkhdr;		/* largest link-level header */
int _max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

static boolean_t mclverify;	/* debug: pattern-checking */
static boolean_t mcltrace;	/* debug: stack tracing */
static boolean_t mclfindleak;	/* debug: leak detection */
static boolean_t mclexpleak;	/* debug: expose leak info to user space */

static struct timeval mb_start;	/* beginning of time */

/* mbuf leak detection variables */
static struct mleak_table mleak_table;
static mleak_stat_t *mleak_stat;

#define	MLEAK_STAT_SIZE(n) \
	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
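
/*
 * Example (added for clarity): MLEAK_STAT_SIZE(n) is the usual
 * flexible-array sizing idiom; it yields the offset of ml_trace[n],
 * i.e. the number of bytes needed for an mleak_stat_t carrying n
 * trace records, as used by mleak_top_trace_sysctl() below.
 */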
struct mallocation {
	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
	u_int32_t count;	/* How many objects were requested */
	u_int64_t hitcount;	/* for determining hash effectiveness */
};

struct mtrace {
	u_int64_t	collisions;
	uintptr_t	addr[MLEAK_STACK_DEPTH];
};

/* Size must be a power of two for the zhash to be able to just mask off bits */
#define	MLEAK_ALLOCATION_MAP_NUM	512
#define	MLEAK_TRACE_MAP_NUM		256

/*
 * Sample factor for how often to record a trace.  This is overwritable
 * by the boot-arg mleak_sample_factor.
 */
#define	MLEAK_SAMPLE_FACTOR		500

/*
 * Number of top leakers recorded.
 */
#define	MLEAK_NUM_TRACES		5
#define	MB_LEAK_SPACING_64	"                    "
#define	MB_LEAK_SPACING_32	"            "

#define	MB_LEAK_HDR_32	"\n\
    trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
    ----------  ----------  ----------  ----------  ---------- \n\
"

#define	MB_LEAK_HDR_64	"\n\
    trace [1]           trace [2]           trace [3]       \
        trace [4]           trace [5]      \n\
    ------------------  ------------------  ------------------  \
    ------------------  ------------------ \n\
"
static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;

/* Hashmaps of allocations and their corresponding traces */
static struct mallocation *mleak_allocations;
static struct mtrace *mleak_traces;
static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];

/* Lock to protect mleak tables from concurrent modification */
decl_lck_mtx_data(static, mleak_lock_data);
static lck_mtx_t *mleak_lock = &mleak_lock_data;
static lck_attr_t *mleak_lock_attr;
static lck_grp_t *mleak_lock_grp;
static lck_grp_attr_t *mleak_lock_grp_attr;

extern u_int32_t high_sb_max;

/* The minimum number of objects that are allocated, to start. */
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
	uint32_t	mtbl_avgtotal;	/* average total on iOS */
} mbuf_table_t;

#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_avgtotal(c)	mbuf_table[c].mtbl_avgtotal
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
#define	m_peak(c)	mbuf_table[c].mtbl_stats->mbcl_peak_reported
#define	m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
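
/*
 * Usage sketch (added for clarity): the accessors above simply index
 * into mbuf_table[] and its per-class statistics, e.g.
 *
 *	m_total(MC_MBUF)  expands to  mbuf_table[MC_MBUF].mtbl_stats->mbcl_total
 *	m_cache(MC_CL)    expands to  mbuf_table[MC_CL].mtbl_cache
 */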
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered by actual
	 * usage patterns on iOS.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0, 3000 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0, 2000 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0, 1000 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0, 1000 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of waiters */

boolean_t mb_peak_newreport = FALSE;
boolean_t mb_peak_firstreport = FALSE;

/* generate a report by default after 1 week of uptime */
#define	MBUF_PEAK_FIRST_REPORT_THRESHOLD	604800

#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
static struct timeval mb_wdtstart;	/* watchdog start timestamp */
static char *mbuf_dump_buf;

#define	MBUF_DUMP_BUF_SIZE	2048

/*
 * mbuf watchdog is enabled by default on embedded platforms.  It is
 * also toggleable via the kern.ipc.mb_watchdog sysctl.
 * Garbage collection is also enabled by default on embedded platforms.
 * mb_drain_maxint controls the amount of time to wait (in seconds) before
 * consecutive calls to m_drain().
 */
static unsigned int mb_watchdog = 0;
static unsigned int mb_drain_maxint = 0;
static u_int32_t mb_redzone_cookie;
static void m_redzone_init(struct mbuf *);
static void m_redzone_verify(struct mbuf *m);

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;

static void mbuf_mtypes_sync(boolean_t);
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_stat_sync(void);
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
static char *mbuf_dump(void);
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static void mcl_audit_free(void *, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
__private_extern__ void mbuf_report_peak_usage(void);
static boolean_t mbuf_report_usage(mbuf_class_t);
/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbufs.  Once set, this flag is never
 * cleared.
 */
#define	EXTF_READONLY	0x2
#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m)	\
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
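
/*
 * Example (added for clarity): a composite mbuf + cluster object sitting
 * in one of the MC_MBUF_{CL,BIGCL,16KCL} freelists carries no outstanding
 * external reference, so code could recognize it as
 *
 *	if (MBUF_IS_COMPOSITE(m)) {
 *		// refcnt is 0 and only EXTF_COMPOSITE is set in its flags
 *	}
 */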
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain (4KB) cluster index and base cluster address.
 */
#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT_PKTHDR(m) {						\
	(m)->m_pkthdr.rcvif = NULL;					\
	(m)->m_pkthdr.pkt_hdr = NULL;					\
	(m)->m_pkthdr.len = 0;						\
	(m)->m_pkthdr.csum_flags = 0;					\
	(m)->m_pkthdr.csum_data = 0;					\
	(m)->m_pkthdr.vlan_tag = 0;					\
	m_classifier_init(m, 0);					\
}

#define	MBUF_INIT(m, pkthdr, type) {					\
	_MCHECK(m);							\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_len = 0;							\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
		(m)->m_flags = 0;					\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		MBUF_INIT_PKTHDR(m);					\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
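
/*
 * Example (added for clarity): MSLEEPF(M_WAIT) yields MCR_SLEEP, while
 * MSLEEPF(M_DONTWAIT) yields MCR_NOSLEEP, so BSD-style wait flags can be
 * passed straight through to the mcache allocation routines.
 */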
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))

/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))

/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
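
/*
 * Usage sketch (added for clarity): a caller accounting for a newly
 * allocated data mbuf would do
 *
 *	mtype_stat_inc(MT_DATA);	// atomic add on this CPU's counter
 *	mtype_stat_dec(MT_FREE);
 *
 * and the per-CPU values are only folded back into mbstat.m_mtypes[]
 * by mbuf_mtypes_sync() below.
 */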
static void
mbuf_mtypes_sync(boolean_t locked)
{
	int m, n;
	mtypes_cpu_t mtc;

	if (locked)
		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	if (!locked)
		lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	if (!locked)
		lck_mtx_unlock(mbuf_mlock);
}

static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mbuf_mtypes_sync(FALSE);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}
static void
mbuf_stat_sync(void)
{
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
}
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}
static int
mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	mleak_update_stats();
	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	lck_mtx_unlock(mleak_lock);

	return (i);
}

static int
mleak_table_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i = 0;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
	lck_mtx_unlock(mleak_lock);

	return (i);
}
static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
	} while (!OSCompareAndSwap(old, new, addr));

	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to 1
	 * to simplify code calling m_mclhasreference().
	 */
	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters, where
	 * it is configured to 1/3 of the pool size.  On these platforms,
	 * the remaining is used for 2KB and 4KB clusters.  On platforms
	 * without 16KB jumbo clusters, the entire pool is used for both
	 * 2KB and 4KB clusters.  A 4KB cluster can either be split into
	 * 16 mbufs, or into 2 2KB clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (64 << MBSHIFT)  /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
__private_extern__ unsigned int
mbuf_default_ncl(int server, uint64_t mem)
{
#if !defined(__LP64__)
#pragma unused(server)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int n, i;
	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}
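
/*
 * Worked example (added for clarity, not from the original sources): on a
 * 64-bit non-server system with 12 GB of memory, the loop above walks
 * ncl_table[] and stops at the 16 GB row, leaving n at the 8 GB row's
 * value of 96 MB worth of pool, which is then converted from bytes into
 * a cluster count before returning.
 */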
__private_extern__ void
mbufinit(void)
{
	unsigned int m;
	unsigned int initmcl = 0;
	void *buf;
	thread_t thread = THREAD_NULL;

	microuptime(&mb_start);

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	_CASSERT(MBUF_EXT == M_EXT);
	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	_CASSERT(MBUF_EOR == M_EOR);
	_CASSERT(MBUF_LOOP == M_LOOP);
	_CASSERT(MBUF_BCAST == M_BCAST);
	_CASSERT(MBUF_MCAST == M_MCAST);
	_CASSERT(MBUF_FRAG == M_FRAG);
	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	_CASSERT(MBUF_PROMISC == M_PROMISC);
	_CASSERT(MBUF_HASFCS == M_HASFCS);

	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	_CASSERT(MBUF_WAITOK == M_WAIT);
	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	_CASSERT(MBUF_COPYALL == M_COPYALL);

	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
	    sizeof (uint32_t)));

	/* Initialize random red zone cookie value */
	_CASSERT(sizeof (mb_redzone_cookie) ==
	    sizeof (((struct pkthdr *)0)->redzone));
	read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));

	/* Make sure we don't save more than we should */
	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);

	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
	 * mcl_slab_g_t units, each one representing a MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
		MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
		    M_TEMP, M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
	mleak_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);

	mleak_activate();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mbigcluster *)
	    ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
	VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;	/* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL))
			initmcl = m_maxlimit(MC_BIGCL);
	}
	if (initmcl < m_minlimit(MC_BIGCL))
		initmcl = m_minlimit(MC_BIGCL);

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
	 */
	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	freelist_init(m_class(MC_CL));

	for (m = 0; m < NELEM(mbuf_table); m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));

		/* populate the initial sizes and report from there on */
		m_peak(m_class(m)) = m_total(m_class(m));
	}
	mb_peak_newreport = FALSE;

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc, *logfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
			logfunc = mleak_logger;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
			logfunc = mleak_logger;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		if (!mclfindleak)
			flags |= MCF_NOLEAKLOG;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
	    CPU_CACHE_LINE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbufpool, cap the size of
			 * max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	/* allocate space for mbuf_dump_buf */
	MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
	VERIFY(mbuf_dump_buf != NULL);

	if (mbuf_debug & MCF_DEBUG) {
		printf("%s: MLEN %d, MHLEN %d\n", __func__,
		    (int)_MLEN, (int)_MHLEN);
	}

	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);
}
/*
 * Obtain a slab of object(s) from the class's freelist.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having a longer lifespan by using
	 * a slab from the reverse direction, in the hope that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunk (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	else
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return (NULL);
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));

	if (class == MC_MBUF) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
	} else if (class == MC_CL) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
	} else {
		sp->sl_head = NULL;
	}
	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	/* Increment slab reference */
	sp->sl_refcnt++;

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF)
			mca->mca_uflags |= MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most NCLPBG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
		    sp->sl_chunks == NCLPBG &&
		    sp->sl_len == m_maxsize(MC_BIGCL));
		VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * A 4K cluster slab can have at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
		 * most 1 reference.
		 */
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		VERIFY(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL)
			_MCHECK((struct mbuf *)buf);
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 4KB cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPBG at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
		    sp->sl_chunks == NMBPBG &&
		    sp->sl_len == m_maxsize(MC_BIGCL));
		VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
		slab_remove(sp, class);
	}

	return (buf);
}
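
/*
 * slab_alloc() and slab_free() are normally reached via the slab-layer
 * callbacks registered with mcache above; a caller wanting a raw 2 KB
 * cluster would go through the per-class cache instead, e.g. (sketch):
 *
 *	void *cl = mcache_alloc(m_cache(MC_CL), MCR_SLEEP);
 *	...
 *	mcache_free(m_cache(MC_CL), cl);
 */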
1792 * Place a slab of object(s) back into a class's slab list.
1795 slab_free(mbuf_class_t
class, mcache_obj_t
*buf
)
1799 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
1801 VERIFY(class != MC_16KCL
|| njcl
> 0);
1802 VERIFY(buf
->obj_next
== NULL
);
1804 VERIFY(sp
->sl_class
== class && slab_inrange(sp
, buf
) &&
1805 (sp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) == SLF_MAPPED
);
1807 /* Decrement slab reference */
1810 if (class == MC_CL
) {
1811 VERIFY(IS_P2ALIGNED(buf
, MCLBYTES
));
1813 * A slab that has been splitted for 2KB clusters can have
1814 * at most 1 outstanding reference at this point.
1816 VERIFY(sp
->sl_refcnt
>= 0 && sp
->sl_refcnt
<= (NCLPBG
- 1) &&
1817 sp
->sl_chunks
== NCLPBG
&&
1818 sp
->sl_len
== m_maxsize(MC_BIGCL
));
1819 VERIFY(sp
->sl_refcnt
< (NCLPBG
- 1) ||
1820 (slab_is_detached(sp
) && sp
->sl_head
== NULL
));
1821 } else if (class == MC_BIGCL
) {
1822 VERIFY(IS_P2ALIGNED(buf
, MCLBYTES
));
1824 * A 4KB cluster slab can have at most 1 reference
1825 * which must be 0 at this point.
1827 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_chunks
== 1 &&
1828 sp
->sl_len
== m_maxsize(class) && sp
->sl_head
== NULL
);
1829 VERIFY(slab_is_detached(sp
));
1830 } else if (class == MC_16KCL
) {
1834 * A 16KB cluster takes NSLABSP16KB slabs, all must
1835 * now have 0 reference.
1837 VERIFY(IS_P2ALIGNED(buf
, MBIGCLBYTES
));
1838 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_chunks
== 1 &&
1839 sp
->sl_len
== m_maxsize(class) && sp
->sl_head
== NULL
);
1840 VERIFY(slab_is_detached(sp
));
1841 for (nsp
= sp
, k
= 1; k
< NSLABSP16KB
; k
++) {
1843 /* Next slab must already be present */
1844 VERIFY(nsp
!= NULL
);
1846 VERIFY(slab_is_detached(nsp
));
1847 VERIFY(nsp
->sl_class
== MC_16KCL
&&
1848 (nsp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) &&
1849 nsp
->sl_refcnt
== 0 && nsp
->sl_chunks
== 0 &&
1850 nsp
->sl_len
== 0 && nsp
->sl_base
== sp
->sl_base
&&
1851 nsp
->sl_head
== NULL
);
1855 * A slab that has been splitted for mbufs has at most NMBPBG
1856 * reference counts. Since we have decremented one reference
1857 * above, it must now be between 0 and NMBPBG-1.
1859 VERIFY(class == MC_MBUF
);
1860 VERIFY(sp
->sl_refcnt
>= 0 && sp
->sl_refcnt
<= (NMBPBG
- 1) &&
1861 sp
->sl_chunks
== NMBPBG
&&
1862 sp
->sl_len
== m_maxsize(MC_BIGCL
));
1863 VERIFY(sp
->sl_refcnt
< (NMBPBG
- 1) ||
1864 (slab_is_detached(sp
) && sp
->sl_head
== NULL
));
1868 * When auditing is enabled, ensure that the buffer still
1869 * contains the free pattern. Otherwise it got corrupted
1870 * while at the CPU cache layer.
1872 if (mclaudit
!= NULL
) {
1873 mcache_audit_t
*mca
= mcl_audit_buf2mca(class, buf
);
1875 mcache_audit_free_verify(mca
, buf
, 0, m_maxsize(class));
1877 mca
->mca_uflags
&= ~MB_SCVALID
;
1880 if (class == MC_CL
) {
1881 mbstat
.m_clfree
= (++m_infree(MC_CL
)) + m_infree(MC_MBUF_CL
);
1882 buf
->obj_next
= sp
->sl_head
;
1883 } else if (class == MC_BIGCL
) {
1884 mbstat
.m_bigclfree
= (++m_infree(MC_BIGCL
)) +
1885 m_infree(MC_MBUF_BIGCL
);
1886 } else if (class == MC_16KCL
) {
1887 ++m_infree(MC_16KCL
);
1889 ++m_infree(MC_MBUF
);
1890 buf
->obj_next
= sp
->sl_head
;
1895 * If a slab has been splitted to either one which holds 2KB clusters,
1896 * or one which holds mbufs, turn it back to one which holds a 4KB
1899 if (class == MC_MBUF
&& sp
->sl_refcnt
== 0 &&
1900 m_total(class) > m_minlimit(class) &&
1901 m_total(MC_BIGCL
) < m_maxlimit(MC_BIGCL
)) {
1904 m_total(MC_BIGCL
)++;
1905 mbstat
.m_bigclusters
= m_total(MC_BIGCL
);
1906 m_total(MC_MBUF
) -= NMBPBG
;
1907 mbstat
.m_mbufs
= m_total(MC_MBUF
);
1908 m_infree(MC_MBUF
) -= NMBPBG
;
1909 mtype_stat_add(MT_FREE
, -((unsigned)NMBPBG
));
1911 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
1912 VERIFY(m_total(MC_MBUF
) >= m_minlimit(MC_MBUF
));
1915 struct mbuf
*m
= sp
->sl_head
;
1917 sp
->sl_head
= m
->m_next
;
1920 VERIFY(sp
->sl_head
== NULL
);
1922 /* Remove the slab from the mbuf class's slab list */
1923 slab_remove(sp
, class);
1925 /* Reinitialize it as a 4KB cluster slab */
1926 slab_init(sp
, MC_BIGCL
, sp
->sl_flags
, sp
->sl_base
, sp
->sl_base
,
1930 mcache_set_pattern(MCACHE_FREE_PATTERN
,
1931 (caddr_t
)sp
->sl_head
, m_maxsize(MC_BIGCL
));
1933 mbstat
.m_bigclfree
= (++m_infree(MC_BIGCL
)) +
1934 m_infree(MC_MBUF_BIGCL
);
1936 VERIFY(slab_is_detached(sp
));
1937 /* And finally switch class */
1939 } else if (class == MC_CL
&& sp
->sl_refcnt
== 0 &&
1940 m_total(class) > m_minlimit(class) &&
1941 m_total(MC_BIGCL
) < m_maxlimit(MC_BIGCL
)) {
1944 m_total(MC_BIGCL
)++;
1945 mbstat
.m_bigclusters
= m_total(MC_BIGCL
);
1946 m_total(MC_CL
) -= NCLPBG
;
1947 mbstat
.m_clusters
= m_total(MC_CL
);
1948 m_infree(MC_CL
) -= NCLPBG
;
1949 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
1950 VERIFY(m_total(MC_CL
) >= m_minlimit(MC_CL
));
1953 union mcluster
*c
= sp
->sl_head
;
1955 sp
->sl_head
= c
->mcl_next
;
1958 VERIFY(sp
->sl_head
== NULL
);
1960 /* Remove the slab from the 2KB cluster class's slab list */
1961 slab_remove(sp
, class);
1963 /* Reinitialize it as a 4KB cluster slab */
1964 slab_init(sp
, MC_BIGCL
, sp
->sl_flags
, sp
->sl_base
, sp
->sl_base
,
1968 mcache_set_pattern(MCACHE_FREE_PATTERN
,
1969 (caddr_t
)sp
->sl_head
, m_maxsize(MC_BIGCL
));
1971 mbstat
.m_bigclfree
= (++m_infree(MC_BIGCL
)) +
1972 m_infree(MC_MBUF_BIGCL
);
1974 VERIFY(slab_is_detached(sp
));
1975 /* And finally switch class */
1979 /* Reinsert the slab to the class's slab list */
1980 if (slab_is_detached(sp
))
1981 slab_insert(sp
, class);
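
/*
 * To illustrate the accounting in the slab conversion above: with
 * 4 KB pages and 256-byte mbufs, NMBPBG is 16, so once all 16 mbufs
 * carved out of a page-sized slab are free again, m_total(MC_MBUF)
 * drops by 16 while m_total(MC_BIGCL) gains a single 4 KB cluster.
 */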
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < m_total(class) >> 5) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0)
				continue;

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait))
				break;

			/* watchdog checkpoint */
			mbuf_watchdog();

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait))
				break;

			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return (num - need);
}
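
/*
 * The low-watermark test above fires when fewer than 1/32nd of a
 * class's objects (m_total(class) >> 5, roughly 3%) remain on the
 * freelist; e.g. with 16384 mbufs in total, an asynchronous
 * freelist_populate() is kicked off once fewer than 512 are left.
 */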
/*
 * Common de-allocator for rudimentary objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_obj_t *nlist;
	unsigned int num = 0;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;
		slab_free(class, list);
		++num;
		if ((list = nlist) == NULL)
			break;
	}
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for rudimentary objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		if (mcltrace)
			mcache_buffer_log(mca, list, m_cache(class), &mb_start);

		if (alloc)
			mca->mca_uflags |= MB_INUSE;
		else
			mca->mca_uflags &= ~MB_INUSE;
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
/*
 * Common notify routine for all caches.  It is called by mcache when
 * one or more objects get freed.  We use this indication to trigger
 * the wakeup of any sleeping threads so that they can retry their
 * allocation requests.
 */
static void
mbuf_slab_notify(void *arg, u_int32_t reason)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	int w;

	ASSERT(MBUF_CLASS_VALID(class));

	if (reason != MCN_RETRYALLOC)
		return;

	lck_mtx_lock(mbuf_mlock);
	if ((w = mb_waiters) > 0) {
		m_notified(class)++;
		mb_waiters = 0;
	}
	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Obtain object(s) from the composite class's freelist.
 */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		VERIFY(need > 0);

		m = (struct mbuf *)*list;
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));

		if (class == MC_MBUF_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPBG);
		} else {
			VERIFY(clsp->sl_refcnt == 1);
		}

		if (class == MC_MBUF_16KCL) {
			int k;
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0)
			break;
	}
	m_infree(class) -= (num - need);

	return (num - need);
}
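
/*
 * Each element on a composite freelist is a constructed mbuf whose
 * m_ext.ext_buf already points at its cluster, so a consumer popping
 * one off the MC_MBUF_CL cache effectively gets both pieces in a
 * single step, roughly as the following sketch spells out:
 *
 *	m = (struct mbuf *)mcache_alloc(m_cache(MC_MBUF_CL), MCR_SLEEP);
 *	VERIFY(m->m_flags == M_EXT && m->m_ext.ext_buf != NULL);
 */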
2238 * Place object(s) back into a composite class's freelist.
2241 cslab_free(mbuf_class_t
class, mcache_obj_t
*list
, int purged
)
2243 mcache_obj_t
*o
, *tail
;
2244 unsigned int num
= 0;
2245 struct mbuf
*m
, *ms
;
2246 mcache_audit_t
*mca
= NULL
;
2247 mcache_obj_t
*ref_list
= NULL
;
2248 mcl_slab_t
*clsp
, *nsp
;
2250 mbuf_class_t cl_class
;
2252 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2253 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
2254 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2256 if (class == MC_MBUF_CL
) {
2258 } else if (class == MC_MBUF_BIGCL
) {
2259 cl_class
= MC_BIGCL
;
2261 VERIFY(class == MC_MBUF_16KCL
);
2262 cl_class
= MC_16KCL
;
2267 while ((m
= ms
= (struct mbuf
*)o
) != NULL
) {
2268 mcache_obj_t
*rfa
, *nexto
= o
->obj_next
;
2270 /* Do the mbuf sanity checks */
2271 if (mclaudit
!= NULL
) {
2272 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2274 mcache_audit_free_verify(mca
, m
, 0,
2275 m_maxsize(MC_MBUF
));
2277 ms
= MCA_SAVED_MBUF_PTR(mca
);
2280 /* Do the cluster sanity checks */
2281 cl
= ms
->m_ext
.ext_buf
;
2282 clsp
= slab_get(cl
);
2284 size_t size
= m_maxsize(cl_class
);
2285 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class
,
2286 (mcache_obj_t
*)cl
), cl
, 0, size
);
2288 VERIFY(ms
->m_type
== MT_FREE
);
2289 VERIFY(ms
->m_flags
== M_EXT
);
2290 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2291 if (cl_class
== MC_CL
) {
2292 VERIFY(clsp
->sl_refcnt
>= 1 &&
2293 clsp
->sl_refcnt
<= NCLPBG
);
2295 VERIFY(clsp
->sl_refcnt
== 1);
2297 if (cl_class
== MC_16KCL
) {
2299 for (nsp
= clsp
, k
= 1; k
< NSLABSP16KB
; k
++) {
2301 /* Next slab must already be present */
2302 VERIFY(nsp
!= NULL
);
2303 VERIFY(nsp
->sl_refcnt
== 1);
2308 * If we're asked to purge, restore the actual mbuf using
2309 * contents of the shadow structure (if auditing is enabled)
2310 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2311 * about to free it and the attached cluster into their caches.
2314 /* Restore constructed mbuf fields */
2315 if (mclaudit
!= NULL
)
2316 mcl_audit_restore_mbuf(m
, mca
, TRUE
);
2321 rfa
= (mcache_obj_t
*)(void *)MEXT_RFA(m
);
2322 rfa
->obj_next
= ref_list
;
2326 m
->m_type
= MT_FREE
;
2327 m
->m_flags
= m
->m_len
= 0;
2328 m
->m_next
= m
->m_nextpkt
= NULL
;
2330 /* Save mbuf fields and make auditing happy */
2331 if (mclaudit
!= NULL
)
2332 mcl_audit_mbuf(mca
, o
, FALSE
, FALSE
);
2334 VERIFY(m_total(class) > 0);
2339 slab_free(MC_MBUF
, o
);
2341 /* And free the cluster */
2342 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
2343 if (class == MC_MBUF_CL
)
2344 slab_free(MC_CL
, cl
);
2345 else if (class == MC_MBUF_BIGCL
)
2346 slab_free(MC_BIGCL
, cl
);
2348 slab_free(MC_16KCL
, cl
);
2357 tail
->obj_next
= m_cobjlist(class);
2358 m_cobjlist(class) = list
;
2359 m_infree(class) += num
;
2360 } else if (ref_list
!= NULL
) {
2361 mcache_free_ext(ref_cache
, ref_list
);
2368 * Common allocator for composite objects called by the CPU cache layer
2369 * during an allocation request whenever there is no available element in
2370 * the bucket layer. It returns one or more composite elements from the
2371 * appropriate global freelist. If the freelist is empty, it will attempt
2372 * to obtain the rudimentary objects from their caches and construct them
2373 * into composite mbuf + cluster objects.
2376 mbuf_cslab_alloc(void *arg
, mcache_obj_t
***plist
, unsigned int needed
,
2379 mbuf_class_t
class = (mbuf_class_t
)arg
;
2380 mbuf_class_t cl_class
= 0;
2381 unsigned int num
= 0, cnum
= 0, want
= needed
;
2382 mcache_obj_t
*ref_list
= NULL
;
2383 mcache_obj_t
*mp_list
= NULL
;
2384 mcache_obj_t
*clp_list
= NULL
;
2385 mcache_obj_t
**list
;
2386 struct ext_ref
*rfa
;
2390 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2393 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
2395 /* There should not be any slab for this class */
2396 VERIFY(m_slab_cnt(class) == 0 &&
2397 m_slablist(class).tqh_first
== NULL
&&
2398 m_slablist(class).tqh_last
== NULL
);
2400 lck_mtx_lock(mbuf_mlock
);
2402 /* Try using the freelist first */
2403 num
= cslab_alloc(class, plist
, needed
);
2405 if (num
== needed
) {
2406 m_alloc_cnt(class) += num
;
2407 lck_mtx_unlock(mbuf_mlock
);
2411 lck_mtx_unlock(mbuf_mlock
);
2414 * We could not satisfy the request using the freelist alone;
2415 * allocate from the appropriate rudimentary caches and use
2416 * whatever we can get to construct the composite objects.
2421 * Mark these allocation requests as coming from a composite cache.
2422 * Also, if the caller is willing to be blocked, mark the request
2423 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2424 * slab layer waiting for the individual object when one or more
2425 * of the already-constructed composite objects are available.
2428 if (!(wait
& MCR_NOSLEEP
))
2431 /* allocate mbufs */
2432 needed
= mcache_alloc_ext(m_cache(MC_MBUF
), &mp_list
, needed
, wait
);
2434 ASSERT(mp_list
== NULL
);
2438 /* allocate clusters */
2439 if (class == MC_MBUF_CL
) {
2441 } else if (class == MC_MBUF_BIGCL
) {
2442 cl_class
= MC_BIGCL
;
2444 VERIFY(class == MC_MBUF_16KCL
);
2445 cl_class
= MC_16KCL
;
2447 needed
= mcache_alloc_ext(m_cache(cl_class
), &clp_list
, needed
, wait
);
2449 ASSERT(clp_list
== NULL
);
2453 needed
= mcache_alloc_ext(ref_cache
, &ref_list
, needed
, wait
);
2455 ASSERT(ref_list
== NULL
);
2460 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
2461 * overs will get freed accordingly before we return to caller.
2463 for (cnum
= 0; cnum
< needed
; cnum
++) {
2466 m
= ms
= (struct mbuf
*)mp_list
;
2467 mp_list
= mp_list
->obj_next
;
2470 clp_list
= clp_list
->obj_next
;
2471 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
2473 rfa
= (struct ext_ref
*)ref_list
;
2474 ref_list
= ref_list
->obj_next
;
2475 ((mcache_obj_t
*)(void *)rfa
)->obj_next
= NULL
;
2478 * If auditing is enabled, construct the shadow mbuf
2479 * in the audit structure instead of in the actual one.
2480 * mbuf_cslab_audit() will take care of restoring the
2481 * contents after the integrity check.
2483 if (mclaudit
!= NULL
) {
2484 mcache_audit_t
*mca
, *cl_mca
;
2486 lck_mtx_lock(mbuf_mlock
);
2487 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2488 ms
= MCA_SAVED_MBUF_PTR(mca
);
2489 cl_mca
= mcl_audit_buf2mca(MC_CL
, (mcache_obj_t
*)cl
);
2492 * Pair them up. Note that this is done at the time
2493 * the mbuf+cluster objects are constructed. This
2494 * information should be treated as "best effort"
2495 * debugging hint since more than one mbufs can refer
2496 * to a cluster. In that case, the cluster might not
2497 * be freed along with the mbuf it was paired with.
2499 mca
->mca_uptr
= cl_mca
;
2500 cl_mca
->mca_uptr
= mca
;
2502 ASSERT(mca
->mca_uflags
& MB_SCVALID
);
2503 ASSERT(!(cl_mca
->mca_uflags
& MB_SCVALID
));
2504 lck_mtx_unlock(mbuf_mlock
);
2506 /* Technically, they are in the freelist */
2510 mcache_set_pattern(MCACHE_FREE_PATTERN
, m
,
2511 m_maxsize(MC_MBUF
));
2513 if (class == MC_MBUF_CL
)
2514 size
= m_maxsize(MC_CL
);
2515 else if (class == MC_MBUF_BIGCL
)
2516 size
= m_maxsize(MC_BIGCL
);
2518 size
= m_maxsize(MC_16KCL
);
2520 mcache_set_pattern(MCACHE_FREE_PATTERN
, cl
,
2525 MBUF_INIT(ms
, 0, MT_FREE
);
2526 if (class == MC_MBUF_16KCL
) {
2527 MBUF_16KCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2528 } else if (class == MC_MBUF_BIGCL
) {
2529 MBUF_BIGCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2531 MBUF_CL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2533 VERIFY(ms
->m_flags
== M_EXT
);
2534 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2536 *list
= (mcache_obj_t
*)m
;
2537 (*list
)->obj_next
= NULL
;
2538 list
= *plist
= &(*list
)->obj_next
;
2543 * Free up what's left of the above.
2545 if (mp_list
!= NULL
)
2546 mcache_free_ext(m_cache(MC_MBUF
), mp_list
);
2547 if (clp_list
!= NULL
)
2548 mcache_free_ext(m_cache(cl_class
), clp_list
);
2549 if (ref_list
!= NULL
)
2550 mcache_free_ext(ref_cache
, ref_list
);
2552 lck_mtx_lock(mbuf_mlock
);
2553 if (num
> 0 || cnum
> 0) {
2554 m_total(class) += cnum
;
2555 VERIFY(m_total(class) <= m_maxlimit(class));
2556 m_alloc_cnt(class) += num
+ cnum
;
2558 if ((num
+ cnum
) < want
)
2559 m_fail_cnt(class) += (want
- (num
+ cnum
));
2560 lck_mtx_unlock(mbuf_mlock
);
2562 return (num
+ cnum
);
/*
 * Common de-allocator for composite objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int num;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	num = cslab_free(class, list, purged);
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
2594 * Common auditor for composite objects called by the CPU cache layer
2595 * during an allocation or free request. For the former, this is called
2596 * after the objects are obtained from either the bucket or slab layer
2597 * and before they are returned to the caller. For the latter, this is
2598 * called immediately during free and before placing the objects into
2599 * the bucket or slab layer.
2602 mbuf_cslab_audit(void *arg
, mcache_obj_t
*list
, boolean_t alloc
)
2604 mbuf_class_t
class = (mbuf_class_t
)arg
;
2605 mcache_audit_t
*mca
;
2606 struct mbuf
*m
, *ms
;
2607 mcl_slab_t
*clsp
, *nsp
;
2611 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2613 while ((m
= ms
= (struct mbuf
*)list
) != NULL
) {
2614 lck_mtx_lock(mbuf_mlock
);
2615 /* Do the mbuf sanity checks and record its transaction */
2616 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2617 mcl_audit_mbuf(mca
, m
, TRUE
, alloc
);
2619 mcache_buffer_log(mca
, m
, m_cache(class), &mb_start
);
2622 mca
->mca_uflags
|= MB_COMP_INUSE
;
2624 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2627 * Use the shadow mbuf in the audit structure if we are
2628 * freeing, since the contents of the actual mbuf has been
2629 * pattern-filled by the above call to mcl_audit_mbuf().
2631 if (!alloc
&& mclverify
)
2632 ms
= MCA_SAVED_MBUF_PTR(mca
);
2634 /* Do the cluster sanity checks and record its transaction */
2635 cl
= ms
->m_ext
.ext_buf
;
2636 clsp
= slab_get(cl
);
2637 VERIFY(ms
->m_flags
== M_EXT
&& cl
!= NULL
);
2638 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2639 if (class == MC_MBUF_CL
)
2640 VERIFY(clsp
->sl_refcnt
>= 1 &&
2641 clsp
->sl_refcnt
<= NCLPBG
);
2643 VERIFY(clsp
->sl_refcnt
== 1);
2645 if (class == MC_MBUF_16KCL
) {
2647 for (nsp
= clsp
, k
= 1; k
< NSLABSP16KB
; k
++) {
2649 /* Next slab must already be present */
2650 VERIFY(nsp
!= NULL
);
2651 VERIFY(nsp
->sl_refcnt
== 1);
2655 mca
= mcl_audit_buf2mca(MC_CL
, cl
);
2656 if (class == MC_MBUF_CL
)
2657 size
= m_maxsize(MC_CL
);
2658 else if (class == MC_MBUF_BIGCL
)
2659 size
= m_maxsize(MC_BIGCL
);
2661 size
= m_maxsize(MC_16KCL
);
2662 mcl_audit_cluster(mca
, cl
, size
, alloc
, FALSE
);
2664 mcache_buffer_log(mca
, cl
, m_cache(class), &mb_start
);
2667 mca
->mca_uflags
|= MB_COMP_INUSE
;
2669 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2670 lck_mtx_unlock(mbuf_mlock
);
2672 list
= list
->obj_next
;
2677 * Allocate some number of mbuf clusters and place on cluster freelist.
2680 m_clalloc(const u_int32_t num
, const int wait
, const u_int32_t bufsize
)
2684 int numpages
= 0, large_buffer
= (bufsize
== m_maxsize(MC_16KCL
));
2685 vm_offset_t page
= 0;
2686 mcache_audit_t
*mca_list
= NULL
;
2687 mcache_obj_t
*con_list
= NULL
;
2690 VERIFY(bufsize
== m_maxsize(MC_BIGCL
) ||
2691 bufsize
== m_maxsize(MC_16KCL
));
2693 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2696 * Multiple threads may attempt to populate the cluster map one
2697 * after another. Since we drop the lock below prior to acquiring
2698 * the physical page(s), our view of the cluster map may no longer
2699 * be accurate, and we could end up over-committing the pages beyond
2700 * the maximum allowed for each class. To prevent it, this entire
2701 * operation (including the page mapping) is serialized.
2703 while (mb_clalloc_busy
) {
2704 mb_clalloc_waiters
++;
2705 (void) msleep(mb_clalloc_waitchan
, mbuf_mlock
,
2706 (PZERO
-1), "m_clalloc", NULL
);
2707 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2710 /* We are busy now; tell everyone else to go away */
2711 mb_clalloc_busy
= TRUE
;
2714 * Honor the caller's wish to block or not block. We have a way
2715 * to grow the pool asynchronously using the mbuf worker thread.
2717 i
= m_howmany(num
, bufsize
);
2718 if (i
== 0 || (wait
& M_DONTWAIT
))
2721 lck_mtx_unlock(mbuf_mlock
);
2723 size
= round_page(i
* bufsize
);
2724 page
= kmem_mb_alloc(mb_map
, size
, large_buffer
);
2727 * If we did ask for "n" 16KB physically contiguous chunks
2728 * and didn't get them, then please try again without this
2731 if (large_buffer
&& page
== 0)
2732 page
= kmem_mb_alloc(mb_map
, size
, 0);
2735 if (bufsize
== m_maxsize(MC_BIGCL
)) {
2736 /* Try for 1 page if failed, only 4KB request */
2738 page
= kmem_mb_alloc(mb_map
, size
, 0);
2742 lck_mtx_lock(mbuf_mlock
);
2747 VERIFY(IS_P2ALIGNED(page
, NBPG
));
2748 numpages
= size
/ NBPG
;
2750 /* If auditing is enabled, allocate the audit structures now */
2751 if (mclaudit
!= NULL
) {
2755 * Yes, I realize this is a waste of memory for clusters
2756 * that never get transformed into mbufs, as we may end
2757 * up with NMBPBG-1 unused audit structures per cluster.
2758 * But doing so tremendously simplifies the allocation
2759 * strategy, since at this point we are not holding the
2760 * mbuf lock and the caller is okay to be blocked.
2762 if (bufsize
== m_maxsize(MC_BIGCL
)) {
2763 needed
= numpages
* NMBPBG
;
2765 i
= mcache_alloc_ext(mcl_audit_con_cache
,
2766 &con_list
, needed
, MCR_SLEEP
);
2768 VERIFY(con_list
!= NULL
&& i
== needed
);
2770 needed
= numpages
/ NSLABSP16KB
;
2773 i
= mcache_alloc_ext(mcache_audit_cache
,
2774 (mcache_obj_t
**)&mca_list
, needed
, MCR_SLEEP
);
2776 VERIFY(mca_list
!= NULL
&& i
== needed
);
2779 lck_mtx_lock(mbuf_mlock
);
2781 for (i
= 0; i
< numpages
; i
++, page
+= NBPG
) {
2782 ppnum_t offset
= ((char *)page
- (char *)mbutl
) / NBPG
;
2783 ppnum_t new_page
= pmap_find_phys(kernel_pmap
, page
);
2784 mbuf_class_t
class = MC_BIGCL
;
2787 * If there is a mapper the appropriate I/O page is returned;
2788 * zero out the page to discard its past contents to prevent
2789 * exposing leftover kernel memory.
2791 VERIFY(offset
< mcl_pages
);
2792 if (mcl_paddr_base
!= 0) {
2793 bzero((void *)(uintptr_t) page
, page_size
);
2794 new_page
= IOMapperInsertPage(mcl_paddr_base
,
2797 mcl_paddr
[offset
] = new_page
;
2799 /* Pattern-fill this fresh page */
2801 mcache_set_pattern(MCACHE_FREE_PATTERN
,
2802 (caddr_t
)page
, NBPG
);
2804 if (bufsize
== m_maxsize(MC_BIGCL
)) {
2805 union mbigcluster
*mbc
= (union mbigcluster
*)page
;
2807 /* One for the entire page */
2809 if (mclaudit
!= NULL
) {
2810 mcl_audit_init(mbc
, &mca_list
, &con_list
,
2811 AUDIT_CONTENTS_SIZE
, NMBPBG
);
2813 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2814 slab_init(sp
, MC_BIGCL
, SLF_MAPPED
,
2815 mbc
, mbc
, bufsize
, 0, 1);
2817 /* Insert this slab */
2818 slab_insert(sp
, MC_BIGCL
);
2820 /* Update stats now since slab_get() drops the lock */
2821 mbstat
.m_bigclfree
= ++m_infree(MC_BIGCL
) +
2822 m_infree(MC_MBUF_BIGCL
);
2823 mbstat
.m_bigclusters
= ++m_total(MC_BIGCL
);
2824 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2826 } else if ((i
% NSLABSP16KB
) == 0) {
2827 union m16kcluster
*m16kcl
= (union m16kcluster
*)page
;
2832 /* One for the entire 16KB */
2833 sp
= slab_get(m16kcl
);
2834 if (mclaudit
!= NULL
)
2835 mcl_audit_init(m16kcl
, &mca_list
, NULL
, 0, 1);
2837 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2838 slab_init(sp
, MC_16KCL
, SLF_MAPPED
,
2839 m16kcl
, m16kcl
, bufsize
, 0, 1);
2842 * 2nd-Nth page's slab is part of the first one,
2843 * where N is NSLABSP16KB.
2845 for (k
= 1; k
< NSLABSP16KB
; k
++) {
2846 nsp
= slab_get(((union mbigcluster
*)page
) + k
);
2847 VERIFY(nsp
->sl_refcnt
== 0 &&
2848 nsp
->sl_flags
== 0);
2849 slab_init(nsp
, MC_16KCL
,
2850 SLF_MAPPED
| SLF_PARTIAL
,
2851 m16kcl
, NULL
, 0, 0, 0);
2854 /* Insert this slab */
2855 slab_insert(sp
, MC_16KCL
);
2857 /* Update stats now since slab_get() drops the lock */
2858 m_infree(MC_16KCL
)++;
2859 m_total(MC_16KCL
)++;
2860 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2863 if (!mb_peak_newreport
&& mbuf_report_usage(class))
2864 mb_peak_newreport
= TRUE
;
2866 VERIFY(mca_list
== NULL
&& con_list
== NULL
);
2868 /* We're done; let others enter */
2869 mb_clalloc_busy
= FALSE
;
2870 if (mb_clalloc_waiters
> 0) {
2871 mb_clalloc_waiters
= 0;
2872 wakeup(mb_clalloc_waitchan
);
2875 if (bufsize
== m_maxsize(MC_BIGCL
))
2878 VERIFY(bufsize
== m_maxsize(MC_16KCL
));
2879 return (numpages
/ NSLABSP16KB
);
2882 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2884 /* We're done; let others enter */
2885 mb_clalloc_busy
= FALSE
;
2886 if (mb_clalloc_waiters
> 0) {
2887 mb_clalloc_waiters
= 0;
2888 wakeup(mb_clalloc_waitchan
);
2892 * When non-blocking we kick a thread if we have to grow the
2893 * pool or if the number of free clusters is less than requested.
2895 if (bufsize
== m_maxsize(MC_BIGCL
)) {
2898 * Remember total number of 4KB clusters needed
2901 i
+= m_total(MC_BIGCL
);
2902 if (i
> mbuf_expand_big
) {
2903 mbuf_expand_big
= i
;
2904 if (mbuf_worker_ready
)
2905 wakeup((caddr_t
)&mbuf_worker_run
);
2909 if (m_infree(MC_BIGCL
) >= num
)
2914 * Remember total number of 16KB clusters needed
2917 i
+= m_total(MC_16KCL
);
2918 if (i
> mbuf_expand_16k
) {
2919 mbuf_expand_16k
= i
;
2920 if (mbuf_worker_ready
)
2921 wakeup((caddr_t
)&mbuf_worker_run
);
2925 if (m_infree(MC_16KCL
) >= num
)
2932 * Populate the global freelist of the corresponding buffer class.
2935 freelist_populate(mbuf_class_t
class, unsigned int num
, int wait
)
2937 mcache_obj_t
*o
= NULL
;
2938 int i
, numpages
= 0, count
;
2940 VERIFY(class == MC_MBUF
|| class == MC_CL
|| class == MC_BIGCL
||
2943 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2949 numpages
= (num
* m_size(class) + NBPG
- 1) / NBPG
;
2950 i
= m_clalloc(numpages
, wait
, m_maxsize(MC_BIGCL
));
2952 /* Respect the 4KB clusters minimum limit */
2953 if (m_total(MC_BIGCL
) == m_maxlimit(MC_BIGCL
) &&
2954 m_infree(MC_BIGCL
) <= m_minlimit(MC_BIGCL
)) {
2955 if (class != MC_BIGCL
|| (wait
& MCR_COMP
))
2958 if (class == MC_BIGCL
)
2963 return (m_clalloc(num
, wait
, m_maxsize(class)) != 0);
2971 VERIFY(class == MC_MBUF
|| class == MC_CL
);
2973 /* how many objects will we cut the page into? */
2974 int numobj
= (class == MC_MBUF
? NMBPBG
: NCLPBG
);
2976 for (count
= 0; count
< numpages
; count
++) {
2978 /* respect totals, minlimit, maxlimit */
2979 if (m_total(MC_BIGCL
) <= m_minlimit(MC_BIGCL
) ||
2980 m_total(class) >= m_maxlimit(class))
2983 if ((o
= slab_alloc(MC_BIGCL
, wait
)) == NULL
)
2986 struct mbuf
*m
= (struct mbuf
*)o
;
2987 union mcluster
*c
= (union mcluster
*)o
;
2988 mcl_slab_t
*sp
= slab_get(o
);
2989 mcache_audit_t
*mca
= NULL
;
2991 VERIFY(slab_is_detached(sp
) &&
2992 (sp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) == SLF_MAPPED
);
2995 * Make sure that the cluster is unmolested
2999 mca
= mcl_audit_buf2mca(MC_BIGCL
, o
);
3000 mcache_audit_free_verify(mca
, o
, 0,
3001 m_maxsize(MC_BIGCL
));
3004 /* Reinitialize it as an mbuf or 2K slab */
3005 slab_init(sp
, class, sp
->sl_flags
,
3006 sp
->sl_base
, NULL
, sp
->sl_len
, 0, numobj
);
3008 VERIFY(o
== (mcache_obj_t
*)sp
->sl_base
);
3009 VERIFY(sp
->sl_head
== NULL
);
3011 VERIFY(m_total(MC_BIGCL
) > 0);
3012 m_total(MC_BIGCL
)--;
3013 mbstat
.m_bigclusters
= m_total(MC_BIGCL
);
3015 m_total(class) += numobj
;
3016 m_infree(class) += numobj
;
3018 VERIFY(m_total(MC_BIGCL
) >= m_minlimit(MC_BIGCL
));
3019 VERIFY(m_total(class) <= m_maxlimit(class));
3020 if (!mb_peak_newreport
&& mbuf_report_usage(class))
3021 mb_peak_newreport
= TRUE
;
3024 if (class == MC_MBUF
) {
3025 mbstat
.m_mbufs
= m_total(MC_MBUF
);
3026 mtype_stat_add(MT_FREE
, NMBPBG
);
3029 * If auditing is enabled, construct the
3030 * shadow mbuf in the audit structure
3031 * instead of the actual one.
3032 * mbuf_slab_audit() will take care of
3033 * restoring the contents after the
3036 if (mclaudit
!= NULL
) {
3038 mca
= mcl_audit_buf2mca(MC_MBUF
,
3040 ms
= MCA_SAVED_MBUF_PTR(mca
);
3041 ms
->m_type
= MT_FREE
;
3043 m
->m_type
= MT_FREE
;
3045 m
->m_next
= sp
->sl_head
;
3046 sp
->sl_head
= (void *)m
++;
3048 } else { /* MC_CL */
3050 m_infree(MC_CL
) + m_infree(MC_MBUF_CL
);
3051 mbstat
.m_clusters
= m_total(MC_CL
);
3053 c
->mcl_next
= sp
->sl_head
;
3054 sp
->sl_head
= (void *)c
++;
3058 /* Insert into the mbuf or 2k slab list */
3059 slab_insert(sp
, class);
3061 if ((i
= mb_waiters
) > 0)
3064 wakeup(mb_waitchan
);
3066 return (count
!= 0);
/*
 * For each class, initialize the freelist to hold m_minlimit() objects.
 */
static void
freelist_init(mbuf_class_t class)
{
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class == MC_CL || class == MC_BIGCL);
	VERIFY(m_total(class) == 0);
	VERIFY(m_minlimit(class) > 0);

	while (m_total(class) < m_minlimit(class))
		(void) freelist_populate(class, m_minlimit(class), M_WAIT);

	VERIFY(m_total(class) >= m_minlimit(class));
}
/*
 * (Inaccurately) check if it might be worth a trip back to the
 * mcache layer due to the availability of objects there.  We'll
 * end up back here if there's nothing up there.
 */
static boolean_t
mbuf_cached_above(mbuf_class_t class, int wait)
{
	switch (class) {
	case MC_MBUF:
		if (wait & MCR_COMP)
			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
			    !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
		break;

	case MC_CL:
		if (wait & MCR_COMP)
			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
		break;

	case MC_BIGCL:
		if (wait & MCR_COMP)
			return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
		break;

	case MC_16KCL:
		if (wait & MCR_COMP)
			return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
		break;

	default:
		break;
	}

	return (!mcache_bkt_isempty(m_cache(class)));
}
3131 * If possible, convert constructed objects to raw ones.
3134 mbuf_steal(mbuf_class_t
class, unsigned int num
)
3136 mcache_obj_t
*top
= NULL
;
3137 mcache_obj_t
**list
= &top
;
3138 unsigned int tot
= 0;
3140 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
3152 /* Get the required number of constructed objects if possible */
3153 if (m_infree(class) > m_minlimit(class)) {
3154 tot
= cslab_alloc(class, &list
,
3155 MIN(num
, m_infree(class)));
3158 /* And destroy them to get back the raw objects */
3160 (void) cslab_free(class, top
, 1);
3168 return (tot
== num
);
3172 m_reclaim(mbuf_class_t
class, unsigned int num
, boolean_t comp
)
3176 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
3178 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
3179 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
3180 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
3183 * This logic can be made smarter; for now, simply mark
3184 * all other related classes as potential victims.
3188 m_wantpurge(MC_CL
)++;
3189 m_wantpurge(MC_BIGCL
)++;
3190 m_wantpurge(MC_MBUF_CL
)++;
3191 m_wantpurge(MC_MBUF_BIGCL
)++;
3195 m_wantpurge(MC_MBUF
)++;
3196 m_wantpurge(MC_BIGCL
)++;
3197 m_wantpurge(MC_MBUF_BIGCL
)++;
3199 m_wantpurge(MC_MBUF_CL
)++;
3203 m_wantpurge(MC_MBUF
)++;
3204 m_wantpurge(MC_CL
)++;
3205 m_wantpurge(MC_MBUF_CL
)++;
3207 m_wantpurge(MC_MBUF_BIGCL
)++;
3212 m_wantpurge(MC_MBUF_16KCL
)++;
3221 * Run through each marked class and check if we really need to
3222 * purge (and therefore temporarily disable) the per-CPU caches
3223 * layer used by the class. If so, remember the classes since
3224 * we are going to drop the lock below prior to purging.
3226 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
3227 if (m_wantpurge(m
) > 0) {
3230 * Try hard to steal the required number of objects
3231 * from the freelist of other mbuf classes. Only
3232 * purge and disable the per-CPU caches layer when
3233 * we don't have enough; it's the last resort.
3235 if (!mbuf_steal(m
, num
))
3240 lck_mtx_unlock(mbuf_mlock
);
3243 /* signal the domains to drain */
3244 net_drain_domains();
3246 /* Sigh; we have no other choices but to ask mcache to purge */
3247 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
3248 if ((bmap
& (1 << m
)) &&
3249 mcache_purge_cache(m_cache(m
), TRUE
)) {
3250 lck_mtx_lock(mbuf_mlock
);
3253 lck_mtx_unlock(mbuf_mlock
);
3258 * Request mcache to reap extra elements from all of its caches;
3259 * note that all reaps are serialized and happen only at a fixed
3264 lck_mtx_lock(mbuf_mlock
);
static inline struct mbuf *
m_get_common(int wait, short type, int hdr)
{
	struct mbuf *m;
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
	if (m != NULL) {
		MBUF_INIT(m, hdr, type);
		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
#if CONFIG_MACF_NET
		if (hdr && mac_init_mbuf(m, wait) != 0) {
			m_free(m);
			return (NULL);
		}
#endif /* MAC_NET */
	}
	return (m);
}

/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 */
#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
#define	_M_RETRY(wait, type)	_M_GET(wait, type)
#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
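
/*
 * Typical (sketch) usage from a caller that needs a packet-header
 * mbuf without blocking:
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 */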
struct mbuf *
m_get(int wait, int type)
{
	return (_M_GET(wait, type));
}

struct mbuf *
m_gethdr(int wait, int type)
{
	return (_M_GETHDR(wait, type));
}

struct mbuf *
m_retry(int wait, int type)
{
	return (_M_RETRY(wait, type));
}

struct mbuf *
m_retryhdr(int wait, int type)
{
	return (_M_RETRYHDR(wait, type));
}

struct mbuf *
m_getclr(int wait, int type)
{
	struct mbuf *m;

	_MGET(m, wait, type);
	if (m == NULL)
		return (NULL);

	bzero(MTOD(m, caddr_t), MLEN);
	return (m);
}
3339 m_free(struct mbuf
*m
)
3341 struct mbuf
*n
= m
->m_next
;
3343 if (m
->m_type
== MT_FREE
)
3344 panic("m_free: freeing an already freed mbuf");
3346 if (m
->m_flags
& M_PKTHDR
) {
3347 /* Check for scratch area overflow */
3348 m_redzone_verify(m
);
3349 /* Free the aux data and tags if there is any */
3350 m_tag_delete_chain(m
, NULL
);
3353 if (m
->m_flags
& M_EXT
) {
3355 u_int32_t composite
;
3357 refcnt
= m_decref(m
);
3358 composite
= (MEXT_FLAGS(m
) & EXTF_COMPOSITE
);
3359 if (refcnt
== 0 && !composite
) {
3360 if (m
->m_ext
.ext_free
== NULL
) {
3361 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
3362 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3363 mcache_free(m_cache(MC_BIGCL
),
3365 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
3366 mcache_free(m_cache(MC_16KCL
),
3369 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
3370 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
3372 mcache_free(ref_cache
, MEXT_RFA(m
));
3374 } else if (refcnt
== 0 && composite
) {
3375 VERIFY(m
->m_type
!= MT_FREE
);
3377 mtype_stat_dec(m
->m_type
);
3378 mtype_stat_inc(MT_FREE
);
3380 m
->m_type
= MT_FREE
;
3383 m
->m_next
= m
->m_nextpkt
= NULL
;
3385 MEXT_FLAGS(m
) &= ~EXTF_READONLY
;
3387 /* "Free" into the intermediate cache */
3388 if (m
->m_ext
.ext_free
== NULL
) {
3389 mcache_free(m_cache(MC_MBUF_CL
), m
);
3390 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3391 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
3393 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3394 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
3400 if (m
->m_type
!= MT_FREE
) {
3401 mtype_stat_dec(m
->m_type
);
3402 mtype_stat_inc(MT_FREE
);
3405 m
->m_type
= MT_FREE
;
3406 m
->m_flags
= m
->m_len
= 0;
3407 m
->m_next
= m
->m_nextpkt
= NULL
;
3409 mcache_free(m_cache(MC_MBUF
), m
);
3414 __private_extern__
struct mbuf
*
3415 m_clattach(struct mbuf
*m
, int type
, caddr_t extbuf
,
3416 void (*extfree
)(caddr_t
, u_int
, caddr_t
), u_int extsize
, caddr_t extarg
,
3419 struct ext_ref
*rfa
= NULL
;
3421 if (m
== NULL
&& (m
= _M_GETHDR(wait
, type
)) == NULL
)
3424 if (m
->m_flags
& M_EXT
) {
3426 u_int32_t composite
;
3428 refcnt
= m_decref(m
);
3429 composite
= (MEXT_FLAGS(m
) & EXTF_COMPOSITE
);
3430 if (refcnt
== 0 && !composite
) {
3431 if (m
->m_ext
.ext_free
== NULL
) {
3432 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
3433 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3434 mcache_free(m_cache(MC_BIGCL
),
3436 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
3437 mcache_free(m_cache(MC_16KCL
),
3440 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
3441 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
3443 /* Re-use the reference structure */
3445 } else if (refcnt
== 0 && composite
) {
3446 VERIFY(m
->m_type
!= MT_FREE
);
3448 mtype_stat_dec(m
->m_type
);
3449 mtype_stat_inc(MT_FREE
);
3451 m
->m_type
= MT_FREE
;
3454 m
->m_next
= m
->m_nextpkt
= NULL
;
3456 MEXT_FLAGS(m
) &= ~EXTF_READONLY
;
3458 /* "Free" into the intermediate cache */
3459 if (m
->m_ext
.ext_free
== NULL
) {
3460 mcache_free(m_cache(MC_MBUF_CL
), m
);
3461 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3462 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
3464 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3465 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
3468 * Allocate a new mbuf, since we didn't divorce
3469 * the composite mbuf + cluster pair above.
3471 if ((m
= _M_GETHDR(wait
, type
)) == NULL
)
3477 (rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
) {
3482 MEXT_INIT(m
, extbuf
, extsize
, extfree
, extarg
, rfa
, 1, 0);
3488 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3489 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3492 m_getcl(int wait
, int type
, int flags
)
3495 int mcflags
= MSLEEPF(wait
);
3496 int hdr
= (flags
& M_PKTHDR
);
3498 /* Is this due to a non-blocking retry? If so, then try harder */
3499 if (mcflags
& MCR_NOSLEEP
)
3500 mcflags
|= MCR_TRYHARD
;
3502 m
= mcache_alloc(m_cache(MC_MBUF_CL
), mcflags
);
3505 struct ext_ref
*rfa
;
3508 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3509 cl
= m
->m_ext
.ext_buf
;
3512 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3513 VERIFY(MBUF_IS_COMPOSITE(m
) && m
->m_ext
.ext_free
== NULL
);
3515 flag
= MEXT_FLAGS(m
);
3517 MBUF_INIT(m
, hdr
, type
);
3518 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3520 mtype_stat_inc(type
);
3521 mtype_stat_dec(MT_FREE
);
3523 if (hdr
&& mac_init_mbuf(m
, wait
) != 0) {
3527 #endif /* MAC_NET */
/* m_mclget() adds an mbuf cluster to a normal mbuf */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_mclalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}

/* Allocate an mbuf cluster */
caddr_t
m_mclalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_CL), mcflags));
}

/* Free an mbuf cluster */
void
m_mclfree(caddr_t p)
{
	mcache_free(m_cache(MC_CL), p);
}

/*
 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
 */
int
m_mclhasreference(struct mbuf *m)
{
	if (!(m->m_flags & M_EXT))
		return (0);

	ASSERT(MEXT_RFA(m) != NULL);

	return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
}
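
/*
 * Sketch of the classic pattern for getting an mbuf with a 2 KB
 * cluster attached; M_EXT must be checked afterwards since
 * m_mclget() hands back the mbuf even when no cluster was attached:
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m_mclget(m, M_DONTWAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 */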
__private_extern__ caddr_t
m_bigalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
}

__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_BIGCL), p);
}

/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_mbigget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_bigalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
__private_extern__ caddr_t
m_16kalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
}

__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_16KCL), p);
}

/* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_16kalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
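
/*
 * Mirroring the size selection done later by m_allocpacket_internal(),
 * a caller in this file choosing among the three cluster attachers
 * might size by payload and fall back when the jumbo pool is absent
 * (sketch):
 *
 *	if (len <= MCLBYTES)
 *		m_mclget(m, how);
 *	else if (len <= MBIGCLBYTES || njcl == 0)
 *		m_mbigget(m, how);
 *	else
 *		m_m16kget(m, how);
 */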
3658 * "Move" mbuf pkthdr from "from" to "to".
3659 * "from" must have M_PKTHDR set, and "to" must be empty.
3662 m_copy_pkthdr(struct mbuf
*to
, struct mbuf
*from
)
3664 VERIFY(from
->m_flags
& M_PKTHDR
);
3666 /* Check for scratch area overflow */
3667 m_redzone_verify(from
);
3669 if (to
->m_flags
& M_PKTHDR
) {
3670 /* Check for scratch area overflow */
3671 m_redzone_verify(to
);
3672 /* We will be taking over the tags of 'to' */
3673 m_tag_delete_chain(to
, NULL
);
3675 to
->m_pkthdr
= from
->m_pkthdr
; /* especially tags */
3676 m_classifier_init(from
, 0); /* purge classifier info */
3677 m_tag_init(from
, 1); /* purge all tags from src */
3678 m_scratch_init(from
); /* clear src scratch area */
3679 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
3680 if ((to
->m_flags
& M_EXT
) == 0)
3681 to
->m_data
= to
->m_pktdat
;
3682 m_redzone_init(to
); /* setup red zone on dst */
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
static int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
	VERIFY(from->m_flags & M_PKTHDR);

	/* Check for scratch area overflow */
	m_redzone_verify(from);

	if (to->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(to);
		/* We will be taking over the tags of 'to' */
		m_tag_delete_chain(to, NULL);
	}
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
	to->m_pkthdr = from->m_pkthdr;
	m_redzone_init(to);			/* setup red zone on dst */
	m_tag_init(to, 0);			/* preserve dst static tags */
	return (m_tag_copy_chain(to, from, how));
}
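
/*
 * Rough usage sketch: m_copy_pkthdr() transfers ownership of the tag
 * chain (the source's tags are purged), whereas m_dup_pkthdr() leaves
 * "from" intact and deep-copies its tags, so it can fail under memory
 * pressure and returns an error code that callers should check:
 *
 *	if (m_dup_pkthdr(n, m, how) != 0)
 *		goto nospace;		(hypothetical error label)
 */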
void
m_copy_pftag(struct mbuf *to, struct mbuf *from)
{
	to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
	to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
	to->m_pkthdr.pf_mtag.pftag_flags &=
	    ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
}
void
m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_proto = 0;
	m->m_pkthdr.pkt_flowsrc = 0;
	m->m_pkthdr.pkt_flowid = 0;
	m->m_pkthdr.pkt_flags &= pktf_mask;	/* caller-defined mask */
	/* preserve service class and interface info for loopback packets */
	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
		(void) m_set_service_class(m, MBUF_SC_BE);
	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
		m->m_pkthdr.pkt_ifainfo = 0;
#if MEASURE_BW
	m->m_pkthdr.pkt_bwseq = 0;
#endif /* MEASURE_BW */
}
void
m_copy_classifier(struct mbuf *to, struct mbuf *from)
{
	VERIFY(to->m_flags & M_PKTHDR);
	VERIFY(from->m_flags & M_PKTHDR);

	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
	to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
#if MEASURE_BW
	to->m_pkthdr.pkt_bwseq = from->m_pkthdr.pkt_bwseq;
#endif /* MEASURE_BW */
}
3760 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3761 * if wantall is not set, return whatever number were available. Set up the
3762 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3763 * are chained on the m_nextpkt field. Any packets requested beyond this
3764 * are chained onto the last packet header's m_next field. The size of
3765 * the cluster is controlled by the parameter bufsize.
3767 __private_extern__
struct mbuf
*
3768 m_getpackets_internal(unsigned int *num_needed
, int num_with_pkthdrs
,
3769 int wait
, int wantall
, size_t bufsize
)
3772 struct mbuf
**np
, *top
;
3773 unsigned int pnum
, needed
= *num_needed
;
3774 mcache_obj_t
*mp_list
= NULL
;
3775 int mcflags
= MSLEEPF(wait
);
3777 struct ext_ref
*rfa
;
3781 ASSERT(bufsize
== m_maxsize(MC_CL
) ||
3782 bufsize
== m_maxsize(MC_BIGCL
) ||
3783 bufsize
== m_maxsize(MC_16KCL
));
3786 * Caller must first check for njcl because this
3787 * routine is internal and not exposed/used via KPI.
3789 VERIFY(bufsize
!= m_maxsize(MC_16KCL
) || njcl
> 0);
3796 * The caller doesn't want all the requested buffers; only some.
3797 * Try hard to get what we can, but don't block. This effectively
3798 * overrides MCR_SLEEP, since this thread will not go to sleep
3799 * if we can't get all the buffers.
3801 if (!wantall
|| (mcflags
& MCR_NOSLEEP
))
3802 mcflags
|= MCR_TRYHARD
;
3804 /* Allocate the composite mbuf + cluster elements from the cache */
3805 if (bufsize
== m_maxsize(MC_CL
))
3806 cp
= m_cache(MC_MBUF_CL
);
3807 else if (bufsize
== m_maxsize(MC_BIGCL
))
3808 cp
= m_cache(MC_MBUF_BIGCL
);
3810 cp
= m_cache(MC_MBUF_16KCL
);
3811 needed
= mcache_alloc_ext(cp
, &mp_list
, needed
, mcflags
);
3813 for (pnum
= 0; pnum
< needed
; pnum
++) {
3814 m
= (struct mbuf
*)mp_list
;
3815 mp_list
= mp_list
->obj_next
;
3817 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3818 cl
= m
->m_ext
.ext_buf
;
3821 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3822 VERIFY(MBUF_IS_COMPOSITE(m
));
3824 flag
= MEXT_FLAGS(m
);
3826 MBUF_INIT(m
, num_with_pkthdrs
, MT_DATA
);
3827 if (bufsize
== m_maxsize(MC_16KCL
)) {
3828 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3829 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3830 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3832 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3835 if (num_with_pkthdrs
> 0) {
3838 if (mac_mbuf_label_init(m
, wait
) != 0) {
3842 #endif /* MAC_NET */
3846 if (num_with_pkthdrs
> 0)
3851 ASSERT(pnum
!= *num_needed
|| mp_list
== NULL
);
3852 if (mp_list
!= NULL
)
3853 mcache_free_ext(cp
, mp_list
);
3856 mtype_stat_add(MT_DATA
, pnum
);
3857 mtype_stat_sub(MT_FREE
, pnum
);
3860 if (wantall
&& (pnum
!= *num_needed
)) {
3866 if (pnum
> *num_needed
) {
3867 printf("%s: File a radar related to <rdar://10146739>. \
3868 needed = %u, pnum = %u, num_needed = %u \n",
3869 __func__
, needed
, pnum
, *num_needed
);
3877 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
3878 * wantall is not set, return whatever number were available. The size of
3879 * each mbuf in the list is controlled by the parameter packetlen. Each
3880 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
3881 * in the chain is called a segment. If maxsegments is not null and the
3882 * value pointed to is not null, this specify the maximum number of segments
3883 * for a chain of mbufs. If maxsegments is zero or the value pointed to
3884 * is zero the caller does not have any restriction on the number of segments.
3885 * The actual number of segments of a mbuf chain is return in the value
3886 * pointed to by maxsegments.
__private_extern__ struct mbuf *
m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
    unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
{
	struct mbuf **np, *top, *first = NULL;
	size_t bufsize, r_bufsize;
	unsigned int num = 0;
	unsigned int nsegs = 0;
	unsigned int needed, resid;
	int mcflags = MSLEEPF(wait);
	mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
	mcache_t *cp = NULL, *rcp = NULL;

	top = NULL;
	np = &top;

	if (wantsize == 0) {
		if (packetlen <= MINCLSIZE) {
			bufsize = packetlen;
		} else if (packetlen > m_maxsize(MC_CL)) {
			/* Use 4KB if jumbo cluster pool isn't available */
			if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
				bufsize = m_maxsize(MC_BIGCL);
			else
				bufsize = m_maxsize(MC_16KCL);
		} else {
			bufsize = m_maxsize(MC_CL);
		}
	} else if (wantsize == m_maxsize(MC_CL) ||
	    wantsize == m_maxsize(MC_BIGCL) ||
	    (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
		bufsize = wantsize;
	} else {
		return (NULL);
	}

	if (bufsize <= MHLEN) {
		nsegs = 1;
	} else if (bufsize <= MINCLSIZE) {
		if (maxsegments != NULL && *maxsegments == 1) {
			bufsize = m_maxsize(MC_CL);
			nsegs = 1;
		} else {
			nsegs = 2;
		}
	} else if (bufsize == m_maxsize(MC_16KCL)) {
		nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
	} else if (bufsize == m_maxsize(MC_BIGCL)) {
		nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
	} else {
		nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
	}
	if (maxsegments != NULL) {
		if (*maxsegments && nsegs > *maxsegments) {
			*maxsegments = nsegs;
			return (NULL);
		}
		*maxsegments = nsegs;
	}

	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (mcflags & MCR_NOSLEEP))
		mcflags |= MCR_TRYHARD;

	/*
	 * Simple case where all elements in the lists/chains are mbufs.
	 * Unless bufsize is greater than MHLEN, each segment chain is made
	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
	 * of 2 mbufs; the second one is used for the residual data, i.e.
	 * the remaining data that cannot fit into the first mbuf.
	 */
	if (bufsize <= MINCLSIZE) {
		/* Allocate the elements in one shot from the mbuf cache */
		ASSERT(bufsize <= MHLEN || nsegs == 2);
		cp = m_cache(MC_MBUF);
		needed = mcache_alloc_ext(cp, &mp_list,
		    (*numlist) * nsegs, mcflags);

		/*
		 * The number of elements must be even if we are to use an
		 * mbuf (instead of a cluster) to store the residual data.
		 * If we couldn't allocate the requested number of mbufs,
		 * trim the number down (if it's odd) in order to avoid
		 * creating a partial segment chain.
		 */
		if (bufsize > MHLEN && (needed & 0x1))
			needed--;

		while (num < needed) {
			struct mbuf *m;

			m = (struct mbuf *)mp_list;
			mp_list = mp_list->obj_next;

			MBUF_INIT(m, 1, MT_DATA);
#if CONFIG_MACF_NET
			if (mac_init_mbuf(m, wait) != 0) {
				m_free(m);
				break;
			}
#endif /* MAC_NET */
			num++;
			if (bufsize > MHLEN) {
				/* A second mbuf for this segment chain */
				m->m_next = (struct mbuf *)mp_list;
				mp_list = mp_list->obj_next;
				ASSERT(m->m_next != NULL);

				MBUF_INIT(m->m_next, 0, MT_DATA);
				num++;
			}
			*np = m;
			np = &m->m_nextpkt;
		}
		ASSERT(num != *numlist || mp_list == NULL);

		mtype_stat_add(MT_DATA, num);
		mtype_stat_sub(MT_FREE, num);

		num /= nsegs;

		/* We've got them all; return to caller */
		if (num == *numlist)
			return (top);
		goto fail;
	}

	/*
	 * Complex cases where elements are made up of one or more composite
	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
	 * be illustrated as follows:
	 *
	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
	 *
	 * Every composite mbuf + cluster element comes from the intermediate
	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
	 * the last composite element will come from the MC_MBUF_CL cache,
	 * unless the residual data is larger than 2KB where we use the
	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
	 * data is defined as extra data beyond the first element that cannot
	 * fit into the previous element, i.e. there is no residual data if
	 * the chain only has 1 segment.
	 */
	r_bufsize = bufsize;
	resid = packetlen > bufsize ? packetlen % bufsize : 0;
	if (resid > 0) {
		/* There is residual data; figure out the cluster size */
		if (wantsize == 0 && packetlen > MINCLSIZE) {
			/*
			 * Caller didn't request that all of the segments
			 * in the chain use the same cluster size; use the
			 * smaller of the cluster sizes.
			 */
			if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
				r_bufsize = m_maxsize(MC_16KCL);
			else if (resid > m_maxsize(MC_CL))
				r_bufsize = m_maxsize(MC_BIGCL);
			else
				r_bufsize = m_maxsize(MC_CL);
		} else {
			/* Use the same cluster size as the other segments */
			resid = 0;
		}
	}

	needed = *numlist;
	if (resid > 0) {
		/*
		 * Attempt to allocate composite mbuf + cluster elements for
		 * the residual data in each chain; record the number of such
		 * elements that can be allocated so that we know how many
		 * segment chains we can afford to create.
		 */
		if (r_bufsize <= m_maxsize(MC_CL))
			rcp = m_cache(MC_MBUF_CL);
		else if (r_bufsize <= m_maxsize(MC_BIGCL))
			rcp = m_cache(MC_MBUF_BIGCL);
		else
			rcp = m_cache(MC_MBUF_16KCL);
		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);

		if (needed == 0)
			goto fail;

		/* This is temporarily reduced for calculation */
		ASSERT(nsegs > 1);
		nsegs--;
	}

	/*
	 * Attempt to allocate the rest of the composite mbuf + cluster
	 * elements for the number of segment chains that we need.
	 */
	if (bufsize <= m_maxsize(MC_CL))
		cp = m_cache(MC_MBUF_CL);
	else if (bufsize <= m_maxsize(MC_BIGCL))
		cp = m_cache(MC_MBUF_BIGCL);
	else
		cp = m_cache(MC_MBUF_16KCL);
	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);

	/* Round it down to avoid creating a partial segment chain */
	needed = (needed / nsegs) * nsegs;
	if (needed == 0)
		goto fail;

	if (resid > 0) {
		/*
		 * We're about to construct the chain(s); take into account
		 * the number of segments we have created above to hold the
		 * residual data for each chain, as well as restore the
		 * original count of segments per chain.
		 */
		ASSERT(nsegs > 0);
		nsegs++;
		needed += needed / nsegs;
	}

	for (;;) {
		struct mbuf *m;
		u_int32_t flag;
		struct ext_ref *rfa;
		void *cl;
		int pkthdr;

		++num;
		if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
			m = (struct mbuf *)mp_list;
			mp_list = mp_list->obj_next;
		} else {
			m = (struct mbuf *)rmp_list;
			rmp_list = rmp_list->obj_next;
		}

		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
		VERIFY(m->m_ext.ext_free == NULL ||
		    m->m_ext.ext_free == m_bigfree ||
		    m->m_ext.ext_free == m_16kfree);

		cl = m->m_ext.ext_buf;
		rfa = MEXT_RFA(m);

		ASSERT(cl != NULL && rfa != NULL);
		VERIFY(MBUF_IS_COMPOSITE(m));

		flag = MEXT_FLAGS(m);

		pkthdr = (nsegs == 1 || (num % nsegs) == 1);
		if (pkthdr)
			first = m;
		MBUF_INIT(m, pkthdr, MT_DATA);
		if (m->m_ext.ext_free == m_16kfree) {
			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
		} else if (m->m_ext.ext_free == m_bigfree) {
			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
		} else {
			MBUF_CL_INIT(m, cl, rfa, 1, flag);
		}
#if CONFIG_MACF_NET
		if (pkthdr && mac_init_mbuf(m, wait) != 0) {
			--num;
			break;
		}
#endif /* MAC_NET */

		*np = m;
		if ((num % nsegs) == 0)
			np = &first->m_nextpkt;
		else
			np = &m->m_next;

		if (num == needed)
			break;
	}

	mtype_stat_add(MT_DATA, num);
	mtype_stat_sub(MT_FREE, num);

	num /= nsegs;

	/* We've got them all; return to caller */
	if (num == *numlist) {
		ASSERT(mp_list == NULL && rmp_list == NULL);
		return (top);
	}

fail:
	/* Free up what's left of the above */
	if (mp_list != NULL)
		mcache_free_ext(cp, mp_list);
	if (rmp_list != NULL)
		mcache_free_ext(rcp, rmp_list);
	if (wantall && top != NULL) {
		m_freem_list(top);
		top = NULL;
		num = 0;
	}
	*numlist = num;
	return (top);
}
/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to
 * allocate packets on their receive ring.
 */
__private_extern__ struct mbuf *
m_getpacket_how(int wait)
{
	unsigned int num_needed = 1;

	return (m_getpackets_internal(&num_needed, 1, wait, 1,
	    m_maxsize(MC_CL)));
}

/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to
 * allocate packets on their receive ring.
 */
struct mbuf *
m_getpacket(void)
{
	unsigned int num_needed = 1;

	return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
	    m_maxsize(MC_CL)));
}
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if this can't be met, return whatever number was available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
 * are chained on the m_nextpkt field.  Any packets requested beyond this are
 * chained onto the last packet header's m_next field.
 */
struct mbuf *
m_getpackets(int num_needed, int num_with_pkthdrs, int how)
{
	unsigned int n = num_needed;

	return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
	    m_maxsize(MC_CL)));
}

/*
 * Return a list of mbuf hdrs set up as packet hdrs chained together
 * on the m_nextpkt field.
 */
struct mbuf *
m_getpackethdrs(int num_needed, int how)
{
	struct mbuf **np, *top;
	struct mbuf *m;

	top = NULL;
	np = &top;

	while (num_needed--) {
		m = _M_RETRYHDR(how, MT_DATA);
		if (m == NULL)
			break;

		*np = m;
		np = &m->m_nextpkt;
	}

	return (top);
}
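/*
 * Illustrative sketch (not part of the original source): a driver that
 * refills its receive ring might batch-allocate cluster-backed packets
 * with m_getpackets() and walk the m_nextpkt links; the count of 64 is
 * an arbitrary example value.
 *
 *	struct mbuf *list, *m;
 *
 *	list = m_getpackets(64, 64, M_DONTWAIT);
 *	for (m = list; m != NULL; m = m->m_nextpkt) {
 *		// hand each packet header (with its attached cluster)
 *		// to a receive descriptor
 *	}
 */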
/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * of packets freed.  Used by the drivers.
 */
int
m_freem_list(struct mbuf *m)
{
	struct mbuf *nextpkt;
	mcache_obj_t *mp_list = NULL;
	mcache_obj_t *mcl_list = NULL;
	mcache_obj_t *mbc_list = NULL;
	mcache_obj_t *m16k_list = NULL;
	mcache_obj_t *m_mcl_list = NULL;
	mcache_obj_t *m_mbc_list = NULL;
	mcache_obj_t *m_m16k_list = NULL;
	mcache_obj_t *ref_list = NULL;
	int pktcount = 0;
	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;

	while (m != NULL) {
		pktcount++;

		nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;

		while (m != NULL) {
			struct mbuf *next = m->m_next;
			mcache_obj_t *o, *rfa;
			u_int32_t refcnt, composite;

			if (m->m_type == MT_FREE)
				panic("m_free: freeing an already freed mbuf");

			if (m->m_type != MT_FREE)
				mt_free++;

			if (m->m_flags & M_PKTHDR) {
				/* Check for scratch area overflow */
				m_redzone_verify(m);
				/* Free the aux data and tags if there is any */
				m_tag_delete_chain(m, NULL);
			}

			if (!(m->m_flags & M_EXT))
				goto simple_free;

			o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
			refcnt = m_decref(m);
			composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
			if (refcnt == 0 && !composite) {
				if (m->m_ext.ext_free == NULL) {
					o->obj_next = mcl_list;
					mcl_list = o;
				} else if (m->m_ext.ext_free == m_bigfree) {
					o->obj_next = mbc_list;
					mbc_list = o;
				} else if (m->m_ext.ext_free == m_16kfree) {
					o->obj_next = m16k_list;
					m16k_list = o;
				} else {
					(*(m->m_ext.ext_free))((caddr_t)o,
					    m->m_ext.ext_size,
					    m->m_ext.ext_arg);
				}
				rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
				rfa->obj_next = ref_list;
				ref_list = rfa;
			} else if (refcnt == 0 && composite) {
				VERIFY(m->m_type != MT_FREE);
				/*
				 * Amortize the costs of atomic operations
				 * by doing them at the end, if possible.
				 */
				if (m->m_type == MT_DATA)
					mt_data++;
				else if (m->m_type == MT_HEADER)
					mt_header++;
				else if (m->m_type == MT_SONAME)
					mt_soname++;
				else if (m->m_type == MT_TAG)
					mt_tag++;
				else
					mtype_stat_dec(m->m_type);

				m->m_type = MT_FREE;
				m->m_flags = M_EXT;
				m->m_len = 0;
				m->m_next = m->m_nextpkt = NULL;

				MEXT_FLAGS(m) &= ~EXTF_READONLY;

				/* "Free" into the intermediate cache */
				o = (mcache_obj_t *)m;
				if (m->m_ext.ext_free == NULL) {
					o->obj_next = m_mcl_list;
					m_mcl_list = o;
				} else if (m->m_ext.ext_free == m_bigfree) {
					o->obj_next = m_mbc_list;
					m_mbc_list = o;
				} else {
					VERIFY(m->m_ext.ext_free == m_16kfree);
					o->obj_next = m_m16k_list;
					m_m16k_list = o;
				}
				m = next;
				continue;
			}
simple_free:
			/*
			 * Amortize the costs of atomic operations
			 * by doing them at the end, if possible.
			 */
			if (m->m_type == MT_DATA)
				mt_data++;
			else if (m->m_type == MT_HEADER)
				mt_header++;
			else if (m->m_type == MT_SONAME)
				mt_soname++;
			else if (m->m_type == MT_TAG)
				mt_tag++;
			else if (m->m_type != MT_FREE)
				mtype_stat_dec(m->m_type);

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			((mcache_obj_t *)m)->obj_next = mp_list;
			mp_list = (mcache_obj_t *)m;

			m = next;
		}

		m = nextpkt;
	}

	if (mt_free > 0)
		mtype_stat_add(MT_FREE, mt_free);
	if (mt_data > 0)
		mtype_stat_sub(MT_DATA, mt_data);
	if (mt_header > 0)
		mtype_stat_sub(MT_HEADER, mt_header);
	if (mt_soname > 0)
		mtype_stat_sub(MT_SONAME, mt_soname);
	if (mt_tag > 0)
		mtype_stat_sub(MT_TAG, mt_tag);

	if (mp_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	if (mcl_list != NULL)
		mcache_free_ext(m_cache(MC_CL), mcl_list);
	if (mbc_list != NULL)
		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
	if (m16k_list != NULL)
		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
	if (m_mcl_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
	if (m_mbc_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
	if (m_m16k_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
	if (ref_list != NULL)
		mcache_free_ext(ref_cache, ref_list);

	return (pktcount);
}
void
m_freem(struct mbuf *m)
{
	while (m != NULL)
		m = m_free(m);
}
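/*
 * Illustrative sketch (not part of the original source): a driver that
 * drops a whole transmit queue can hand the m_nextpkt-linked list to
 * m_freem_list() in one call instead of freeing packet by packet;
 * `txq_head` is a hypothetical queue head.
 *
 *	int freed;
 *
 *	freed = m_freem_list(txq_head);
 *	// `freed` is the number of packets returned to the caches
 */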
/*
 * Mbuffer utility routines.
 */

/*
 * Compute the amount of space available before the current start
 * of data in an mbuf.
 */
int
m_leadingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_data - m->m_ext.ext_buf);
	}
	if (m->m_flags & M_PKTHDR)
		return (m->m_data - m->m_pktdat);
	return (m->m_data - m->m_dat);
}

/*
 * Compute the amount of space available after the end of data in an mbuf.
 */
int
m_trailingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_ext.ext_buf + m->m_ext.ext_size -
		    (m->m_data + m->m_len));
	}
	return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
}
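/*
 * Illustrative sketch (not part of the original source): a protocol that
 * wants to append a 4-byte trailer in place can check the writable room
 * first and fall back to allocation otherwise; `m` is a hypothetical
 * single mbuf holding the packet tail.
 *
 *	if (m_trailingspace(m) >= 4) {
 *		bzero(mtod(m, caddr_t) + m->m_len, 4);
 *		m->m_len += 4;
 *	} else {
 *		// not enough room (or the cluster is shared); take a
 *		// slower path such as m_append() further below
 *	}
 */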
/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
 * copy junk along.  Does not adjust packet header length.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	_MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR) {
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
	mn->m_next = m;
	m = mn;
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}

/*
 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
 * chain, copy junk along, and adjust length.
 */
struct mbuf *
m_prepend_2(struct mbuf *m, int len, int how)
{
	if (M_LEADINGSPACE(m) >= len) {
		m->m_data -= len;
		m->m_len += len;
	} else {
		m = m_prepend(m, len, how);
	}
	if ((m) && (m->m_flags & M_PKTHDR))
		m->m_pkthdr.len += len;
	return (m);
}
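/*
 * Illustrative sketch (not part of the original source): prepending room
 * for an 8-byte encapsulation header in front of a packet; on failure the
 * chain has already been freed by m_prepend(), so the caller only has to
 * bail out.  `hdr` is a hypothetical 8-byte header.
 *
 *	m = m_prepend_2(m, 8, M_DONTWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	bcopy(&hdr, mtod(m, caddr_t), 8);
 */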
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
 */
static struct mbuf *
m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
{
	struct mbuf *n, *mhdr = NULL, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	if (off < 0 || len < 0)
		panic("m_copym: invalid offset %d or len %d", off, len);

	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
	    mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));

	if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
	    mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
		mhdr = m;
		copyhdr = 1;
	}

	while (off >= m->m_len) {
		if (m->m_next == NULL)
			panic("m_copym: invalid mbuf chain");
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;

	while (len > 0) {
		if (m == NULL) {
			if (len != M_COPYALL)
				panic("m_copym: len != M_COPYALL");
			break;
		}

		if (copyhdr)
			n = _M_RETRYHDR(wait, m->m_type);
		else
			n = _M_RETRY(wait, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;

		if (copyhdr != 0) {
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, mhdr);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, mhdr, wait) == 0)
					goto nospace;
			}
			if (len == M_COPYALL)
				n->m_pkthdr.len -= off0;
			else
				n->m_pkthdr.len = len;
			copyhdr = 0;
			/*
			 * There is data to copy from the packet header mbuf
			 * if it is empty or it is before the starting offset
			 */
			if (mhdr != m) {
				np = &n->m_next;
				continue;
			}
		}
		n->m_len = MIN(len, (m->m_len - off));
		if (m->m_flags & M_EXT) {
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			/*
			 * Limit to the capacity of the destination
			 */
			if (n->m_flags & M_PKTHDR)
				n->m_len = MIN(n->m_len, MHLEN);
			else
				n->m_len = MIN(n->m_len, MLEN);

			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
				panic("%s n %p copy overflow",
				    __func__, n);

			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		if (len != M_COPYALL)
			len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	return (top);
nospace:
	m_freem(top);
	return (NULL);
}
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
}
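/*
 * Illustrative sketch (not part of the original source): taking a copy of
 * the first 64 bytes of a packet (for example to peek at protocol headers)
 * while leaving the original chain untouched.
 *
 *	struct mbuf *hdr_copy;
 *
 *	hdr_copy = m_copym(m, 0, 64, M_DONTWAIT);
 *	if (hdr_copy == NULL)
 *		return (ENOBUFS);
 *	// ... inspect hdr_copy, then release it
 *	m_freem(hdr_copy);
 */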
/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine also.  The last mbuf and offset accessed are passed
 * out and can be passed back in to avoid having to rescan the entire mbuf
 * list (normally hung off of the socket).
 */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *m = m0, *n, **np = NULL;
	int off = off0, len = len0;
	struct mbuf *top = NULL;
	int mcflags = MSLEEPF(wait);
	int copyhdr = 0;
	int type = 0;
	int needed = 0;
	mcache_obj_t *list = NULL;

	if (off == 0 && (m->m_flags & M_PKTHDR))
		copyhdr = 1;

	if (m_lastm != NULL && *m_lastm != NULL) {
		m = *m_lastm;
		off = *m_off;
	} else {
		while (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		}
	}

	n = m;
	while (len > 0) {
		needed++;
		ASSERT(n != NULL);
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	needed++;
	len = len0;

	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	    mcflags) != needed)
		goto nospace;

	needed = 0;
	while (len > 0) {
		n = (struct mbuf *)list;
		list = list->obj_next;
		ASSERT(n != NULL && m != NULL);

		type = (top == NULL) ? MT_HEADER : m->m_type;
		MBUF_INIT(n, (top == NULL), type);
#if CONFIG_MACF_NET
		if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
			mtype_stat_inc(MT_HEADER);
			mtype_stat_dec(MT_FREE);
			m_free(n);
			goto nospace;
		}
#endif /* MAC_NET */
		needed++;

		if (top == NULL) {
			top = n;
			np = &top->m_next;
		} else {
			*np = n;
			np = &n->m_next;
		}

		if (copyhdr) {
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0)
					goto nospace;
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
				panic("%s n %p copy overflow",
				    __func__, n);

			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			if (m_lastm != NULL && m_off != NULL) {
				if ((off + n->m_len) == m->m_len) {
					*m_lastm = m->m_next;
					*m_off = 0;
				} else {
					*m_lastm = m;
					*m_off = off + n->m_len;
				}
			}
			break;
		}
		off = 0;
		m = m->m_next;
	}

	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

	ASSERT(list == NULL);
	return (top);

nospace:
	if (list != NULL)
		mcache_free_ext(m_cache(MC_MBUF), list);
	if (top != NULL)
		m_freem(top);
	return (NULL);
}
/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(struct mbuf *m, int off, int len, void *vp)
{
	unsigned count;
	char *cp = vp;

	if (off < 0 || len < 0)
		panic("m_copydata: invalid offset %d or len %d", off, len);

	while (off > 0) {
		if (m == NULL)
			panic("m_copydata: invalid mbuf chain");
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		if (m == NULL)
			panic("m_copydata: invalid mbuf chain");
		count = MIN(m->m_len - off, len);
		bcopy(MTOD(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}
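/*
 * Illustrative sketch (not part of the original source): pulling a fixed
 * size header out of a chain into local storage, regardless of how the
 * chain is fragmented; `struct ip` stands in for any 20-byte header, and
 * the chain must be at least that long or m_copydata() panics.
 *
 *	struct ip iphdr;
 *
 *	m_copydata(m, 0, sizeof (iphdr), &iphdr);
 */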
/*
 * Concatenate mbuf chain n to m.  Both chains must be of the same type
 * (e.g. MT_DATA).  Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
	while (m->m_next)
		m = m->m_next;
	while (n) {
		if ((m->m_flags & M_EXT) ||
		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
			/* just join the two chains */
			m->m_next = n;
			return;
		}
		/* splat the data from one into the other */
		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
		    (u_int)n->m_len);
		m->m_len += n->m_len;
		n = m_free(n);
	}
}
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			m = mp;
			if (m->m_flags & M_PKTHDR)
				m->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		while ((m = m->m_next))
			m->m_len = 0;
	}
}
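/*
 * Illustrative sketch (not part of the original source): stripping a
 * 14-byte link-layer header from the front of a received packet and a
 * 4-byte checksum trailer from the end; a positive req_len trims from
 * the head, a negative one from the tail.
 *
 *	m_adj(m, 14);
 *	m_adj(m, -4);
 */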
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		_MGET(m, M_DONTWAIT, n->m_type);
		if (m == 0)
			goto bad;
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR) {
			M_COPY_PKTHDR(m, n);
			n->m_flags &= ~M_PKTHDR;
		}
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	return (0);
}
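/*
 * Illustrative sketch (not part of the original source): the classic use
 * of m_pullup() before casting the data pointer to a header structure;
 * `struct ip` stands in for any header type.
 *
 *	struct ip *ip;
 *
 *	if (m->m_len < (int)sizeof (struct ip) &&
 *	    (m = m_pullup(m, sizeof (struct ip))) == NULL)
 *		return;			// m_pullup() freed the chain
 *	ip = mtod(m, struct ip *);
 */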
/*
 * Like m_pullup(), except a new mbuf is always allocated, and we allow
 * the amount of empty space before the data in the new mbuf to be specified
 * (in the event that the caller expects to prepend later).
 */
__private_extern__ int MSFail = 0;

__private_extern__ struct mbuf *
m_copyup(struct mbuf *n, int len, int dstoff)
{
	struct mbuf *m;
	int count, space;

	if (len > (MHLEN - dstoff))
		goto bad;
	MGET(m, M_DONTWAIT, n->m_type);
	if (m == NULL)
		goto bad;
	m->m_len = 0;
	if (n->m_flags & M_PKTHDR) {
		m_copy_pkthdr(m, n);
		n->m_flags &= ~M_PKTHDR;
	}
	m->m_data += dstoff;
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	MSFail++;
	return (NULL);
}
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	return (m_split0(m0, len0, wait, 1));
}

static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	remain = m->m_len - len;
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & M_EXT)
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return (NULL);
			} else
				return (n);
		} else
			MH_ALIGN(n, remain);
	} else if (remain == 0) {
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & M_EXT) {
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}
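/*
 * Illustrative sketch (not part of the original source): splitting a
 * packet chain after its first 40 bytes of headers, leaving the headers
 * in `m` and the payload in `tail`.
 *
 *	struct mbuf *tail;
 *
 *	tail = m_split(m, 40, M_DONTWAIT);
 *	if (tail == NULL)
 *		return (ENOBUFS);	// original chain left intact
 */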
/*
 * Routine to copy from device local memory into mbufs.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int off = off0, len;
	char *cp;
	char *epkt;

	cp = buf;
	epkt = cp + totlen;
	if (off) {
		/*
		 * If 'off' is non-zero, packet is trailer-encapsulated,
		 * so we have to skip the type and length fields.
		 */
		cp += off + 2 * sizeof (u_int16_t);
		totlen -= 2 * sizeof (u_int16_t);
	}
	_MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	m->m_len = MHLEN;

	while (totlen > 0) {
		if (top != NULL) {
			_MGET(m, M_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return (NULL);
			}
			m->m_len = MLEN;
		}
		len = MIN(totlen, epkt - cp);
		if (len >= MINCLSIZE) {
			MCLGET(m, M_DONTWAIT);
			if (m->m_flags & M_EXT) {
				m->m_len = len = MIN(len, m_maxsize(MC_CL));
			} else {
				/* give up when it's out of cluster mbufs */
				if (top != NULL)
					m_freem(top);
				m_freem(m);
				return (NULL);
			}
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (len < m->m_len) {
				if (top == NULL &&
				    len + max_linkhdr <= m->m_len)
					m->m_data += max_linkhdr;
				m->m_len = len;
			} else {
				len = m->m_len;
			}
		}
		if (copy)
			copy(cp, MTOD(m, caddr_t), (unsigned)len);
		else
			bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
		cp += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
		if (cp == epkt)
			cp = buf;
	}
	return (top);
}
#ifndef MBUF_GROWTH_NORMAL_THRESH
#define	MBUF_GROWTH_NORMAL_THRESH 25
#endif /* MBUF_GROWTH_NORMAL_THRESH */

/*
 * Cluster freelist allocation check.
 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		return (0);
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL))
			return (m_minlimit(MC_BIGCL) - m_bigclusters);

		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
			mb_growth = MB_GROWTH_NORMAL;
		else
			mb_growth = MB_GROWTH_AGGRESSIVE;

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
				return (0);

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL))
				i = num - m_infree(MC_BIGCL);
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh)
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		if ((i << 1) + sumclusters >= nclusters)
			i = (nclusters - sumclusters) >> 1;
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);

	} else { /* 16K CL */
		VERIFY(njcl > 0);
		/* Under minimum */
		if (m_16kclusters < MIN16KCL)
			return (MIN16KCL - m_16kclusters);
		if (m_16kclfree >= M16KCL_LOWAT)
			return (0);

		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree)
			i = num - m_16kclfree;

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree)
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
			i = m_maxlimit(MC_16KCL) - m_16kclusters;
		VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
	}
	return (i);
}
/*
 * Return the number of bytes in the mbuf chain, m.
 */
unsigned int
m_length(struct mbuf *m)
{
	struct mbuf *m0;
	unsigned int pktlen;

	if (m->m_flags & M_PKTHDR)
		return (m->m_pkthdr.len);

	pktlen = 0;
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		pktlen += m0->m_len;
	return (pktlen);
}
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
{
#if DEBUG
	struct mbuf *origm = m0;
	int error;
#endif /* DEBUG */

	if (m0 == NULL)
		return;

#if DEBUG
	error =
#endif /* DEBUG */
	m_copyback0(&m0, off, len, cp,
	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

#if DEBUG
	if (error != 0 || (m0 != NULL && origm != m0))
		panic("m_copyback");
#endif /* DEBUG */
}
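/*
 * Illustrative sketch (not part of the original source): overwriting a
 * 16-bit field that sits 24 bytes into a packet; the chain is grown if
 * it is currently shorter than off + len.
 *
 *	u_int16_t sum = 0;
 *
 *	m_copyback(m, 24, sizeof (sum), &sum);
 */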
struct mbuf *
m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
{
	int error;

	/* don't support chain expansion */
	VERIFY(off + len <= m_length(m0));

	error = m_copyback0(&m0, off, len, cp,
	    M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
	if (error) {
		/*
		 * no way to recover from partial success.
		 * just free the chain.
		 */
		m_freem(m0);
		return (NULL);
	}
	return (m0);
}

/*
 * m_makewritable: ensure the specified range writable.
 */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
	int error;
#if DEBUG
	struct mbuf *n;
	int origlen, reslen;

	origlen = m_length(*mp);
#endif /* DEBUG */

#if 0 /* M_COPYALL is large enough */
	if (len == M_COPYALL)
		len = m_length(*mp) - off; /* XXX */
#endif

	error = m_copyback0(mp, off, len, NULL,
	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DEBUG
	reslen = 0;
	for (n = *mp; n; n = n->m_next)
		reslen += n->m_len;
	if (origlen != reslen)
		panic("m_makewritable: length changed");
	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
		panic("m_makewritable: inconsist");
#endif /* DEBUG */

	return (error);
}
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
	int mlen;
	struct mbuf *m, *n;
	struct mbuf **mp;
	int totlen = 0;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

	mp = mp0;
	m = *mp;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
extend:
			if (!(flags & M_COPYBACK0_EXTEND))
				goto out;

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				tspace = MIN(tspace, off + len);
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL)
				goto out;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

#if DIAGNOSTIC
			if (!(flags & M_COPYBACK0_COW))
				panic("m_copyback0: read-only");
#endif /* DIAGNOSTIC */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL)
					goto enobufs;
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf.  copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL)
				goto enobufs;
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE)
					MCLGET(n, M_DONTWAIT);
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			if (n->m_len > len)
				n->m_len = len;

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE)
				datap = mtod(n, char *);
			else
				datap = NULL;
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0)
					*mp = m = m_free(m);
			}
			if (eatlen > 0)
				n->m_len -= eatlen;
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return (0);

enobufs:
	return (ENOBUFS);
}
uint64_t
mcl_to_paddr(char *addr)
{
	vm_offset_t base_phys;

	if (!MBUF_IN_MAP(addr))
		return (0);
	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];

	if (base_phys == 0)
		return (0);
	return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
}
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf *n, **np;
	struct mbuf *top;
	int copyhdr = 0;

	np = &top;
	top = NULL;
	if (m->m_flags & M_PKTHDR)
		copyhdr = 1;

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 * mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL)
					return (NULL);
				n->m_len = m->m_len;
				m_dup_pkthdr(n, m, how);
				bcopy(m->m_data, n->m_data, m->m_len);
				return (n);
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL)
				return (NULL);
			bcopy(m->m_data, n->m_data, m->m_len);
			n->m_len = m->m_len;
			return (n);
		}
	}
	while (m != NULL) {
#if BLUE_DEBUG
		kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr)
			n = _M_GETHDR(how, m->m_type);
		else
			n = _M_GET(how, m->m_type);
		if (n == NULL)
			goto nospace;
		if (m->m_flags & M_EXT) {
			if (m->m_len <= m_maxsize(MC_CL))
				n = m_mclget(n, how);
			else if (m->m_len <= m_maxsize(MC_BIGCL))
				n = m_mbigget(n, how);
			else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
				n = m_m16kget(n, how);
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			copyhdr = 0;
			if (!(n->m_flags & M_EXT))
				n->m_data = n->m_pktdat;
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	if (top == NULL)
		goto nospace;
	return (top);

nospace:
	m_freem(top);
	return (NULL);
}
#define	MBUF_MULTIPAGES(m)						\
	(((m)->m_flags & M_EXT) &&					\
	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))

static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
			len = NBPG;
		else if (!IS_P2ALIGNED(data, NBPG) &&
		    P2ROUNDUP(data, NBPG) < (data + len0))
			len = P2ROUNDUP(data, NBPG) - data;
		else
			len = len0;

		VERIFY(m->m_flags & M_EXT);
		m->m_data = (void *)data;
		m->m_len = len;

		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0)
			break;

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return (top);
}

struct mbuf *
m_normalize(struct mbuf *m)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		struct mbuf *n;

		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			struct mbuf *last;
			if ((m = m_expand(m, &last)) == NULL) {
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	if (expanded)
		atomic_add_32(&mb_normalized, 1);
	return (top);
}
/*
 * Append the specified data to the indicated mbuf chain,
 * extending the mbuf chain if the new data does not fit in
 * existing space.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
int
m_append(struct mbuf *m0, int len, caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space, remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_get(M_WAITOK, m->m_type);
		if (n == NULL)
			break;
		n->m_len = min(MLEN, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;
	return (remainder == 0);
}
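/*
 * Illustrative sketch (not part of the original source): appending a
 * small trailer to a packet while keeping m_pkthdr.len consistent.
 *
 *	u_int8_t tlr[8] = { 0 };
 *
 *	if (m_append(m, sizeof (tlr), (caddr_t)tlr) == 0)
 *		return (ENOBUFS);	// the whole trailer could not be appended
 */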
struct mbuf *
m_last(struct mbuf *m)
{
	while (m->m_next != NULL)
		m = m->m_next;
	return (m);
}

unsigned int
m_fixhdr(struct mbuf *m0)
{
	u_int32_t len;

	VERIFY(m0->m_flags & M_PKTHDR);

	len = m_length2(m0, NULL);
	m0->m_pkthdr.len = len;
	return (len);
}

unsigned int
m_length2(struct mbuf *m0, struct mbuf **last)
{
	struct mbuf *m;
	u_int32_t len;

	len = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		len += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	if (last != NULL)
		*last = m;
	return (len);
}
/*
 * Defragment an mbuf chain, returning the shortest possible chain of mbufs
 * and clusters.  If allocation fails and this cannot be completed, NULL will
 * be returned, but the passed in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain will be returned.
 *
 * If a non-packet header is passed in, the original mbuf (chain?) will
 * be returned unharmed.
 *
 * If offset is specified, the first mbuf in the chain will have a leading
 * space of the amount stated by the "off" parameter.
 *
 * This routine requires that the m_pkthdr.header field of the original
 * mbuf chain is cleared by the caller.
 */
struct mbuf *
m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, pktlen;

	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

	VERIFY(off < MHLEN);
	m_fixhdr(m0); /* Needed sanity check */

	pktlen = m0->m_pkthdr.len + off;
	if (pktlen > MHLEN)
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	else
		m_final = m_gethdr(how, MT_DATA);

	if (m_final == NULL)
		goto nospace;

	if (off > 0) {
		pktlen -= off;
		m_final->m_data += off;
	}

	/*
	 * Caller must have handled the contents pointed to by this
	 * pointer before coming here, as otherwise it will point to
	 * the original mbuf which will get freed upon success.
	 */
	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);

	if (m_dup_pkthdr(m_final, m0, how) == 0)
		goto nospace;

	m_new = m_final;

	while (progress < pktlen) {
		length = pktlen - progress;
		if (length > MCLBYTES)
			length = MCLBYTES;
		length -= ((m_new == m_final) ? off : 0);

		if (m_new == NULL) {
			if (length > MLEN)
				m_new = m_getcl(how, MT_DATA, 0);
			else
				m_new = m_get(how, MT_DATA);
			if (m_new == NULL)
				goto nospace;
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final)
			m_cat(m_final, m_new);
		m_new = NULL;
	}
	m_freem(m0);
	m0 = m_final;
	return (m0);
nospace:
	if (m_final)
		m_freem(m_final);
	return (NULL);
}

struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return (m_defrag_offset(m0, 0, how));
}
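/*
 * Illustrative sketch (not part of the original source): a driver whose
 * hardware supports only a small number of DMA segments can coalesce a
 * long chain before programming its descriptors.
 *
 *	struct mbuf *d;
 *
 *	d = m_defrag(m, M_DONTWAIT);
 *	if (d == NULL) {
 *		// m is still intact; requeue it for later or drop it
 *	} else {
 *		m = d;	// original chain has been freed
 *	}
 */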
void
m_mchtype(struct mbuf *m, int t)
{
	mtype_stat_inc(t);
	mtype_stat_dec(m->m_type);
	(m)->m_type = t;
}

void *
m_mtod(struct mbuf *m)
{
	return (MTOD(m, void *));
}

struct mbuf *
m_dtom(void *x)
{
	return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
}

void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}

/*
 * Return a pointer to mbuf/offset of location in mbuf chain.
 */
struct mbuf *
m_getptr(struct mbuf *m, int loc, int *off)
{
	while (loc >= 0) {
		/* Normal end of search. */
		if (m->m_len > loc) {
			*off = loc;
			return (m);
		} else {
			loc -= m->m_len;

			if (m->m_next == NULL) {
				if (m->m_len == 0) {
					/* Point at the end of valid data. */
					*off = m->m_len;
					return (m);
				}
				return (NULL);
			}
			m = m->m_next;
		}
	}
	return (NULL);
}
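/*
 * Illustrative sketch (not part of the original source): locating the
 * mbuf and in-mbuf offset that hold byte 200 of a chain, e.g. before
 * patching a field in place.
 *
 *	int off;
 *	struct mbuf *p;
 *
 *	p = m_getptr(m, 200, &off);
 *	if (p != NULL)
 *		*(mtod(p, u_int8_t *) + off) = 0;
 */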
/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
static void
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_inc(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		}
	}
}

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
static void
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_dec(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		}
	}
}
/*
 * Called during slab (blocking and non-blocking) allocation.  If there
 * is at least one waiter, and the time since the first waiter is blocked
 * is greater than the watchdog timeout, panic the system.
 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;

	if (mb_waiters == 0 || !mb_watchdog)
		return;

	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
}

/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing?  Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mcache_retry = TRUE;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer.  Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0)
		microuptime(&mb_wdtstart);
	else
		mbuf_watchdog();

	mb_waiters++;
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mcache_retry = TRUE;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
	}
done:
	return (mcache_retry);
}
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);

		mbuf_expand = 0;
		if (mbuf_expand_mcl) {
			int n;

			/* Adjust to current number of cluster in use */
			n = mbuf_expand_mcl -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			mbuf_expand_mcl = 0;

			if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_big) {
			int n;

			/* Adjust to current number of 4 KB cluster in use */
			n = mbuf_expand_big -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			mbuf_expand_big = 0;

			if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_16k) {
			int n;

			/* Adjust to current number of 16 KB cluster in use */
			n = mbuf_expand_16k -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			mbuf_expand_16k = 0;

			if (n > 0)
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		if (mbuf_expand) {
			while (m_total(MC_MBUF) <
			    (m_total(MC_BIGCL) + m_total(MC_CL))) {
				if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
					break;
			}
		}

		lck_mtx_unlock(mbuf_mlock);

		assert_wait(&mbuf_worker_run, THREAD_UNINT);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}

static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(slg != NULL);

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++)
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	ix = MTOBG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return (&slg->slg_slab[ix]);
}
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;
	sp->sl_flags = flags;
	sp->sl_base = base;
	sp->sl_head = head;
	sp->sl_len = len;
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks;
	slab_detach(sp);
}

static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}

static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}

static boolean_t
slab_inrange(mcl_slab_t *sp, void *buf)
{
	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
	    (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
}

static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;

		if (buf != (uintptr_t)addr)
			continue;

		if (next != NULL && !MBUF_IN_MAP(next)) {
			mcache_t *cp = m_cache(sp->sl_class);
			panic("%s: %s buffer %p in slab %p modified "
			    "after free at offset 0: %p out of range "
			    "[%p-%p)\n", __func__, cp->mc_name,
			    (void *)buf, sp, next, mbutl, embutl);
			/* NOTREACHED */
		}

		if (mclaudit != NULL) {
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}

static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}

static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
	return ((intptr_t)sp->sl_link.tqe_next == -1 &&
	    (intptr_t)sp->sl_link.tqe_prev == -1 &&
	    (sp->sl_flags & SLF_DETACHED));
}
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPBG);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOBG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < NMBPBG; i++)
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);

	mca = mca_tail = *mca_list;
	if (save_contents)
		con = *con_list;

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		next = mca->mca_next;
		bzero(mca, sizeof (*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
			VERIFY(con_size == sizeof (*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents)
		*con_list = con;

	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}

static void
mcl_audit_free(void *buf, unsigned int num)
{
	unsigned int i, ix;
	mcache_audit_t *mca, *mca_list;

	ix = MTOBG(buf);
	VERIFY(ix < maxclaudit);

	if (mclaudit[ix].cl_audit[0] != NULL) {
		mca_list = mclaudit[ix].cl_audit[0];
		for (i = 0; i < num; i++) {
			mca = mclaudit[ix].cl_audit[i];
			mclaudit[ix].cl_audit[i] = NULL;
			if (mca->mca_contents)
				mcache_free(mcl_audit_con_cache,
				    mca->mca_contents);
		}
		mcache_free_ext(mcache_audit_cache,
		    (mcache_obj_t *)mca_list);
	}
}

/*
 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOBG(o);

	VERIFY(ix < maxclaudit);
	VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the page
		 * used by the mbuf and use that index to locate the
		 * base address of the page.  Then find out the
		 * mbuf index relative to the page base and use
		 * it to locate the audit structure.
		 */
		VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
		mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
		break;

	case MC_CL:
		/*
		 * Same thing as above, but for 2KB clusters in a page.
		 */
		VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
		mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
		break;

	case MC_BIGCL:
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return (mca);
}
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify)
		mcl_audit_verify_nextptr(next, mca);

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}

static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		struct mbuf *next = m->m_next;
		VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	_MCHECK(m);
}

static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	_MCHECK(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}

static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof (stack));
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0)
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
}

static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	mcache_audit_t *mca;

	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
	/* NOTREACHED */
}

static void
mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
{
	if (next != NULL && !MBUF_IN_MAP(next) &&
	    (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
		panic("mcl_audit: buffer %p modified after free at offset 0: "
		    "%p out of range [%p-%p)\n%s\n",
		    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
		/* NOTREACHED */
	}
}
/* This function turns on mbuf leak detection */
static void
mleak_activate(void)
{
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof (mleak_table.mleak_sample_factor));

	if (mleak_table.mleak_sample_factor == 0)
		mclfindleak = 0;

	if (mclfindleak == 0)
		return;

	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof (struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);

	MALLOC(mleak_allocations, struct mallocation *, alloc_size,
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mleak_allocations != NULL);

	MALLOC(mleak_traces, struct mtrace *, trace_size,
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mleak_traces != NULL);

	MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mleak_stat != NULL);
	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}

static void
mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
{
	int temp;

	if (mclfindleak == 0)
		return;

	if (!alloc)
		return (mleak_free(addr));

	temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);

	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
		uintptr_t bt[MLEAK_STACK_DEPTH];
		int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
		mleak_log(bt, addr, logged, num);
	}
}
6750 * This function records the allocation in the mleak_allocations table
6751 * and the backtrace in the mleak_traces table; if allocation slot is in use,
6752 * replace old allocation with new one if the trace slot is in use, return
6753 * (or increment refcount if same trace).
static boolean_t
mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
{
	struct mallocation *allocation;
	struct mtrace *trace;
	uint32_t trace_index;

	/* Quit if someone else modifying the tables */
	if (!lck_mtx_try_lock_spin(mleak_lock)) {
		mleak_table.total_conflicts++;
		return (FALSE);
	}

	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)];
	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
	trace = &mleak_traces[trace_index];

	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);

	allocation->hitcount++;
	trace->hitcount++;

	/*
	 * If the allocation bucket we want is occupied
	 * and the occupier has the same trace, just bail.
	 */
	if (allocation->element != NULL &&
	    trace_index == allocation->trace_index) {
		mleak_table.alloc_collisions++;
		lck_mtx_unlock(mleak_lock);
		return (TRUE);
	}

	/*
	 * Store the backtrace in the traces array;
	 * Size of zero = trace bucket is free.
	 */
	if (trace->allocs > 0 &&
	    bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
		/* Different, unique trace, but the same hash!  Bail out. */
		trace->collisions++;
		mleak_table.trace_collisions++;
		lck_mtx_unlock(mleak_lock);
		return (TRUE);
	} else if (trace->allocs > 0) {
		/* Same trace, already added, so increment refcount */
		trace->allocs++;
	} else {
		/* Found an unused trace bucket, so record the trace here */
		if (trace->depth != 0) {
			/* this slot previously used but not currently in use */
			mleak_table.trace_overwrites++;
		}
		mleak_table.trace_recorded++;
		trace->allocs = 1;
		memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
		trace->depth = depth;
		trace->collisions = 0;
	}

	/* Step 2: Store the allocation record in the allocations array */
	if (allocation->element != NULL) {
		/*
		 * Replace an existing allocation.  No need to preserve
		 * because only a subset of the allocations are being
		 * recorded anyway.
		 */
		mleak_table.alloc_collisions++;
	} else if (allocation->trace_index != 0) {
		mleak_table.alloc_overwrites++;
	}
	allocation->element = addr;
	allocation->trace_index = trace_index;
	allocation->count = num;
	mleak_table.alloc_recorded++;
	mleak_table.outstanding_allocs++;

	lck_mtx_unlock(mleak_lock);
	return (TRUE);
}
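
/*
 * mleak_free() undoes the bookkeeping done by mleak_log(): it walks the
 * obj_next chain of freed objects and, for each address that still matches
 * its allocation bucket, drops the reference on the associated trace and
 * marks the bucket unused.
 */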
static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0)
					trace->allocs--;
				if (trace->allocs == 0)
					trace->depth = 0;
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}
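
/*
 * mleak_sort_traces() selects the MLEAK_NUM_TRACES busiest trace buckets
 * and keeps them in mleak_top_trace[], ordered by descending allocation
 * count.  The first pass seeds the array with an insertion sort; the second
 * pass scans the remaining buckets and sifts any larger entry into place.
 */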
static void
mleak_sort_traces()
{
	int i, j, k;
	struct mtrace *swap;

	for (i = 0; i < MLEAK_NUM_TRACES; i++)
		mleak_top_trace[i] = NULL;

	for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
	{
		if (mleak_traces[i].allocs <= 0)
			continue;

		mleak_top_trace[j] = &mleak_traces[i];
		for (k = j; k > 0; k--) {
			if (mleak_top_trace[k]->allocs <=
			    mleak_top_trace[k-1]->allocs)
				break;

			swap = mleak_top_trace[k-1];
			mleak_top_trace[k-1] = mleak_top_trace[k];
			mleak_top_trace[k] = swap;
		}
		j++;
	}

	j--;
	for (; i < mleak_trace_buckets; i++) {
		if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
			continue;

		mleak_top_trace[j] = &mleak_traces[i];

		for (k = j; k > 0; k--) {
			if (mleak_top_trace[k]->allocs <=
			    mleak_top_trace[k-1]->allocs)
				break;

			swap = mleak_top_trace[k-1];
			mleak_top_trace[k-1] = mleak_top_trace[k];
			mleak_top_trace[k] = swap;
		}
	}
}
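
/*
 * mleak_update_stats() refreshes the exported mleak_stat structure: it
 * re-sorts the trace buckets and copies the top traces (collision, hit and
 * allocation counts plus the captured backtrace) into mleak_stat->ml_trace[]
 * for consumption by mbuf_dump() and the mleak_top_trace sysctl below.
 */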
static void
mleak_update_stats()
{
	mleak_trace_stat_t *mltr;
	int i;

	VERIFY(mleak_stat != NULL);
#ifdef __LP64__
	VERIFY(mleak_stat->ml_isaddr64);
#else
	VERIFY(!mleak_stat->ml_isaddr64);
#endif /* !__LP64__ */
	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);

	mleak_sort_traces();

	mltr = &mleak_stat->ml_trace[0];
	bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
		int j;

		if (mleak_top_trace[i] == NULL ||
		    mleak_top_trace[i]->allocs == 0)
			continue;

		mltr->mltr_collisions = mleak_top_trace[i]->collisions;
		mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
		mltr->mltr_allocs = mleak_top_trace[i]->allocs;
		mltr->mltr_depth = mleak_top_trace[i]->depth;

		VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
		for (j = 0; j < mltr->mltr_depth; j++)
			mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];

		mltr++;
	}
}
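
/*
 * Human-readable names for the per-type mbuf counters in mbstat.m_mtypes[],
 * used by mbuf_dump() below when formatting the usage report.
 */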
static struct mbtypes {
	int		mt_type;
	const char	*mt_name;
} mbtypes[] = {
	{ MT_DATA,	"data" },
	{ MT_OOBDATA,	"oob data" },
	{ MT_CONTROL,	"ancillary data" },
	{ MT_HEADER,	"packet headers" },
	{ MT_SOCKET,	"socket structures" },
	{ MT_PCB,	"protocol control blocks" },
	{ MT_RTABLE,	"routing table entries" },
	{ MT_HTABLE,	"IMP host table entries" },
	{ MT_ATABLE,	"address resolution tables" },
	{ MT_FTABLE,	"fragment reassembly queue headers" },
	{ MT_SONAME,	"socket names and addresses" },
	{ MT_SOOPTS,	"socket options" },
	{ MT_RIGHTS,	"access rights" },
	{ MT_IFADDR,	"interface addresses" },
	{ MT_TAG,	"packet tags" },
	{ 0,		NULL }
};
#define	MBUF_DUMP_BUF_CHK() {	\
	clen -= k;		\
	if (clen < 1)		\
		goto done;	\
	c += k;			\
}

static char *
mbuf_dump(void)
{
	unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
	u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
	u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
	u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
	int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
	uint8_t seen[256];
	struct mbtypes *mp;
	mb_class_stat_t *sp;
	mleak_trace_stat_t *mltr;
	char *c = mbuf_dump_buf;
	int i, k, clen = MBUF_DUMP_BUF_SIZE;

	mbuf_dump_buf[0] = '\0';

	/* synchronize all statistics in the mbuf table */
	mbuf_stat_sync();
	mbuf_mtypes_sync(TRUE);

	sp = &mb_stat->mbs_class[0];
	for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
		u_int32_t mem;

		if (m_class(i) == MC_MBUF) {
			m_mbufs = sp->mbcl_active;
		} else if (m_class(i) == MC_CL) {
			m_clfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_BIGCL) {
			m_bigclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (njcl > 0 && m_class(i) == MC_16KCL) {
			m_16kclfree = sp->mbcl_total - sp->mbcl_active;
			m_16kclusters = sp->mbcl_total;
		} else if (m_class(i) == MC_MBUF_CL) {
			m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_MBUF_BIGCL) {
			m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
			m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
		}

		mem = sp->mbcl_ctotal * sp->mbcl_size;
		totmem += mem;
		totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
		    sp->mbcl_size;
	}

	/* adjust free counts to include composite caches */
	m_clfree += m_mbufclfree;
	m_bigclfree += m_mbufbigclfree;
	m_16kclfree += m_mbuf16kclfree;

	totmbufs = 0;
	for (mp = mbtypes; mp->mt_name != NULL; mp++)
		totmbufs += mbstat.m_mtypes[mp->mt_type];
	if (totmbufs > m_mbufs)
		totmbufs = m_mbufs;
	k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
	MBUF_DUMP_BUF_CHK();

	bzero(&seen, sizeof (seen));
	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
		if (mbstat.m_mtypes[mp->mt_type] != 0) {
			seen[mp->mt_type] = 1;
			k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
			    mbstat.m_mtypes[mp->mt_type], mp->mt_name);
			MBUF_DUMP_BUF_CHK();
		}
	}
	seen[MT_FREE] = 1;
	for (i = 0; i < nmbtypes; i++)
		if (!seen[i] && mbstat.m_mtypes[i] != 0) {
			k = snprintf(c, clen, "\t%u mbufs allocated to "
			    "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
			MBUF_DUMP_BUF_CHK();
		}
	if ((m_mbufs - totmbufs) > 0) {
		k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
		    m_mbufs - totmbufs);
		MBUF_DUMP_BUF_CHK();
	}
	k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
	    "%u/%u mbuf 4KB clusters in use\n",
	    (unsigned int)(mbstat.m_clusters - m_clfree),
	    (unsigned int)mbstat.m_clusters,
	    (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
	    (unsigned int)mbstat.m_bigclusters);
	MBUF_DUMP_BUF_CHK();

	if (njcl > 0) {
		k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
		    m_16kclusters - m_16kclfree, m_16kclusters,
		    njclbytes / 1024);
		MBUF_DUMP_BUF_CHK();
	}
	totused = totmem - totfree;
	if (totmem == 0) {
		totpct = 0;
	} else if (totused < (ULONG_MAX / 100)) {
		totpct = (totused * 100) / totmem;
	} else {
		u_long totmem1 = totmem / 100;
		u_long totused1 = totused / 100;
		totpct = (totused1 * 100) / totmem1;
	}
	k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
	    "in use)\n", totmem / 1024, totpct);
	MBUF_DUMP_BUF_CHK();

	/* mbuf leak detection statistics */
	mleak_update_stats();

	k = snprintf(c, clen, "\nmbuf leak detection table:\n");
	MBUF_DUMP_BUF_CHK();
	k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
	    mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
	    mleak_table.mleak_sample_factor);
	MBUF_DUMP_BUF_CHK();
	k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
	    mleak_table.outstanding_allocs);
	MBUF_DUMP_BUF_CHK();
	k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
	    mleak_table.alloc_recorded, mleak_table.trace_recorded);
	MBUF_DUMP_BUF_CHK();
	k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
	    mleak_table.alloc_collisions, mleak_table.trace_collisions);
	MBUF_DUMP_BUF_CHK();
	k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
	    mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
	MBUF_DUMP_BUF_CHK();
	k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
	    mleak_table.total_conflicts);
	MBUF_DUMP_BUF_CHK();

	k = snprintf(c, clen, "top %d outstanding traces:\n",
	    mleak_stat->ml_cnt);
	MBUF_DUMP_BUF_CHK();
	for (i = 0; i < mleak_stat->ml_cnt; i++) {
		mltr = &mleak_stat->ml_trace[i];
		k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
		    "%llu hit(s), %llu collision(s)\n", (i + 1),
		    mltr->mltr_allocs, mltr->mltr_hitcount,
		    mltr->mltr_collisions);
		MBUF_DUMP_BUF_CHK();
	}

	if (mleak_stat->ml_isaddr64)
		k = snprintf(c, clen, MB_LEAK_HDR_64);
	else
		k = snprintf(c, clen, MB_LEAK_HDR_32);
	MBUF_DUMP_BUF_CHK();

	for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
		int j;
		k = snprintf(c, clen, "%2d: ", (i + 1));
		MBUF_DUMP_BUF_CHK();
		for (j = 0; j < mleak_stat->ml_cnt; j++) {
			mltr = &mleak_stat->ml_trace[j];
			if (i < mltr->mltr_depth) {
				if (mleak_stat->ml_isaddr64) {
					k = snprintf(c, clen, "0x%0llx ",
					    (uint64_t)VM_KERNEL_UNSLIDE(
					    mltr->mltr_addr[i]));
				} else {
					k = snprintf(c, clen,
					    "0x%08x ",
					    (uint32_t)VM_KERNEL_UNSLIDE(
					    mltr->mltr_addr[i]));
				}
			} else {
				if (mleak_stat->ml_isaddr64)
					k = snprintf(c, clen,
					    MB_LEAK_SPACING_64);
				else
					k = snprintf(c, clen,
					    MB_LEAK_SPACING_32);
			}
			MBUF_DUMP_BUF_CHK();
		}
		k = snprintf(c, clen, "\n");
		MBUF_DUMP_BUF_CHK();
	}
done:
	return (mbuf_dump_buf);
}

#undef	MBUF_DUMP_BUF_CHK

/*
 * Convert between a regular and a packet header mbuf.  Caller is responsible
 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
 */
int
m_reinit(struct mbuf *m, int hdr)
{
	int ret = 0;

	if (hdr) {
		VERIFY(!(m->m_flags & M_PKTHDR));
		if (!(m->m_flags & M_EXT) &&
		    (m->m_data != m->m_dat || m->m_len > 0)) {
			/*
			 * If there's no external cluster attached and the
			 * mbuf appears to contain user data, we cannot
			 * safely convert this to a packet header mbuf,
			 * as the packet header structure might overlap
			 * with the data.
			 */
			printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
			    "m_data %llx (expected %llx), "
			    "m_len %d (expected 0)\n",
			    __func__,
			    (uint64_t)VM_KERNEL_ADDRPERM(m),
			    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
			    (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
			ret = EBUSY;
		} else {
			VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
			m->m_flags |= M_PKTHDR;
			MBUF_INIT_PKTHDR(m);
		}
	} else {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m, NULL);
		m->m_flags &= ~M_PKTHDR;
	}

	return (ret);
}
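
/*
 * m_scratch_init() clears the module-private scratch area in the packet
 * header.  It must not be called while a driver owns that area, i.e. while
 * PKTF_PRIV_GUARDED is set; such a call is treated as a fatal error.
 */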
__private_extern__ void
m_scratch_init(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to modify guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
}

/*
 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
 * xnu that intend on utilizing the module-private area should directly
 * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
 * to handing it off to another module, respectively.
 */
__private_extern__ u_int32_t
m_scratch_get(struct mbuf *m, u_int8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	if (mcltrace) {
		mcache_audit_t *mca;

		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		if (mca->mca_uflags & MB_SCVALID)
			mcl_audit_scratch(mca);
		lck_mtx_unlock(mbuf_mlock);
	}

	*p = (u_int8_t *)&pkt->pkt_mpriv;
	return (sizeof (pkt->pkt_mpriv));
}

__private_extern__ inline void
m_redzone_init(struct mbuf *m)
{
	VERIFY(m->m_flags & M_PKTHDR);
	/*
	 * Each mbuf has a unique red zone pattern, which is a XOR
	 * of the red zone cookie and the address of the mbuf.
	 */
	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
}

static void
m_redzone_verify(struct mbuf *m)
{
	u_int32_t mb_redzone;

	VERIFY(m->m_flags & M_PKTHDR);

	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
	if (m->m_pkthdr.redzone != mb_redzone) {
		panic("mbuf %p redzone violation with value 0x%x "
		    "(instead of 0x%x, using cookie 0x%x)\n",
		    m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
		/* NOTREACHED */
	}
}

/*
 * Send a report of mbuf usage if the usage is at least 6% of max limit
 * or if there has been at least 3% increase since the last report.
 *
 * The values 6% and 3% are chosen so that we can do simple arithmetic
 * with shift operations.
 */
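/*
 * Concretely, the thresholds below are computed with shifts:
 * m_maxlimit(cl) >> 4 is 1/16th (~6.25%) of the class limit, and
 * m_peak(cl) >> 5 is 1/32nd (~3.125%) of the previously recorded peak.
 */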
static boolean_t
mbuf_report_usage(mbuf_class_t cl)
{
	/* if a report is already in progress, nothing to do */
	if (mb_peak_newreport)
		return (TRUE);

	if (m_total(cl) > m_peak(cl) &&
	    m_total(cl) >= (m_maxlimit(cl) >> 4) &&
	    (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
		return (TRUE);

	return (FALSE);
}

__private_extern__ void
mbuf_report_peak_usage(void)
{
	int i = 0;
	u_int64_t uptime;
	struct nstat_sysinfo_data ns_data;
	uint32_t memreleased = 0;

	uptime = net_uptime();
	lck_mtx_lock(mbuf_mlock);

	/* Generate an initial report after 1 week of uptime */
	if (!mb_peak_firstreport &&
	    uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
		mb_peak_newreport = TRUE;
		mb_peak_firstreport = TRUE;
	}

	if (!mb_peak_newreport) {
		lck_mtx_unlock(mbuf_mlock);
		return;
	}

	/*
	 * Since a report is being generated before 1 week,
	 * we do not need to force another one later
	 */
	if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
		mb_peak_firstreport = TRUE;

	for (i = 0; i < NELEM(mbuf_table); i++) {
		m_peak(m_class(i)) = m_total(m_class(i));
		memreleased += m_release_cnt(i);
	}
	mb_peak_newreport = FALSE;
	lck_mtx_unlock(mbuf_mlock);

	bzero(&ns_data, sizeof(ns_data));
	ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
	ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
	ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
	ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
	ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
	ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
	ns_data.u.mb_stats.draincnt = mbstat.m_drain;
	ns_data.u.mb_stats.memreleased = memreleased;

	nstat_sysinfo_send_data(&ns_data);
}

/*
 * Called by the VM when there's memory pressure.
 */
__private_extern__ void
m_drain(void)
{
	mbuf_class_t mc;
	mcl_slab_t *sp, *sp_tmp, *nsp;
	unsigned int num, k, interval, released = 0;
	unsigned int total_mem = 0, use_mem = 0;
	boolean_t ret, purge_caches = FALSE;
	ppnum_t offset;
	mcache_obj_t *obj;
	float per;
	static uint64_t last_drain = 0;
	static unsigned char scratch[32];
	static ppnum_t scratch_pa = 0;

	if (mb_drain_maxint == 0 || mb_waiters)
		return;
	if (scratch_pa == 0) {
		bzero(scratch, sizeof(scratch));
		scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
		VERIFY(scratch_pa);
	} else if (mclverify) {
		/*
		 * Panic if a driver wrote to our scratch memory.
		 */
		for (k = 0; k < sizeof(scratch); k++)
			if (scratch[k])
				panic("suspect DMA to freed address");
	}
	/*
	 * Don't free memory too often as that could cause excessive
	 * waiting times for mbufs.  Purge caches if we were asked to drain
	 * in the last 5 minutes.
	 */
	lck_mtx_lock(mbuf_mlock);
	if (last_drain == 0) {
		last_drain = net_uptime();
		lck_mtx_unlock(mbuf_mlock);
		return;
	}
	interval = net_uptime() - last_drain;
	if (interval <= mb_drain_maxint) {
		lck_mtx_unlock(mbuf_mlock);
		return;
	}
	if (interval <= mb_drain_maxint * 5)
		purge_caches = TRUE;
	last_drain = net_uptime();
	/*
	 * Don't free any memory if we're using 60% or more.
	 */
	for (mc = 0; mc < NELEM(mbuf_table); mc++) {
		total_mem += m_total(mc) * m_maxsize(mc);
		use_mem += m_active(mc) * m_maxsize(mc);
	}
	per = (float)use_mem / (float)total_mem;
	if (per >= 0.6) {
		lck_mtx_unlock(mbuf_mlock);
		return;
	}
	/*
	 * Purge all the caches.  This effectively disables
	 * caching for a few seconds, but the mbuf worker thread will
	 * re-enable them again.
	 */
	if (purge_caches == TRUE)
		for (mc = 0; mc < NELEM(mbuf_table); mc++) {
			if (m_total(mc) < m_avgtotal(mc))
				continue;
			lck_mtx_unlock(mbuf_mlock);
			ret = mcache_purge_cache(m_cache(mc), FALSE);
			lck_mtx_lock(mbuf_mlock);
			if (ret == TRUE)
				m_purge_cnt(mc)++;
		}
	/*
	 * Move the objects from the composite class freelist to
	 * the rudimentary slabs list, but keep at least 10% of the average
	 * total in the freelist.
	 */
	for (mc = 0; mc < NELEM(mbuf_table); mc++) {
		while (m_cobjlist(mc) &&
		    m_total(mc) < m_avgtotal(mc) &&
		    m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
			obj = m_cobjlist(mc);
			m_cobjlist(mc) = obj->obj_next;
			obj->obj_next = NULL;
			num = cslab_free(mc, obj, 1);
			VERIFY(num == 1);
			m_infree(mc)--;
			/* cslab_free() handles m_total */
		}
	}
	/*
	 * Free the buffers present in the slab list up to 10% of the total
	 * average per class.
	 *
	 * We walk the list backwards in an attempt to reduce fragmentation.
	 */
	for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
		TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
			/*
			 * Process only unused slabs occupying memory.
			 */
			if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
			    sp->sl_base == NULL)
				continue;
			if (m_total(mc) < m_avgtotal(mc) ||
			    m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
				break;
			slab_remove(sp, mc);
			switch (mc) {
			case MC_MBUF:
				m_infree(mc) -= NMBPBG;
				m_total(mc) -= NMBPBG;
				if (mclaudit != NULL)
					mcl_audit_free(sp->sl_base, NMBPBG);
				break;
			case MC_CL:
				m_infree(mc) -= NCLPBG;
				m_total(mc) -= NCLPBG;
				if (mclaudit != NULL)
					mcl_audit_free(sp->sl_base, NMBPBG);
				break;
			case MC_BIGCL:
				m_infree(mc)--;
				m_total(mc)--;
				if (mclaudit != NULL)
					mcl_audit_free(sp->sl_base, NMBPBG);
				break;
			case MC_16KCL:
				m_infree(mc)--;
				m_total(mc)--;
				for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
					nsp = nsp->sl_next;
					VERIFY(nsp->sl_refcnt == 0 &&
					    nsp->sl_base != NULL &&
					    nsp->sl_len == 0);
					slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
					    0);
					nsp->sl_flags = 0;
				}
				if (mclaudit != NULL)
					mcl_audit_free(sp->sl_base, 1);
				break;
			default:
				/*
				 * The composite classes have their own
				 * freelist (m_cobjlist), so we only
				 * process rudimentary classes here.
				 */
				VERIFY(0);
			}
			m_release_cnt(mc) += m_size(mc);
			released += m_size(mc);
			offset = ((char *)sp->sl_base - (char *)mbutl) / NBPG;
			/*
			 * Make sure the IOMapper points to a valid, but
			 * bogus, address.  This should prevent further DMA
			 * accesses to freed memory.
			 */
			IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
			mcl_paddr[offset] = 0;
			kmem_free(mb_map, (vm_offset_t)sp->sl_base,
			    sp->sl_len);
			slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
			sp->sl_flags = 0;
		}
	}
	mbstat.m_drain++;
	mbstat.m_bigclusters = m_total(MC_BIGCL);
	mbstat.m_clusters = m_total(MC_CL);
	mbstat.m_mbufs = m_total(MC_MBUF);
	mbuf_stat_sync();
	mbuf_mtypes_sync(TRUE);
	lck_mtx_unlock(mbuf_mlock);
}
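
/*
 * Sysctl handler behind kern.ipc.mb_drain_force (declared below): writing
 * a non-zero value is intended to trigger an immediate m_drain() pass.
 * Illustrative usage from user space (not part of this file):
 *
 *	sysctl -w kern.ipc.mb_drain_force=1
 */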
static int
m_drain_force_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int val = 0, err;

	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL)
		return (err);
	if (val)
		m_drain();

	return (err);
}

SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    m_drain_force_sysctl, "I",
    "Forces the mbuf garbage collection to run");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection");