/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <security/mac_framework.h>

#include <sys/mcache.h>
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents a mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of NBPG in size; each
 *	object represents a mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by the MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by the MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *	These enter through mcache_alloc/mcache_alloc_ext(), which first
 *	consults the [CPU cache].  On a miss, mbuf_slab_alloc() searches
 *	the global [freelist]; if that is also empty, kmem_mb_alloc() maps
 *	in additional memory from the VM.  On the way back to the caller,
 *	mbuf_slab_audit() verifies the objects when auditing is enabled.
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *	These also enter through mcache_alloc/mcache_alloc_ext(); on a
 *	[CPU cache] miss, mbuf_cslab_alloc() searches the composite
 *	[freelist], and failing that constructs new composite objects from
 *	the rudimentary caches via mcache_alloc/mcache_alloc_ext().
 *	mbuf_cslab_audit() verifies the objects when auditing is enabled.
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to
 * the caller.  As part of this step, the routine will also record the
 * transaction and pattern-fill the buffers with the BADDCAFE
 * (uninitialized) pattern.  It will also restore any constructed data
 * structure fields if necessary.
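 *
 * Illustrative sketch (an assumption about the caller-visible mcache API,
 * not a verbatim excerpt from this file): a rudimentary allocation as seen
 * from a caller might look like
 *
 *	mcache_obj_t *o = mcache_alloc(m_cache(MC_MBUF), MCR_NOSLEEP);
 *
 * which is satisfied from the per-CPU bucket when possible and otherwise
 * falls through to mbuf_slab_alloc() -> [freelist] -> kmem_mb_alloc()
 * as described above.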
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *	These enter through mcache_free/mcache_free_ext(), which runs
 *	mbuf_slab_audit() when auditing is enabled and then places the
 *	object into the [CPU cache].  Only when the cache is purging does
 *	mbuf_slab_free() return the object to the global [freelist];
 *	objects never get purged back to the VM.
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *	These enter through mcache_free/mcache_free_ext(), which runs
 *	mbuf_cslab_audit() when auditing is enabled and then places the
 *	object into the [CPU cache].  When the cache is purging,
 *	mbuf_cslab_free() returns the composite object to its [freelist],
 *	and on a purge of that freelist the constituent rudimentary objects
 *	are handed back via mcache_free/mcache_free_ext().
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording
 * the transaction.  Buffers that are freed (whether at the CPU or slab
 * layer) are expected to contain the free pattern.
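 *
 * Illustrative sketch (assumed mcache usage, for orientation only): the
 * corresponding free is simply
 *
 *	mcache_free(m_cache(MC_MBUF), o);
 *
 * which parks the object in the CPU cache; slab_free() is reached only
 * when the cache layer later purges or shrinks its buckets.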
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Note
 * that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOCL(addr)			+-------------+
 *	      |			+----->	| cl_audit[1] | -----> mcache_audit_t
 *	b = CLTOM(i)		|	+-------------+
 *	      |			|	|     ...     |
 *	x = MCLIDX(b, addr)	|	+-------------+
 *	      |			|	| cl_audit[7] |
 *	      +-----------------+	+-------------+
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a cluster
 * can be turned into NMBPCL mbufs, we preserve enough space for the mbufs so
 * that there is a 1-to-1 mapping between them.  A cluster that never gets
 * (or has not yet been) turned into mbufs will use only cl_audit[0], with the
 * remaining entries unused.  For big clusters, only one entry is allocated
 * and used for the entire cluster pair.
 */
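/*
 * Worked example of the lookup above (assuming MCLBYTES == 2048 and
 * MSIZE == 256): for an mbuf at address addr, i = MTOCL(addr) names the
 * 2K cluster, b = CLTOM(i) is that cluster's base, and x = MCLIDX(b, addr)
 * is (addr - b) / MSIZE, i.e. 0..NMBPCL-1; the audit record is then
 * mclaudit[i].cl_audit[x].
 */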
/* TODO: should be in a header file */
/* kernel translater */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */
static lck_mtx_t *mbuf_mlock;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;
/* Back-end (common) layer */
static void *mbuf_worker_run;		/* wait channel for worker thread */
static int mbuf_worker_ready;		/* worker thread is runnable */
static int mbuf_expand_mcl;		/* number of cluster creation requests */
static int mbuf_expand_big;		/* number of big cluster creation requests */
static int mbuf_expand_16k;		/* number of 16K cluster creation requests */
static int ncpu;			/* number of CPUs */
static int *mcl_paddr;			/* Array of cluster physical addresses */
static ppnum_t mcl_paddr_base;		/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;		/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache;	/* Audit contents cache */
static unsigned int mbuf_debug;		/* patchable mbuf mcache flags */
static unsigned int mb_normalized;	/* number of packets "normalized" */
typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Regular (2K) cluster */
	MC_BIGCL,	/* Large (4K) cluster */
	MC_16KCL,	/* Jumbo (16K) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4K) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16K) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
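/*
 * Example: with the ordering above, MBUF_CLASS_COMPOSITE() is true only for
 * MC_MBUF_CL, MC_MBUF_BIGCL and MC_MBUF_16KCL, since those follow
 * MBUF_CLASS_LAST (MC_16KCL) in the enumeration.
 */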
/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back into the class's slab list,
 * if it's not already there.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */
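/*
 * Illustrative lifecycle (sketch, based on slab_alloc()/slab_free() below):
 * a freshly mapped 2K cluster slab starts with sl_refcnt == 0,
 * sl_chunks == 1 and sl_head pointing at sl_base.  slab_alloc() pops
 * sl_head and increments sl_refcnt; once sl_head drains, the slab is
 * slab_remove()'d (SLF_DETACHED).  slab_free() pushes the buffer back,
 * decrements sl_refcnt and slab_insert()s the slab again if needed.
 */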
/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> MCLSHIFT)	/* 512 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;
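/*
 * Sketch of the intended lookup (an assumption consistent with NSLABSPMB
 * above): for a cluster index i = MTOCL(addr), the owning group would be
 * slabstbl[i / NSLABSPMB] and the slab itself slg_slab[i % NSLABSPMB].
 */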
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPCL];	/* array of audits */
} mcl_audit_t;
#if CONFIG_MBUF_NOEXPAND
static unsigned int maxmbufcl;
#endif /* CONFIG_MBUF_NOEXPAND */
/*
 * Size of data from the beginning of an mbuf that covers the m_hdr, pkthdr
 * and m_ext structures.  If auditing is enabled, we allocate a shadow
 * mbuf structure of this size inside each audit structure, and the
 * contents of the real mbuf get copied into it when the mbuf is freed.
 * This allows us to pattern-fill the mbuf for integrity check, and to
 * preserve any constructed mbuf fields (e.g. the mbuf + cluster cache case).
 * Note that we don't save the contents of clusters when they are freed;
 * we simply pattern-fill them.
 */
#if defined(__LP64__)
#define	AUDIT_CONTENTS_SIZE	160
#else
#define	AUDIT_CONTENTS_SIZE	80
#endif /* __LP64__ */
/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */
/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */
int nclusters;		/* # of clusters for non-jumbo (legacy) sizes */
int njcl;		/* # of clusters for jumbo sizes */
int njclbytes;		/* size of a jumbo cluster */
union mcluster *mbutl;	/* first mapped cluster address */
union mcluster *embutl;	/* ending virtual address of mclusters */
int max_linkhdr;	/* largest link-level header */
int max_protohdr;	/* largest protocol header */
int max_hdr;		/* largest link+protocol header */
int max_datalen;	/* MHLEN - max_hdr */
/* TODO: should be in a header file */

/* The minimum number of objects that are allocated, to start. */
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MCL_LOWAT	MINCL
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;
#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of sleepers */

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);
/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1
#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m) \
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
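/*
 * Example: a cached composite mbuf carries EXTF_COMPOSITE with a zero
 * reference count, so MBUF_IS_COMPOSITE() distinguishes it from an in-use
 * external mbuf (nonzero MEXT_REF) when deciding where to return it on free.
 */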
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) { \
	if ((m)->m_type != MT_FREE) { \
		if (mclaudit == NULL) \
			panic("MCHECK: m_type=%d m=%p", \
			    (u_int16_t)(m)->m_type, m); \
		else \
			mcl_audit_mcheck_panic(m); \
	} \
}

#define	MBUF_IN_MAP(addr) \
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) { \
	if (!MBUF_IN_MAP(addr)) \
		panic("MRANGE: address out of range 0x%p", addr); \
}
/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain cluster index and base cluster address.
 */
#define	MTOCL(x)	(((char *)(x) - (char *)mbutl) >> MCLSHIFT)
#define	CLTOM(x)	((union mcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to the cluster base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> 8)
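/*
 * Note: the shift by 8 assumes MSIZE == 256; e.g. an mbuf that begins
 * 512 bytes past its cluster base yields MCLIDX() == 2.
 */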
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT(m, pkthdr, type) { \
	(m)->m_next = (m)->m_nextpkt = NULL; \
	(m)->m_type = type; \
	if ((pkthdr) == 0) { \
		(m)->m_data = (m)->m_dat; \
		(m)->m_flags = 0; \
	} else { \
		(m)->m_data = (m)->m_pktdat; \
		(m)->m_flags = M_PKTHDR; \
		(m)->m_pkthdr.rcvif = NULL; \
		(m)->m_pkthdr.len = 0; \
		(m)->m_pkthdr.header = NULL; \
		(m)->m_pkthdr.csum_flags = 0; \
		(m)->m_pkthdr.csum_data = 0; \
		(m)->m_pkthdr.reserved0 = NULL; \
		(m)->m_pkthdr.vlan_tag = 0; \
		(m)->m_pkthdr.socket_id = 0; \
	} \
}
#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
	(m)->m_data = (m)->m_ext.ext_buf = (buf); \
	(m)->m_flags |= M_EXT; \
	(m)->m_ext.ext_size = (size); \
	(m)->m_ext.ext_free = (free); \
	(m)->m_ext.ext_arg = (arg); \
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
	    &(m)->m_ext.ext_refs; \
	MEXT_RFA(m) = (rfa); \
	MEXT_REF(m) = (ref); \
	MEXT_FLAGS(m) = (flag); \
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag) \
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
/*
 * Macro to convert BSD malloc sleep flag to mcache's.
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
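/*
 * Example: MSLEEPF(M_WAIT) evaluates to MCR_SLEEP while MSLEEPF(M_DONTWAIT)
 * evaluates to MCR_NOSLEEP, so a BSD caller's blocking preference carries
 * through to the mcache layer unchanged.
 */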
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to the mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
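/*
 * MB_STAT_SIZE(n) is effectively offsetof(mb_stat_t, mbs_class[n]), i.e.
 * the fixed header plus n per-class records; mb_stat_sysctl() below uses
 * MB_STAT_SIZE(NELEM(mbuf_table)) to size the exported snapshot.
 */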
/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to the outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During a sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
/* This should be in a header file */
#define	atomic_add_32(a, n)	((void) OSAddAtomic(n, (volatile SInt32 *)a))

#define	mtype_stat_add(type, n) { \
	if ((unsigned)(type) < MT_MAX) { \
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
		atomic_add_32(&mbs->cpu_mtypes[type], n); \
	} else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) { \
		atomic_add_32(&mbstat.m_mtypes[type], n); \
	} \
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
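/*
 * Example: mtype_stat_inc(MT_DATA) bumps only this CPU's
 * cpu_mtypes[MT_DATA] slot, while a type >= MT_MAX (up to 255) falls
 * through to an atomic update of mbstat.m_mtypes[] directly.
 */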
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int m, n;
	mtypes_cpu_t mtc;

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mcache_t *cp;
	mcache_cpu_t *ccp;
	mb_class_stat_t *sp;
	int k, m, bktsize;

	lck_mtx_lock(mbuf_mlock);
	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache and mbufs */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table))));
}
static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
	} while (!OSCompareAndSwap(old, new, addr));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
static void
mbuf_table_init(void)
{
	int m;

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where the jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters is going to be split in 2 to hold both the 2K
	 * and the 4K pools, so make sure each half is even.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);

	/*
	 * Each jumbo cluster takes 8 2K clusters, so make
	 * sure that the pool size is evenly divisible by 8.
	 */
	njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

#if CONFIG_MBUF_NOEXPAND
	/* Only use 4k clusters if we're setting aside more than 256k */
	if (nmbclusters <= 128) {
		maxmbufcl = nmbclusters / 4;
	} else {
		/* Half to big clusters, half to small */
		maxmbufcl = (nmbclusters / 4) * 3;
	}
#endif /* CONFIG_MBUF_NOEXPAND */

	/*
	 * 1/2 of the map is reserved for 2K clusters.  Out of this, 1/16th
	 * of the total number of 2K clusters allocated is reserved and cannot
	 * be turned into mbufs.  It can only be used for pure cluster objects.
	 */
	m_minlimit(MC_CL) = (nclusters >> 5);
	m_maxlimit(MC_CL) = (nclusters >> 1);
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * The remaining (15/16th) can be turned into mbufs.
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * The other 1/2 of the map is reserved for 4K clusters.
	 */
	m_minlimit(MC_BIGCL) = 0;
	m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = NBPG;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> 3);
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	int initmcl = MINCL;
	int mcl_pages;
	void *buf;

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);

	/* Allocate cluster slabs table */
	maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/* Allocate audit structures if needed */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_AUDIT) {
		MALLOC(mclaudit, mcl_audit_t *,
		    nmbclusters * sizeof (*mclaudit), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = nmbclusters/(NBPG/CLBYTES);
	MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (int));

	embutl = (union mcluster *)
	    ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));

	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));

	lck_mtx_lock(mbuf_mlock);

	if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
		panic("mbinit: m_clalloc failed\n");

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread(kernel_task, mbuf_worker_thread_init);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc;
		u_int32_t flags = mbuf_debug;

		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, mbuf_slab_notify,
		    (void *)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	printf("mbinit: done\n");
}
/*
 * Obtain a slab of object(s) from the class's freelist.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in hoping that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunks (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if (class == MC_MBUF && (wait & MCR_COMP))
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	else
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return (NULL);
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));

	if (class == MC_MBUF) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
	} else {
		sp->sl_head = NULL;
	}

	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	/* Increment slab reference */
	sp->sl_refcnt++;

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF)
			mca->mca_uflags |= MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mcl_slab_t *nsp = sp->sl_next;
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * Increment 2nd slab.  A 4K big cluster takes
		 * 2 slabs, each having at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
		/* Next slab must already be present */
		VERIFY(nsp != NULL);
		nsp->sl_refcnt++;
		VERIFY(!slab_is_detached(nsp));
		VERIFY(nsp->sl_class == MC_BIGCL &&
		    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
		    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
		    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
		    nsp->sl_head == NULL);
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-8th slab.  A 16K big cluster takes
		 * 8 cluster slabs, each having at most 1 reference.
		 */
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		ASSERT(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL)
			_MCHECK((struct mbuf *)buf);
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 2K cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPCL at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 &&
		    (unsigned short)sp->sl_refcnt <= NMBPCL &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
		    sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
		slab_remove(sp, class);
	}

	return (buf);
}
/*
 * Place a slab of object(s) back into a class's slab list.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);
	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	if (class == MC_CL || class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A 2K cluster slab can have at most 1 reference
		 * which must be 0 at this point.
		 */
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		if (class == MC_BIGCL) {
			mcl_slab_t *nsp = sp->sl_next;
			VERIFY(IS_P2ALIGNED(buf, NBPG));
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			/* Decrement 2nd slab reference */
			nsp->sl_refcnt--;
			/*
			 * A 4K big cluster takes 2 slabs, both
			 * must now have 0 reference.
			 */
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_BIGCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16K cluster takes 8 cluster slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, NBPG));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * An mbuf slab has a total of NMBPCL reference counts.
		 * Since we have decremented the reference above, it
		 * must now be between 0 and NMBPCL-1.
		 */
		VERIFY(sp->sl_refcnt >= 0 &&
		    (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
		mca->mca_uflags &= ~MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/* All mbufs are freed; return the cluster that we stole earlier */
	if (sp->sl_refcnt == 0 && class == MC_MBUF) {
		int i = NMBPCL;

		m_total(MC_MBUF) -= NMBPCL;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPCL;
		mtype_stat_add(MT_FREE, -NMBPCL);

		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		VERIFY(sp->sl_head == NULL);

		/* Remove the slab from the mbuf class's slab list */
		slab_remove(sp, class);

		/* Reinitialize it as a 2K cluster slab */
		slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
		    sp->sl_len, 0, 1);

		if (mclaudit != NULL)
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_head, m_maxsize(MC_CL));

		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);

		VERIFY(slab_is_detached(sp));
		/* And finally switch class */
		class = MC_CL;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp))
		slab_insert(sp, class);
}
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
	ASSERT(need > 0);

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < m_total(class) >> 5) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0)
				continue;

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait))
				break;

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait))
				break;

			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return (num - need);
}
/*
 * Common de-allocator for rudimentary objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_obj_t *nlist;
	unsigned int num = 0;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;
		slab_free(class, list);
		++num;
		if ((list = nlist) == NULL)
			break;
	}
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for rudimentary objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		mcache_buffer_log(mca, list, m_cache(class));
		if (alloc)
			mca->mca_uflags |= MB_INUSE;
		else
			mca->mca_uflags &= ~MB_INUSE;
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
/*
 * Common notify routine for all caches.  It is called by mcache when
 * one or more objects get freed.  We use this indication to trigger
 * the wakeup of any sleeping threads so that they can retry their
 * allocation requests.
 */
static void
mbuf_slab_notify(void *arg, u_int32_t reason)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	int w;

	ASSERT(MBUF_CLASS_VALID(class));

	if (reason != MCN_RETRYALLOC)
		return;

	lck_mtx_lock(mbuf_mlock);
	if ((w = mb_waiters) > 0) {
		m_notified(class)++;
		mb_waiters = 0;
	}
	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Obtain object(s) from the composite class's freelist.
 */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(need > 0);
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		m = (struct mbuf *)*list;
		sp = slab_get(m);
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
		VERIFY(clsp->sl_refcnt == 1);
		if (class == MC_MBUF_BIGCL) {
			nsp = clsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			VERIFY(nsp->sl_refcnt == 1);
		} else if (class == MC_MBUF_16KCL) {
			int k;

			for (nsp = clsp, k = 1;
			    k < (M16KCLBYTES / MCLBYTES); k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0)
			break;
	}
	m_infree(class) -= (num - need);

	return (num - need);
}
/*
 * Place object(s) back into a composite class's freelist.
 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	o = tail = list;

	while ((m = ms = (struct mbuf *)o) != NULL) {
		mcache_obj_t *rfa, *nexto = o->obj_next;

		/* Do the mbuf sanity checks */
		if (mclaudit != NULL) {
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
			ms = (struct mbuf *)mca->mca_contents;
		}

		/* Do the cluster sanity checks */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		if (mclaudit != NULL) {
			size_t size;
			if (class == MC_MBUF_CL)
				size = m_maxsize(MC_CL);
			else if (class == MC_MBUF_BIGCL)
				size = m_maxsize(MC_BIGCL);
			else
				size = m_maxsize(MC_16KCL);
			mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
			    (mcache_obj_t *)cl), cl, 0, size);
		}
		VERIFY(ms->m_type == MT_FREE);
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		VERIFY(clsp->sl_refcnt == 1);
		if (class == MC_MBUF_BIGCL) {
			nsp = clsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			VERIFY(nsp->sl_refcnt == 1);
		} else if (class == MC_MBUF_16KCL) {
			int k;

			for (nsp = clsp, k = 1;
			    k < (M16KCLBYTES / MCLBYTES); k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL)
				mcl_audit_restore_mbuf(m, mca, TRUE);

			rfa = (mcache_obj_t *)MEXT_RFA(m);
			rfa->obj_next = ref_list;
			ref_list = rfa;
			MEXT_RFA(m) = NULL;

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Save mbuf fields and make auditing happy */
			if (mclaudit != NULL)
				mcl_audit_mbuf(mca, o, FALSE, FALSE);

			VERIFY(m_total(class) > 0);
			m_total(class)--;

			/* Free the mbuf */
			o->obj_next = NULL;
			slab_free(MC_MBUF, o);

			/* And free the cluster */
			((mcache_obj_t *)cl)->obj_next = NULL;
			if (class == MC_MBUF_CL)
				slab_free(MC_CL, cl);
			else if (class == MC_MBUF_BIGCL)
				slab_free(MC_BIGCL, cl);
			else
				slab_free(MC_16KCL, cl);
		}

		++num;
		tail = o;
		o = nexto;
	}

	if (!purged) {
		tail->obj_next = m_cobjlist(class);
		m_cobjlist(class) = list;
		m_infree(class) += num;
	} else if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	return (num);
}
1779 * Common allocator for composite objects called by the CPU cache layer
1780 * during an allocation request whenever there is no available element in
1781 * the bucket layer. It returns one or more composite elements from the
1782 * appropriate global freelist. If the freelist is empty, it will attempt
1783 * to obtain the rudimentary objects from their caches and construct them
1784 * into composite mbuf + cluster objects.
1787 mbuf_cslab_alloc(void *arg
, mcache_obj_t
***plist
, unsigned int needed
,
1790 mbuf_class_t
class = (mbuf_class_t
)arg
;
1791 mcache_t
*cp
= NULL
;
1792 unsigned int num
= 0, cnum
= 0, want
= needed
;
1793 mcache_obj_t
*ref_list
= NULL
;
1794 mcache_obj_t
*mp_list
= NULL
;
1795 mcache_obj_t
*clp_list
= NULL
;
1796 mcache_obj_t
**list
;
1797 struct ext_ref
*rfa
;
1801 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1804 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
1806 /* There should not be any slab for this class */
1807 VERIFY(m_slab_cnt(class) == 0 &&
1808 m_slablist(class).tqh_first
== NULL
&&
1809 m_slablist(class).tqh_last
== NULL
);
1811 lck_mtx_lock(mbuf_mlock
);
1813 /* Try using the freelist first */
1814 num
= cslab_alloc(class, plist
, needed
);
1816 if (num
== needed
) {
1817 m_alloc_cnt(class) += num
;
1818 lck_mtx_unlock(mbuf_mlock
);
1822 lck_mtx_unlock(mbuf_mlock
);
	/*
	 * We could not satisfy the request using the freelist alone;
	 * allocate from the appropriate rudimentary caches and use
	 * whatever we can get to construct the composite objects.
	 */

	/*
	 * Mark these allocation requests as coming from a composite cache.
	 * Also, if the caller is willing to be blocked, mark the request
	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
	 * slab layer waiting for the individual object when one or more
	 * of the already-constructed composite objects are available.
	 */
1839 if (!(wait
& MCR_NOSLEEP
))
1842 needed
= mcache_alloc_ext(m_cache(MC_MBUF
), &mp_list
, needed
, wait
);
1844 ASSERT(mp_list
== NULL
);
1847 if (class == MC_MBUF_CL
)
1848 cp
= m_cache(MC_CL
);
1849 else if (class == MC_MBUF_BIGCL
)
1850 cp
= m_cache(MC_BIGCL
);
1852 cp
= m_cache(MC_16KCL
);
1853 needed
= mcache_alloc_ext(cp
, &clp_list
, needed
, wait
);
1855 ASSERT(clp_list
== NULL
);
1858 needed
= mcache_alloc_ext(ref_cache
, &ref_list
, needed
, wait
);
1860 ASSERT(ref_list
== NULL
);
	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any leftovers
	 * will get freed accordingly before we return to the caller.
	 */
1868 for (cnum
= 0; cnum
< needed
; cnum
++) {
1871 m
= ms
= (struct mbuf
*)mp_list
;
1872 mp_list
= mp_list
->obj_next
;
1875 clp_list
= clp_list
->obj_next
;
1876 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
1878 rfa
= (struct ext_ref
*)ref_list
;
1879 ref_list
= ref_list
->obj_next
;
1880 ((mcache_obj_t
*)rfa
)->obj_next
= NULL
;
		/*
		 * If auditing is enabled, construct the shadow mbuf
		 * in the audit structure instead of in the actual one.
		 * mbuf_cslab_audit() will take care of restoring the
		 * contents after the integrity check.
		 */
1888 if (mclaudit
!= NULL
) {
1889 mcache_audit_t
*mca
, *cl_mca
;
1892 lck_mtx_lock(mbuf_mlock
);
1893 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
1894 ms
= ((struct mbuf
*)mca
->mca_contents
);
1895 cl_mca
= mcl_audit_buf2mca(MC_CL
, (mcache_obj_t
*)cl
);
			/*
			 * Pair them up.  Note that this is done at the time
			 * the mbuf+cluster objects are constructed.  This
			 * information should be treated as a "best effort"
			 * debugging hint, since more than one mbuf can refer
			 * to a cluster.  In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
1905 mca
->mca_uptr
= cl_mca
;
1906 cl_mca
->mca_uptr
= mca
;
1908 ASSERT(mca
->mca_uflags
& MB_SCVALID
);
1909 ASSERT(!(cl_mca
->mca_uflags
& MB_SCVALID
));
1910 lck_mtx_unlock(mbuf_mlock
);
1912 /* Technically, they are in the freelist */
1913 mcache_set_pattern(MCACHE_FREE_PATTERN
, m
,
1914 m_maxsize(MC_MBUF
));
1915 if (class == MC_MBUF_CL
)
1916 size
= m_maxsize(MC_CL
);
1917 else if (class == MC_MBUF_BIGCL
)
1918 size
= m_maxsize(MC_BIGCL
);
1920 size
= m_maxsize(MC_16KCL
);
1921 mcache_set_pattern(MCACHE_FREE_PATTERN
, cl
, size
);
1924 MBUF_INIT(ms
, 0, MT_FREE
);
1925 if (class == MC_MBUF_16KCL
) {
1926 MBUF_16KCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
1927 } else if (class == MC_MBUF_BIGCL
) {
1928 MBUF_BIGCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
1930 MBUF_CL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
1932 VERIFY(ms
->m_flags
== M_EXT
);
1933 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
1935 *list
= (mcache_obj_t
*)m
;
1936 (*list
)->obj_next
= NULL
;
1937 list
= *plist
= &(*list
)->obj_next
;
1942 * Free up what's left of the above.
1944 if (mp_list
!= NULL
)
1945 mcache_free_ext(m_cache(MC_MBUF
), mp_list
);
1946 if (clp_list
!= NULL
)
1947 mcache_free_ext(cp
, clp_list
);
1948 if (ref_list
!= NULL
)
1949 mcache_free_ext(ref_cache
, ref_list
);
1951 lck_mtx_lock(mbuf_mlock
);
1952 if (num
> 0 || cnum
> 0) {
1953 m_total(class) += cnum
;
1954 VERIFY(m_total(class) <= m_maxlimit(class));
1955 m_alloc_cnt(class) += num
+ cnum
;
1957 if ((num
+ cnum
) < want
)
1958 m_fail_cnt(class) += (want
- (num
+ cnum
));
1959 lck_mtx_unlock(mbuf_mlock
);
1961 return (num
+ cnum
);
/*
 * Common de-allocator for composite objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int num;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	num = cslab_free(class, list, purged);
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for composite objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
2001 mbuf_cslab_audit(void *arg
, mcache_obj_t
*list
, boolean_t alloc
)
2003 mbuf_class_t
class = (mbuf_class_t
)arg
;
2004 mcache_audit_t
*mca
;
2005 struct mbuf
*m
, *ms
;
2006 mcl_slab_t
*clsp
, *nsp
;
2010 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2012 while ((m
= ms
= (struct mbuf
*)list
) != NULL
) {
2013 lck_mtx_lock(mbuf_mlock
);
2014 /* Do the mbuf sanity checks and record its transaction */
2015 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2016 mcl_audit_mbuf(mca
, m
, TRUE
, alloc
);
2017 mcache_buffer_log(mca
, m
, m_cache(class));
2019 mca
->mca_uflags
|= MB_COMP_INUSE
;
2021 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
		/*
		 * Use the shadow mbuf in the audit structure if we are
		 * freeing, since the contents of the actual mbuf have been
		 * pattern-filled by the above call to mcl_audit_mbuf().
		 */
2029 ms
= (struct mbuf
*)mca
->mca_contents
;
2031 /* Do the cluster sanity checks and record its transaction */
2032 cl
= ms
->m_ext
.ext_buf
;
2033 clsp
= slab_get(cl
);
2034 VERIFY(ms
->m_flags
== M_EXT
&& cl
!= NULL
);
2035 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2036 VERIFY(clsp
->sl_refcnt
== 1);
2037 if (class == MC_MBUF_BIGCL
) {
2038 nsp
= clsp
->sl_next
;
2039 /* Next slab must already be present */
2040 VERIFY(nsp
!= NULL
);
2041 VERIFY(nsp
->sl_refcnt
== 1);
2042 } else if (class == MC_MBUF_16KCL
) {
2044 for (nsp
= clsp
, k
= 1;
2045 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2047 /* Next slab must already be present */
2048 VERIFY(nsp
!= NULL
);
2049 VERIFY(nsp
->sl_refcnt
== 1);
2053 mca
= mcl_audit_buf2mca(MC_CL
, cl
);
2054 if (class == MC_MBUF_CL
)
2055 size
= m_maxsize(MC_CL
);
2056 else if (class == MC_MBUF_BIGCL
)
2057 size
= m_maxsize(MC_BIGCL
);
2059 size
= m_maxsize(MC_16KCL
);
2060 mcl_audit_cluster(mca
, cl
, size
, alloc
, FALSE
);
2061 mcache_buffer_log(mca
, cl
, m_cache(class));
2063 mca
->mca_uflags
|= MB_COMP_INUSE
;
2065 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2066 lck_mtx_unlock(mbuf_mlock
);
2068 list
= list
->obj_next
;
2073 * Allocate some number of mbuf clusters and place on cluster freelist.
2076 m_clalloc(const u_int32_t num
, const int wait
, const u_int32_t bufsize
)
2081 vm_offset_t page
= 0;
2082 mcache_audit_t
*mca_list
= NULL
;
2083 mcache_obj_t
*con_list
= NULL
;
2086 VERIFY(bufsize
== m_maxsize(MC_CL
) ||
2087 bufsize
== m_maxsize(MC_BIGCL
) || bufsize
== m_maxsize(MC_16KCL
));
2089 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
	/*
	 * Multiple threads may attempt to populate the cluster map one
	 * after another.  Since we drop the lock below prior to acquiring
	 * the physical page(s), our view of the cluster map may no longer
	 * be accurate, and we could end up over-committing the pages beyond
	 * the maximum allowed for each class.  To prevent it, this entire
	 * operation (including the page mapping) is serialized.
	 */
2099 while (mb_clalloc_busy
) {
2100 mb_clalloc_waiters
++;
2101 (void) msleep(mb_clalloc_waitchan
, mbuf_mlock
,
2102 (PZERO
-1), "m_clalloc", NULL
);
2103 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2106 /* We are busy now; tell everyone else to go away */
2107 mb_clalloc_busy
= TRUE
;
	/*
	 * Honor the caller's wish to block or not block.  We have a way
	 * to grow the pool asynchronously using the mbuf worker thread.
	 */
2113 i
= m_howmany(num
, bufsize
);
2114 if (i
== 0 || (wait
& M_DONTWAIT
))
2117 lck_mtx_unlock(mbuf_mlock
);
2119 size
= round_page_32(i
* bufsize
);
2120 page
= kmem_mb_alloc(mb_map
, size
);
2123 if (bufsize
<= m_maxsize(MC_BIGCL
)) {
2124 /* Try for 1 page if failed, only for 2KB/4KB request */
2126 page
= kmem_mb_alloc(mb_map
, size
);
2130 lck_mtx_lock(mbuf_mlock
);
2135 VERIFY(IS_P2ALIGNED(page
, NBPG
));
2136 numpages
= size
/ NBPG
;
2138 /* If auditing is enabled, allocate the audit structures now */
2139 if (mclaudit
!= NULL
) {
		/*
		 * Yes, I realize this is a waste of memory for clusters
		 * that never get transformed into mbufs, as we may end
		 * up with NMBPCL-1 unused audit structures per cluster.
		 * But doing so tremendously simplifies the allocation
		 * strategy, since at this point we are not holding the
		 * mbuf lock and the caller is okay to be blocked.  For
		 * the case of big clusters, we allocate one structure
		 * for each as we never turn them into mbufs.
		 */
2152 if (bufsize
== m_maxsize(MC_CL
)) {
2153 needed
= numpages
* 2 * NMBPCL
;
2155 i
= mcache_alloc_ext(mcl_audit_con_cache
,
2156 &con_list
, needed
, MCR_SLEEP
);
2158 VERIFY(con_list
!= NULL
&& i
== needed
);
2159 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2162 needed
= numpages
/ (M16KCLBYTES
/ NBPG
);
2165 i
= mcache_alloc_ext(mcache_audit_cache
,
2166 (mcache_obj_t
**)&mca_list
, needed
, MCR_SLEEP
);
2168 VERIFY(mca_list
!= NULL
&& i
== needed
);
2171 lck_mtx_lock(mbuf_mlock
);
2173 for (i
= 0; i
< numpages
; i
++, page
+= NBPG
) {
2174 ppnum_t offset
= ((char *)page
- (char *)mbutl
) / NBPG
;
2175 ppnum_t new_page
= pmap_find_phys(kernel_pmap
,
2176 (vm_address_t
)page
);
		/*
		 * If no mapper is available, the following call is a no-op
		 * and returns the input page; if there is a mapper, the
		 * appropriate I/O page is returned instead.
		 */
2183 new_page
= IOMapperInsertPage(mcl_paddr_base
, offset
, new_page
);
2184 mcl_paddr
[offset
] = new_page
<< PGSHIFT
;
2186 /* Pattern-fill this fresh page */
2187 if (mclaudit
!= NULL
)
2188 mcache_set_pattern(MCACHE_FREE_PATTERN
,
2189 (caddr_t
)page
, NBPG
);
2191 if (bufsize
== m_maxsize(MC_CL
)) {
2192 union mcluster
*mcl
= (union mcluster
*)page
;
2194 /* 1st cluster in the page */
2196 if (mclaudit
!= NULL
)
2197 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2198 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2200 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2201 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2202 mcl
, mcl
, bufsize
, 0, 1);
2204 /* Insert this slab */
2205 slab_insert(sp
, MC_CL
);
2207 /* Update stats now since slab_get() drops the lock */
2208 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2209 m_infree(MC_MBUF_CL
);
2210 mbstat
.m_clusters
= ++m_total(MC_CL
);
2211 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2213 /* 2nd cluster in the page */
2214 sp
= slab_get(++mcl
);
2215 if (mclaudit
!= NULL
)
2216 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2217 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2219 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2220 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2221 mcl
, mcl
, bufsize
, 0, 1);
2223 /* Insert this slab */
2224 slab_insert(sp
, MC_CL
);
2226 /* Update stats now since slab_get() drops the lock */
2227 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2228 m_infree(MC_MBUF_CL
);
2229 mbstat
.m_clusters
= ++m_total(MC_CL
);
2230 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2231 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2232 union mbigcluster
*mbc
= (union mbigcluster
*)page
;
2235 /* One for the entire page */
2237 if (mclaudit
!= NULL
)
2238 mcl_audit_init(mbc
, &mca_list
, NULL
, 0, 1);
2240 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2241 slab_init(sp
, MC_BIGCL
, SLF_MAPPED
,
2242 mbc
, mbc
, bufsize
, 0, 1);
2244 /* 2nd cluster's slab is part of the previous one */
2245 nsp
= slab_get(((union mcluster
*)page
) + 1);
2246 slab_init(nsp
, MC_BIGCL
, SLF_MAPPED
| SLF_PARTIAL
,
2247 mbc
, NULL
, 0, 0, 0);
2249 /* Insert this slab */
2250 slab_insert(sp
, MC_BIGCL
);
2252 /* Update stats now since slab_get() drops the lock */
2253 mbstat
.m_bigclfree
= ++m_infree(MC_BIGCL
) +
2254 m_infree(MC_MBUF_BIGCL
);
2255 mbstat
.m_bigclusters
= ++m_total(MC_BIGCL
);
2256 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2257 } else if ((i
% (M16KCLBYTES
/ NBPG
)) == 0) {
2258 union m16kcluster
*m16kcl
= (union m16kcluster
*)page
;
2263 /* One for the entire 16KB */
2264 sp
= slab_get(m16kcl
);
2265 if (mclaudit
!= NULL
)
2266 mcl_audit_init(m16kcl
, &mca_list
, NULL
, 0, 1);
2268 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2269 slab_init(sp
, MC_16KCL
, SLF_MAPPED
,
2270 m16kcl
, m16kcl
, bufsize
, 0, 1);
2272 /* 2nd-8th cluster's slab is part of the first one */
2273 for (k
= 1; k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2274 nsp
= slab_get(((union mcluster
*)page
) + k
);
2275 VERIFY(nsp
->sl_refcnt
== 0 &&
2276 nsp
->sl_flags
== 0);
2277 slab_init(nsp
, MC_16KCL
,
2278 SLF_MAPPED
| SLF_PARTIAL
,
2279 m16kcl
, NULL
, 0, 0, 0);
2282 /* Insert this slab */
2283 slab_insert(sp
, MC_16KCL
);
2285 /* Update stats now since slab_get() drops the lock */
2286 m_infree(MC_16KCL
)++;
2287 m_total(MC_16KCL
)++;
2288 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2291 VERIFY(mca_list
== NULL
&& con_list
== NULL
);
2293 /* We're done; let others enter */
2294 mb_clalloc_busy
= FALSE
;
2295 if (mb_clalloc_waiters
> 0) {
2296 mb_clalloc_waiters
= 0;
2297 wakeup(mb_clalloc_waitchan
);
2300 if (bufsize
== m_maxsize(MC_CL
))
2301 return (numpages
<< 1);
2302 else if (bufsize
== m_maxsize(MC_BIGCL
))
2305 VERIFY(bufsize
== m_maxsize(MC_16KCL
));
2306 return (numpages
/ (M16KCLBYTES
/ NBPG
));
2309 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2311 /* We're done; let others enter */
2312 mb_clalloc_busy
= FALSE
;
2313 if (mb_clalloc_waiters
> 0) {
2314 mb_clalloc_waiters
= 0;
2315 wakeup(mb_clalloc_waitchan
);
2319 * When non-blocking we kick a thread if we have to grow the
2320 * pool or if the number of free clusters is less than requested.
2322 if (bufsize
== m_maxsize(MC_CL
)) {
2325 * Remember total number of clusters needed
2328 i
+= m_total(MC_CL
);
2329 if (i
> mbuf_expand_mcl
) {
2330 mbuf_expand_mcl
= i
;
2331 if (mbuf_worker_ready
)
2332 wakeup((caddr_t
)&mbuf_worker_run
);
2336 if (m_infree(MC_CL
) >= num
)
2338 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2341 * Remember total number of 4KB clusters needed
2344 i
+= m_total(MC_BIGCL
);
2345 if (i
> mbuf_expand_big
) {
2346 mbuf_expand_big
= i
;
2347 if (mbuf_worker_ready
)
2348 wakeup((caddr_t
)&mbuf_worker_run
);
2352 if (m_infree(MC_BIGCL
) >= num
)
2357 * Remember total number of 16KB clusters needed
2360 i
+= m_total(MC_16KCL
);
2361 if (i
> mbuf_expand_16k
) {
2362 mbuf_expand_16k
= i
;
2363 if (mbuf_worker_ready
)
2364 wakeup((caddr_t
)&mbuf_worker_run
);
2368 if (m_infree(MC_16KCL
) >= num
)
2375 * Populate the global freelist of the corresponding buffer class.
2378 freelist_populate(mbuf_class_t
class, unsigned int num
, int wait
)
2380 mcache_obj_t
*o
= NULL
;
2383 VERIFY(class == MC_MBUF
|| class == MC_CL
|| class == MC_BIGCL
||
2386 #if CONFIG_MBUF_NOEXPAND
2387 if ((mbstat
.m_mbufs
/ NMBPCL
) >= maxmbufcl
) {
2389 static int printonce
= 1;
2390 if (printonce
== 1) {
2392 printf("m_expand failed, allocated %ld out of %d "
2393 "clusters\n", mbstat
.m_mbufs
/ NMBPCL
,
2399 #endif /* CONFIG_MBUF_NOEXPAND */
2401 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2406 i
= m_clalloc(num
, wait
, m_maxsize(MC_CL
));
2408 /* Respect the 2K clusters minimum limit */
2409 if (m_total(MC_CL
) == m_maxlimit(MC_CL
) &&
2410 m_infree(MC_CL
) <= m_minlimit(MC_CL
)) {
2411 if (class != MC_CL
|| (wait
& MCR_COMP
))
2420 return (m_clalloc(num
, wait
, m_maxsize(class)) != 0);
2428 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2429 if ((o
= slab_alloc(MC_CL
, wait
)) != NULL
) {
2430 struct mbuf
*m
= (struct mbuf
*)o
;
2431 mcache_audit_t
*mca
= NULL
;
2432 mcl_slab_t
*sp
= slab_get(o
);
2434 VERIFY(slab_is_detached(sp
) &&
2435 (sp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) == SLF_MAPPED
);
2437 /* Make sure that the cluster is unmolested while in freelist */
2438 if (mclaudit
!= NULL
) {
2439 mca
= mcl_audit_buf2mca(MC_CL
, o
);
2440 mcache_audit_free_verify(mca
, o
, 0, m_maxsize(MC_CL
));
2443 /* Reinitialize it as an mbuf slab */
2444 slab_init(sp
, MC_MBUF
, sp
->sl_flags
, sp
->sl_base
, NULL
,
2445 sp
->sl_len
, 0, NMBPCL
);
2447 VERIFY(m
== (struct mbuf
*)sp
->sl_base
);
2448 VERIFY(sp
->sl_head
== NULL
);
2450 m_total(MC_MBUF
) += NMBPCL
;
2451 mbstat
.m_mbufs
= m_total(MC_MBUF
);
2452 m_infree(MC_MBUF
) += NMBPCL
;
2453 mtype_stat_add(MT_FREE
, NMBPCL
);
2458 * If auditing is enabled, construct the shadow mbuf
2459 * in the audit structure instead of the actual one.
2460 * mbuf_slab_audit() will take care of restoring the
2461 * contents after the integrity check.
2463 if (mclaudit
!= NULL
) {
2465 mca
= mcl_audit_buf2mca(MC_MBUF
,
2467 ms
= ((struct mbuf
*)mca
->mca_contents
);
2468 ms
->m_type
= MT_FREE
;
2470 m
->m_type
= MT_FREE
;
2472 m
->m_next
= sp
->sl_head
;
2473 sp
->sl_head
= (void *)m
++;
2476 /* Insert it into the mbuf class's slab list */
2477 slab_insert(sp
, MC_MBUF
);
2479 if ((i
= mb_waiters
) > 0)
2482 wakeup(mb_waitchan
);
/*
 * (Inaccurately) check if it might be worth a trip back to the
 * mcache layer due to the availability of objects there.  We'll
 * end up back here if there's nothing up there.
 */
2496 mbuf_cached_above(mbuf_class_t
class, int wait
)
2500 if (wait
& MCR_COMP
)
2501 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)) ||
2502 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2506 if (wait
& MCR_COMP
)
2507 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)));
2511 if (wait
& MCR_COMP
)
2512 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2516 if (wait
& MCR_COMP
)
2517 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL
)));
2530 return (!mcache_bkt_isempty(m_cache(class)));
/*
 * If possible, convert constructed objects to raw ones.
 */
2537 mbuf_steal(mbuf_class_t
class, unsigned int num
)
2539 mcache_obj_t
*top
= NULL
;
2540 mcache_obj_t
**list
= &top
;
2541 unsigned int tot
= 0;
2543 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2555 /* Get the required number of constructed objects if possible */
2556 if (m_infree(class) > m_minlimit(class)) {
2557 tot
= cslab_alloc(class, &list
,
2558 MIN(num
, m_infree(class)));
2561 /* And destroy them to get back the raw objects */
2563 (void) cslab_free(class, top
, 1);
2571 return (tot
== num
);
2575 m_reclaim(mbuf_class_t
class, unsigned int num
, boolean_t comp
)
2579 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2581 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2582 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2583 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
2591 m_wantpurge(MC_CL
)++;
2592 m_wantpurge(MC_MBUF_CL
)++;
2593 m_wantpurge(MC_MBUF_BIGCL
)++;
2597 m_wantpurge(MC_MBUF
)++;
2599 m_wantpurge(MC_MBUF_CL
)++;
2604 m_wantpurge(MC_MBUF_BIGCL
)++;
2609 m_wantpurge(MC_MBUF_16KCL
)++;
	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU cache
	 * layer used by the class.  If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
2623 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2624 if (m_wantpurge(m
) > 0) {
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes.  Only
			 * purge and disable the per-CPU cache layer when
			 * we don't have enough; it's the last resort.
			 */
2632 if (!mbuf_steal(m
, num
))
2637 lck_mtx_unlock(mbuf_mlock
);
2640 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2643 /* Sigh; we have no other choices but to ask mcache to purge */
2644 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2645 if ((bmap
& (1 << m
)) &&
2646 mcache_purge_cache(m_cache(m
))) {
2647 lck_mtx_lock(mbuf_mlock
);
2650 lck_mtx_unlock(mbuf_mlock
);
	/*
	 * Request mcache to reap extra elements from all of its caches;
	 * note that all reaps are serialized and happen only at a fixed
	 * interval.
	 */
2661 lck_mtx_lock(mbuf_mlock
);
2664 static inline struct mbuf
*
2665 m_get_common(int wait
, short type
, int hdr
)
2668 int mcflags
= MSLEEPF(wait
);
2670 /* Is this due to a non-blocking retry? If so, then try harder */
2671 if (mcflags
& MCR_NOSLEEP
)
2672 mcflags
|= MCR_TRYHARD
;
2674 m
= mcache_alloc(m_cache(MC_MBUF
), mcflags
);
2676 MBUF_INIT(m
, hdr
, type
);
2677 mtype_stat_inc(type
);
2678 mtype_stat_dec(MT_FREE
);
2680 if (hdr
&& mac_init_mbuf(m
, wait
) != 0) {
2684 #endif /* MAC_NET */
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 */
#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
#define	_M_RETRY(wait, type)	_M_GET(wait, type)
#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
struct mbuf *
m_get(int wait, int type)
{
	return (_M_GET(wait, type));
}

struct mbuf *
m_gethdr(int wait, int type)
{
	return (_M_GETHDR(wait, type));
}

struct mbuf *
m_retry(int wait, int type)
{
	return (_M_RETRY(wait, type));
}

struct mbuf *
m_retryhdr(int wait, int type)
{
	return (_M_RETRYHDR(wait, type));
}

struct mbuf *
m_getclr(int wait, int type)
{
	struct mbuf *m;

	_MGET(m, wait, type);
	if (m != NULL)
		bzero(MTOD(m, caddr_t), MLEN);
	return (m);
}
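/*
 * Illustrative sketch only, not part of the original sources: a minimal
 * caller of the wrappers above.  The example_ function name is
 * hypothetical and the block is kept out of the build.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_build_packet(int how)
{
	struct mbuf *m;

	/* Grab an mbuf with a packet header; how is M_WAIT or M_DONTWAIT */
	if ((m = m_gethdr(how, MT_DATA)) == NULL)
		return (NULL);

	/* Caller fills in data; keep m_len and m_pkthdr.len consistent */
	m->m_len = 0;
	m->m_pkthdr.len = 0;

	return (m);
}
#endif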
2736 m_free(struct mbuf
*m
)
2738 struct mbuf
*n
= m
->m_next
;
2740 if (m
->m_type
== MT_FREE
)
2741 panic("m_free: freeing an already freed mbuf");
2743 /* Free the aux data and tags if there is any */
2744 if (m
->m_flags
& M_PKTHDR
) {
2745 m_tag_delete_chain(m
, NULL
);
2748 if (m
->m_flags
& M_EXT
) {
2752 refcnt
= m_decref(m
);
2753 flags
= MEXT_FLAGS(m
);
2754 if (refcnt
== 0 && flags
== 0) {
2755 if (m
->m_ext
.ext_free
== NULL
) {
2756 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2757 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2758 mcache_free(m_cache(MC_BIGCL
),
2760 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2761 mcache_free(m_cache(MC_16KCL
),
2764 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2765 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2767 mcache_free(ref_cache
, MEXT_RFA(m
));
2769 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2770 VERIFY(m
->m_type
!= MT_FREE
);
2772 mtype_stat_dec(m
->m_type
);
2773 mtype_stat_inc(MT_FREE
);
2775 m
->m_type
= MT_FREE
;
2778 m
->m_next
= m
->m_nextpkt
= NULL
;
2780 /* "Free" into the intermediate cache */
2781 if (m
->m_ext
.ext_free
== NULL
) {
2782 mcache_free(m_cache(MC_MBUF_CL
), m
);
2783 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2784 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
2786 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
2787 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
2793 if (m
->m_type
!= MT_FREE
) {
2794 mtype_stat_dec(m
->m_type
);
2795 mtype_stat_inc(MT_FREE
);
2798 m
->m_type
= MT_FREE
;
2799 m
->m_flags
= m
->m_len
= 0;
2800 m
->m_next
= m
->m_nextpkt
= NULL
;
2802 mcache_free(m_cache(MC_MBUF
), m
);
2807 __private_extern__
struct mbuf
*
2808 m_clattach(struct mbuf
*m
, int type
, caddr_t extbuf
,
2809 void (*extfree
)(caddr_t
, u_int
, caddr_t
), u_int extsize
, caddr_t extarg
,
2812 struct ext_ref
*rfa
= NULL
;
2814 if (m
== NULL
&& (m
= _M_GETHDR(wait
, type
)) == NULL
)
2817 if (m
->m_flags
& M_EXT
) {
2821 refcnt
= m_decref(m
);
2822 flags
= MEXT_FLAGS(m
);
2823 if (refcnt
== 0 && flags
== 0) {
2824 if (m
->m_ext
.ext_free
== NULL
) {
2825 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2826 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2827 mcache_free(m_cache(MC_BIGCL
),
2829 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2830 mcache_free(m_cache(MC_16KCL
),
2833 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2834 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2836 /* Re-use the reference structure */
2838 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2839 VERIFY(m
->m_type
!= MT_FREE
);
2841 mtype_stat_dec(m
->m_type
);
2842 mtype_stat_inc(MT_FREE
);
2844 m
->m_type
= MT_FREE
;
2847 m
->m_next
= m
->m_nextpkt
= NULL
;
2848 /* "Free" into the intermediate cache */
2849 if (m
->m_ext
.ext_free
== NULL
) {
2850 mcache_free(m_cache(MC_MBUF_CL
), m
);
2851 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2852 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
2854 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
2855 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
2858 * Allocate a new mbuf, since we didn't divorce
2859 * the composite mbuf + cluster pair above.
2861 if ((m
= _M_GETHDR(wait
, type
)) == NULL
)
2867 (rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
) {
2872 MEXT_INIT(m
, extbuf
, extsize
, extfree
, extarg
, rfa
, 1, 0);
/* m_mclget() adds an mbuf cluster to a normal mbuf */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_mclalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}

/* Allocate an mbuf cluster */
caddr_t
m_mclalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_CL), mcflags));
}

/* Free an mbuf cluster */
void
m_mclfree(caddr_t p)
{
	mcache_free(m_cache(MC_CL), p);
}
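/*
 * Illustrative sketch only (not from the original sources): the usual
 * allocate-then-attach pattern for a 2KB cluster, mirroring the M_EXT
 * check used elsewhere in this file.  example_get_cluster_mbuf is a
 * hypothetical name; the block is excluded from the build.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_get_cluster_mbuf(int how)
{
	struct mbuf *m;

	if ((m = m_gethdr(how, MT_DATA)) == NULL)
		return (NULL);

	/* Try to attach a 2KB cluster; on failure M_EXT remains clear */
	m = m_mclget(m, how);
	if (!(m->m_flags & M_EXT)) {
		m_freem(m);
		return (NULL);
	}
	return (m);
}
#endif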
/*
 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
 * another mbuf.
 */
int
m_mclhasreference(struct mbuf *m)
{
	if (!(m->m_flags & M_EXT))
		return (0);

	ASSERT(MEXT_RFA(m) != NULL);

	return (MEXT_REF(m) > 1);
}

__private_extern__ caddr_t
m_bigalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
}

__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_BIGCL), p);
}
/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_mbigget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_bigalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}

__private_extern__ caddr_t
m_16kalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
}

__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_16KCL), p);
}
/* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_16kalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
void
m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
{
#if CONFIG_MACF_NET
	/* We will be taking over the tags of 'to' */
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
#endif /* MAC_NET */
	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
	m_tag_init(from);			/* purge tags from src */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
}
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
static int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
#if CONFIG_MACF_NET
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
#endif /* MAC_NET */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
	to->m_pkthdr = from->m_pkthdr;
	m_tag_init(to);
	return (m_tag_copy_chain(to, from, how));
}
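/*
 * Illustrative sketch only, not part of the original sources.  It assumes
 * the m_tag_copy_chain()-style convention that a zero return indicates
 * failure; the example_ name is hypothetical and the block is not built.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_clone_header(struct mbuf *from, int how)
{
	struct mbuf *to;

	if ((to = m_gethdr(how, MT_DATA)) == NULL)
		return (NULL);

	/* Deep-copies the pkthdr, including the tag chain */
	if (m_dup_pkthdr(to, from, how) == 0) {
		/* assumption: zero means the tag copy failed */
		m_freem(to);
		return (NULL);
	}
	return (to);
}
#endif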
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
 * are chained on the m_nextpkt field.  Any packets requested beyond this
 * are chained onto the last packet header's m_next field.  The size of
 * the cluster is controlled by the parameter bufsize.
 */
3046 __private_extern__
struct mbuf
*
3047 m_getpackets_internal(unsigned int *num_needed
, int num_with_pkthdrs
,
3048 int wait
, int wantall
, size_t bufsize
)
3051 struct mbuf
**np
, *top
;
3052 unsigned int pnum
, needed
= *num_needed
;
3053 mcache_obj_t
*mp_list
= NULL
;
3054 int mcflags
= MSLEEPF(wait
);
3056 struct ext_ref
*rfa
;
3060 ASSERT(bufsize
== m_maxsize(MC_CL
) ||
3061 bufsize
== m_maxsize(MC_BIGCL
) ||
3062 bufsize
== m_maxsize(MC_16KCL
));
3065 * Caller must first check for njcl because this
3066 * routine is internal and not exposed/used via KPI.
3068 VERIFY(bufsize
!= m_maxsize(MC_16KCL
) || njcl
> 0);
3075 * The caller doesn't want all the requested buffers; only some.
3076 * Try hard to get what we can, but don't block. This effectively
3077 * overrides MCR_SLEEP, since this thread will not go to sleep
3078 * if we can't get all the buffers.
3080 if (!wantall
|| (mcflags
& MCR_NOSLEEP
))
3081 mcflags
|= MCR_TRYHARD
;
3083 /* Allocate the composite mbuf + cluster elements from the cache */
3084 if (bufsize
== m_maxsize(MC_CL
))
3085 cp
= m_cache(MC_MBUF_CL
);
3086 else if (bufsize
== m_maxsize(MC_BIGCL
))
3087 cp
= m_cache(MC_MBUF_BIGCL
);
3089 cp
= m_cache(MC_MBUF_16KCL
);
3090 needed
= mcache_alloc_ext(cp
, &mp_list
, needed
, mcflags
);
3092 for (pnum
= 0; pnum
< needed
; pnum
++) {
3093 m
= (struct mbuf
*)mp_list
;
3094 mp_list
= mp_list
->obj_next
;
3096 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3097 cl
= m
->m_ext
.ext_buf
;
3100 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3101 VERIFY(MBUF_IS_COMPOSITE(m
));
3103 flag
= MEXT_FLAGS(m
);
3105 MBUF_INIT(m
, num_with_pkthdrs
, MT_DATA
);
3106 if (bufsize
== m_maxsize(MC_16KCL
)) {
3107 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3108 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3109 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3111 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3114 if (num_with_pkthdrs
> 0) {
3117 if (mac_mbuf_label_init(m
, wait
) != 0) {
3121 #endif /* MAC_NET */
3125 if (num_with_pkthdrs
> 0)
3130 ASSERT(pnum
!= *num_needed
|| mp_list
== NULL
);
3131 if (mp_list
!= NULL
)
3132 mcache_free_ext(cp
, mp_list
);
3135 mtype_stat_add(MT_DATA
, pnum
);
3136 mtype_stat_sub(MT_FREE
, pnum
);
3139 if (wantall
&& (pnum
!= *num_needed
)) {
/*
 * Return a list of mbufs linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not null and the
 * value pointed to is not null, this specifies the maximum number of
 * segments for a chain of mbufs.  If maxsegments is zero or the value
 * pointed to is zero, the caller does not have any restriction on the
 * number of segments.  The actual number of segments of an mbuf chain is
 * returned in the value pointed to by maxsegments.
 */
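/*
 * Illustrative sketch only (not from the original sources) of how an
 * in-kernel caller might use this routine; example_alloc_packets is a
 * hypothetical name and the block is excluded from the build.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_alloc_packets(unsigned int npkts, size_t pktlen, int how)
{
	unsigned int num = npkts;
	unsigned int maxseg = 0;	/* 0: no restriction on segments */

	/*
	 * Ask for npkts packets of pktlen bytes each; with wantall set,
	 * the call either returns them all or fails.  On return, maxseg
	 * holds the actual number of segments per chain.
	 */
	return (m_allocpacket_internal(&num, pktlen, &maxseg, how, 1, 0));
}
#endif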
3161 __private_extern__
struct mbuf
*
3162 m_allocpacket_internal(unsigned int *numlist
, size_t packetlen
,
3163 unsigned int *maxsegments
, int wait
, int wantall
, size_t wantsize
)
3165 struct mbuf
**np
, *top
, *first
= NULL
;
3166 size_t bufsize
, r_bufsize
;
3167 unsigned int num
= 0;
3168 unsigned int nsegs
= 0;
3169 unsigned int needed
, resid
;
3170 int mcflags
= MSLEEPF(wait
);
3171 mcache_obj_t
*mp_list
= NULL
, *rmp_list
= NULL
;
3172 mcache_t
*cp
= NULL
, *rcp
= NULL
;
3180 if (wantsize
== 0) {
3181 if (packetlen
<= MINCLSIZE
) {
3182 bufsize
= packetlen
;
3183 } else if (packetlen
> m_maxsize(MC_CL
)) {
3184 /* Use 4KB if jumbo cluster pool isn't available */
3185 if (packetlen
<= m_maxsize(MC_BIGCL
) || njcl
== 0)
3186 bufsize
= m_maxsize(MC_BIGCL
);
3188 bufsize
= m_maxsize(MC_16KCL
);
3190 bufsize
= m_maxsize(MC_CL
);
3192 } else if (wantsize
== m_maxsize(MC_CL
) ||
3193 wantsize
== m_maxsize(MC_BIGCL
) ||
3194 (wantsize
== m_maxsize(MC_16KCL
) && njcl
> 0)) {
3200 if (bufsize
<= MHLEN
) {
3202 } else if (bufsize
<= MINCLSIZE
) {
3203 if (maxsegments
!= NULL
&& *maxsegments
== 1) {
3204 bufsize
= m_maxsize(MC_CL
);
3209 } else if (bufsize
== m_maxsize(MC_16KCL
)) {
3211 nsegs
= ((packetlen
- 1) >> (PGSHIFT
+ 2)) + 1;
3212 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3213 nsegs
= ((packetlen
- 1) >> PGSHIFT
) + 1;
3215 nsegs
= ((packetlen
- 1) >> MCLSHIFT
) + 1;
3217 if (maxsegments
!= NULL
) {
3218 if (*maxsegments
&& nsegs
> *maxsegments
) {
3219 *maxsegments
= nsegs
;
3222 *maxsegments
= nsegs
;
3226 * The caller doesn't want all the requested buffers; only some.
3227 * Try hard to get what we can, but don't block. This effectively
3228 * overrides MCR_SLEEP, since this thread will not go to sleep
3229 * if we can't get all the buffers.
3231 if (!wantall
|| (mcflags
& MCR_NOSLEEP
))
3232 mcflags
|= MCR_TRYHARD
;
3235 * Simple case where all elements in the lists/chains are mbufs.
3236 * Unless bufsize is greater than MHLEN, each segment chain is made
3237 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3238 * of 2 mbufs; the second one is used for the residual data, i.e.
3239 * the remaining data that cannot fit into the first mbuf.
3241 if (bufsize
<= MINCLSIZE
) {
3242 /* Allocate the elements in one shot from the mbuf cache */
3243 ASSERT(bufsize
<= MHLEN
|| nsegs
== 2);
3244 cp
= m_cache(MC_MBUF
);
3245 needed
= mcache_alloc_ext(cp
, &mp_list
,
3246 (*numlist
) * nsegs
, mcflags
);
3249 * The number of elements must be even if we are to use an
3250 * mbuf (instead of a cluster) to store the residual data.
3251 * If we couldn't allocate the requested number of mbufs,
3252 * trim the number down (if it's odd) in order to avoid
3253 * creating a partial segment chain.
3255 if (bufsize
> MHLEN
&& (needed
& 0x1))
3258 while (num
< needed
) {
3261 m
= (struct mbuf
*)mp_list
;
3262 mp_list
= mp_list
->obj_next
;
3265 MBUF_INIT(m
, 1, MT_DATA
);
3267 if (mac_init_mbuf(m
, wait
) != 0) {
3271 #endif /* MAC_NET */
3273 if (bufsize
> MHLEN
) {
3274 /* A second mbuf for this segment chain */
3275 m
->m_next
= (struct mbuf
*)mp_list
;
3276 mp_list
= mp_list
->obj_next
;
3277 ASSERT(m
->m_next
!= NULL
);
3279 MBUF_INIT(m
->m_next
, 0, MT_DATA
);
3285 ASSERT(num
!= *numlist
|| mp_list
== NULL
);
3288 mtype_stat_add(MT_DATA
, num
);
3289 mtype_stat_sub(MT_FREE
, num
);
3293 /* We've got them all; return to caller */
3294 if (num
== *numlist
)
3301 * Complex cases where elements are made up of one or more composite
3302 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3303 * be illustrated as follows:
3305 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3307 * Every composite mbuf + cluster element comes from the intermediate
3308 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3309 * the last composite element will come from the MC_MBUF_CL cache,
3310 * unless the residual data is larger than 2KB where we use the
3311 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3312 * data is defined as extra data beyond the first element that cannot
3313 * fit into the previous element, i.e. there is no residual data if
3314 * the chain only has 1 segment.
3316 r_bufsize
= bufsize
;
3317 resid
= packetlen
> bufsize
? packetlen
% bufsize
: 0;
3319 /* There is residual data; figure out the cluster size */
3320 if (wantsize
== 0 && packetlen
> MINCLSIZE
) {
3322 * Caller didn't request that all of the segments
3323 * in the chain use the same cluster size; use the
3324 * smaller of the cluster sizes.
3326 if (njcl
> 0 && resid
> m_maxsize(MC_BIGCL
))
3327 r_bufsize
= m_maxsize(MC_16KCL
);
3328 else if (resid
> m_maxsize(MC_CL
))
3329 r_bufsize
= m_maxsize(MC_BIGCL
);
3331 r_bufsize
= m_maxsize(MC_CL
);
3333 /* Use the same cluster size as the other segments */
3341 * Attempt to allocate composite mbuf + cluster elements for
3342 * the residual data in each chain; record the number of such
3343 * elements that can be allocated so that we know how many
3344 * segment chains we can afford to create.
3346 if (r_bufsize
<= m_maxsize(MC_CL
))
3347 rcp
= m_cache(MC_MBUF_CL
);
3348 else if (r_bufsize
<= m_maxsize(MC_BIGCL
))
3349 rcp
= m_cache(MC_MBUF_BIGCL
);
3351 rcp
= m_cache(MC_MBUF_16KCL
);
3352 needed
= mcache_alloc_ext(rcp
, &rmp_list
, *numlist
, mcflags
);
3357 /* This is temporarily reduced for calculation */
3363 * Attempt to allocate the rest of the composite mbuf + cluster
3364 * elements for the number of segment chains that we need.
3366 if (bufsize
<= m_maxsize(MC_CL
))
3367 cp
= m_cache(MC_MBUF_CL
);
3368 else if (bufsize
<= m_maxsize(MC_BIGCL
))
3369 cp
= m_cache(MC_MBUF_BIGCL
);
3371 cp
= m_cache(MC_MBUF_16KCL
);
3372 needed
= mcache_alloc_ext(cp
, &mp_list
, needed
* nsegs
, mcflags
);
3374 /* Round it down to avoid creating a partial segment chain */
3375 needed
= (needed
/ nsegs
) * nsegs
;
3381 * We're about to construct the chain(s); take into account
3382 * the number of segments we have created above to hold the
3383 * residual data for each chain, as well as restore the
3384 * original count of segments per chain.
3387 needed
+= needed
/ nsegs
;
3394 struct ext_ref
*rfa
;
3399 if (nsegs
== 1 || (num
% nsegs
) != 0 || resid
== 0) {
3400 m
= (struct mbuf
*)mp_list
;
3401 mp_list
= mp_list
->obj_next
;
3403 m
= (struct mbuf
*)rmp_list
;
3404 rmp_list
= rmp_list
->obj_next
;
3407 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3408 VERIFY(m
->m_ext
.ext_free
== NULL
||
3409 m
->m_ext
.ext_free
== m_bigfree
||
3410 m
->m_ext
.ext_free
== m_16kfree
);
3412 cl
= m
->m_ext
.ext_buf
;
3415 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3416 VERIFY(MBUF_IS_COMPOSITE(m
));
3418 flag
= MEXT_FLAGS(m
);
3420 pkthdr
= (nsegs
== 1 || (num
% nsegs
) == 1);
3423 MBUF_INIT(m
, pkthdr
, MT_DATA
);
3424 if (m
->m_ext
.ext_free
== m_16kfree
) {
3425 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3426 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3427 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3429 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3432 if (pkthdr
&& mac_init_mbuf(m
, wait
) != 0) {
3437 #endif /* MAC_NET */
3440 if ((num
% nsegs
) == 0)
3441 np
= &first
->m_nextpkt
;
3450 mtype_stat_add(MT_DATA
, num
);
3451 mtype_stat_sub(MT_FREE
, num
);
3456 /* We've got them all; return to caller */
3457 if (num
== *numlist
) {
3458 ASSERT(mp_list
== NULL
&& rmp_list
== NULL
);
3463 /* Free up what's left of the above */
3464 if (mp_list
!= NULL
)
3465 mcache_free_ext(cp
, mp_list
);
3466 if (rmp_list
!= NULL
)
3467 mcache_free_ext(rcp
, rmp_list
);
3468 if (wantall
&& top
!= NULL
) {
/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to
 * allocate packets on the receive ring.
 */
3480 __private_extern__
struct mbuf
*
3481 m_getpacket_how(int wait
)
3483 unsigned int num_needed
= 1;
3485 return (m_getpackets_internal(&num_needed
, 1, wait
, 1,
/*
 * Best effort to get an mbuf cluster + pkthdr.  Used by drivers to
 * allocate packets on the receive ring.
 */
3496 unsigned int num_needed
= 1;
3498 return (m_getpackets_internal(&num_needed
, 1, M_WAIT
, 1,
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if this can't be met, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
 * are chained on the m_nextpkt field.  Any packets requested beyond this are
 * chained onto the last packet header's m_next field.
 */
3510 m_getpackets(int num_needed
, int num_with_pkthdrs
, int how
)
3512 unsigned int n
= num_needed
;
3514 return (m_getpackets_internal(&n
, num_with_pkthdrs
, how
, 0,
3519 * Return a list of mbuf hdrs set up as packet hdrs chained together
3520 * on the m_nextpkt field
3523 m_getpackethdrs(int num_needed
, int how
)
3526 struct mbuf
**np
, *top
;
3531 while (num_needed
--) {
3532 m
= _M_RETRYHDR(how
, MT_DATA
);
/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * of packets freed.  Used by the drivers.
 */
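/*
 * Illustrative sketch only, not part of the original sources; it assumes
 * the integer packet-count return described above.  The example_ name is
 * hypothetical and the block is excluded from the build.
 */
#if 0	/* example only; not compiled */
static void
example_free_rx_packets(struct mbuf *pkt_list)
{
	int cnt;

	/* Frees every packet on the m_nextpkt list, chains and clusters */
	cnt = m_freem_list(pkt_list);
	/* cnt is the number of packets that were freed */
	(void) cnt;
}
#endif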
3548 m_freem_list(struct mbuf
*m
)
3550 struct mbuf
*nextpkt
;
3551 mcache_obj_t
*mp_list
= NULL
;
3552 mcache_obj_t
*mcl_list
= NULL
;
3553 mcache_obj_t
*mbc_list
= NULL
;
3554 mcache_obj_t
*m16k_list
= NULL
;
3555 mcache_obj_t
*m_mcl_list
= NULL
;
3556 mcache_obj_t
*m_mbc_list
= NULL
;
3557 mcache_obj_t
*m_m16k_list
= NULL
;
3558 mcache_obj_t
*ref_list
= NULL
;
3560 int mt_free
= 0, mt_data
= 0, mt_header
= 0, mt_soname
= 0, mt_tag
= 0;
3565 nextpkt
= m
->m_nextpkt
;
3566 m
->m_nextpkt
= NULL
;
3569 struct mbuf
*next
= m
->m_next
;
3570 mcache_obj_t
*o
, *rfa
;
3571 u_int32_t refcnt
, flags
;
3573 if (m
->m_type
== MT_FREE
)
3574 panic("m_free: freeing an already freed mbuf");
3576 if (m
->m_type
!= MT_FREE
)
3579 if (m
->m_flags
& M_PKTHDR
) {
3580 m_tag_delete_chain(m
, NULL
);
3583 if (!(m
->m_flags
& M_EXT
))
3586 o
= (mcache_obj_t
*)m
->m_ext
.ext_buf
;
3587 refcnt
= m_decref(m
);
3588 flags
= MEXT_FLAGS(m
);
3589 if (refcnt
== 0 && flags
== 0) {
3590 if (m
->m_ext
.ext_free
== NULL
) {
3591 o
->obj_next
= mcl_list
;
3593 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3594 o
->obj_next
= mbc_list
;
3596 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
3597 o
->obj_next
= m16k_list
;
3600 (*(m
->m_ext
.ext_free
))((caddr_t
)o
,
3604 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
3605 rfa
->obj_next
= ref_list
;
3608 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
3609 VERIFY(m
->m_type
!= MT_FREE
);
3611 * Amortize the costs of atomic operations
3612 * by doing them at the end, if possible.
3614 if (m
->m_type
== MT_DATA
)
3616 else if (m
->m_type
== MT_HEADER
)
3618 else if (m
->m_type
== MT_SONAME
)
3620 else if (m
->m_type
== MT_TAG
)
3623 mtype_stat_dec(m
->m_type
);
3625 m
->m_type
= MT_FREE
;
3628 m
->m_next
= m
->m_nextpkt
= NULL
;
3630 /* "Free" into the intermediate cache */
3631 o
= (mcache_obj_t
*)m
;
3632 if (m
->m_ext
.ext_free
== NULL
) {
3633 o
->obj_next
= m_mcl_list
;
3635 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3636 o
->obj_next
= m_mbc_list
;
3639 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3640 o
->obj_next
= m_m16k_list
;
3648 * Amortize the costs of atomic operations
3649 * by doing them at the end, if possible.
3651 if (m
->m_type
== MT_DATA
)
3653 else if (m
->m_type
== MT_HEADER
)
3655 else if (m
->m_type
== MT_SONAME
)
3657 else if (m
->m_type
== MT_TAG
)
3659 else if (m
->m_type
!= MT_FREE
)
3660 mtype_stat_dec(m
->m_type
);
3662 m
->m_type
= MT_FREE
;
3663 m
->m_flags
= m
->m_len
= 0;
3664 m
->m_next
= m
->m_nextpkt
= NULL
;
3666 ((mcache_obj_t
*)m
)->obj_next
= mp_list
;
3667 mp_list
= (mcache_obj_t
*)m
;
3676 mtype_stat_add(MT_FREE
, mt_free
);
3678 mtype_stat_sub(MT_DATA
, mt_data
);
3680 mtype_stat_sub(MT_HEADER
, mt_header
);
3682 mtype_stat_sub(MT_SONAME
, mt_soname
);
3684 mtype_stat_sub(MT_TAG
, mt_tag
);
3686 if (mp_list
!= NULL
)
3687 mcache_free_ext(m_cache(MC_MBUF
), mp_list
);
3688 if (mcl_list
!= NULL
)
3689 mcache_free_ext(m_cache(MC_CL
), mcl_list
);
3690 if (mbc_list
!= NULL
)
3691 mcache_free_ext(m_cache(MC_BIGCL
), mbc_list
);
3692 if (m16k_list
!= NULL
)
3693 mcache_free_ext(m_cache(MC_16KCL
), m16k_list
);
3694 if (m_mcl_list
!= NULL
)
3695 mcache_free_ext(m_cache(MC_MBUF_CL
), m_mcl_list
);
3696 if (m_mbc_list
!= NULL
)
3697 mcache_free_ext(m_cache(MC_MBUF_BIGCL
), m_mbc_list
);
3698 if (m_m16k_list
!= NULL
)
3699 mcache_free_ext(m_cache(MC_MBUF_16KCL
), m_m16k_list
);
3700 if (ref_list
!= NULL
)
3701 mcache_free_ext(ref_cache
, ref_list
);
3707 m_freem(struct mbuf
*m
)
/*
 * Mbuffer utility routines.
 */

/*
 * Compute the amount of space available before the current start
 * of data in an mbuf.
 */
int
m_leadingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_data - m->m_ext.ext_buf);
	}
	if (m->m_flags & M_PKTHDR)
		return (m->m_data - m->m_pktdat);
	return (m->m_data - m->m_dat);
}

/*
 * Compute the amount of space available after the end of data in an mbuf.
 */
int
m_trailingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_ext.ext_buf + m->m_ext.ext_size -
		    (m->m_data + m->m_len));
	}
	return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
}
/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
 * copy junk along.  Does not adjust packet header length.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	_MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR) {
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
	mn->m_next = m;
	m = mn;
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}

/*
 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
 * chain, copy junk along, and adjust length.
 */
struct mbuf *
m_prepend_2(struct mbuf *m, int len, int how)
{
	if (M_LEADINGSPACE(m) >= len) {
		m->m_data -= len;
		m->m_len += len;
	} else {
		m = m_prepend(m, len, how);
	}
	if ((m) && (m->m_flags & M_PKTHDR))
		m->m_pkthdr.len += len;
	return (m);
}
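/*
 * Illustrative sketch only (not from the original sources): reserving
 * room for a link-level or protocol header in front of existing data.
 * example_prepend_header and hdrlen are hypothetical; not built.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_prepend_header(struct mbuf *m, int hdrlen)
{
	/*
	 * Make room for hdrlen bytes in front of the current data;
	 * m_pkthdr.len is adjusted for us.  NULL is returned on failure
	 * (the chain has already been freed in that case).
	 */
	m = m_prepend_2(m, hdrlen, M_DONTWAIT);
	if (m == NULL)
		return (NULL);
	/* caller now fills the newly exposed bytes with the header */
	return (m);
}
#endif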
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
 */
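/*
 * Illustrative sketch only, not part of the original sources: taking a
 * reference-shared copy of a whole chain.  The example_ name is
 * hypothetical and the block is excluded from the build.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_copy_whole_packet(struct mbuf *m)
{
	/*
	 * Copy the entire chain starting at offset 0.  The copy shares
	 * any external clusters by reference rather than duplicating
	 * their contents.
	 */
	return (m_copym(m, 0, M_COPYALL, M_DONTWAIT));
}
#endif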
3801 m_copym(struct mbuf
*m
, int off0
, int len
, int wait
)
3803 struct mbuf
*n
, *mhdr
= NULL
, **np
;
3808 if (off
< 0 || len
< 0)
3809 panic("m_copym: invalid offset %d or len %d", off
, len
);
3811 if (off
== 0 && (m
->m_flags
& M_PKTHDR
)) {
3816 while (off
>= m
->m_len
) {
3817 if (m
->m_next
== NULL
)
3818 panic("m_copym: invalid mbuf chain");
3827 if (len
!= M_COPYALL
)
3828 panic("m_copym: len != M_COPYALL");
3832 n
= _M_RETRY(wait
, m
->m_type
);
3839 M_COPY_PKTHDR(n
, mhdr
);
3840 if (len
== M_COPYALL
)
3841 n
->m_pkthdr
.len
-= off0
;
3843 n
->m_pkthdr
.len
= len
;
3846 if (len
== M_COPYALL
) {
3847 if (MIN(len
, (m
->m_len
- off
)) == len
) {
3848 printf("m->m_len %ld - off %d = %ld, %ld\n",
3849 m
->m_len
, off
, m
->m_len
- off
,
3850 MIN(len
, (m
->m_len
- off
)));
3853 n
->m_len
= MIN(len
, (m
->m_len
- off
));
3854 if (n
->m_len
== M_COPYALL
) {
3855 printf("n->m_len == M_COPYALL, fixing\n");
3858 if (m
->m_flags
& M_EXT
) {
3859 n
->m_ext
= m
->m_ext
;
3861 n
->m_data
= m
->m_data
+ off
;
3862 n
->m_flags
|= M_EXT
;
3864 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
3865 (unsigned)n
->m_len
);
3867 if (len
!= M_COPYALL
)
/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine.  Also, the last mbuf and offset accessed are passed
 * out and can be passed back in to avoid having to rescan the entire mbuf
 * list (normally hung off of the socket).
 */
3892 m_copym_with_hdrs(struct mbuf
*m
, int off0
, int len0
, int wait
,
3893 struct mbuf
**m_last
, int *m_off
)
3895 struct mbuf
*n
, **np
= NULL
;
3896 int off
= off0
, len
= len0
;
3897 struct mbuf
*top
= NULL
;
3898 int mcflags
= MSLEEPF(wait
);
3901 mcache_obj_t
*list
= NULL
;
3904 if (off
== 0 && (m
->m_flags
& M_PKTHDR
))
3907 if (*m_last
!= NULL
) {
3911 while (off
>= m
->m_len
) {
3921 len
-= MIN(len
, (n
->m_len
- ((needed
== 1) ? off
: 0)));
3928 * If the caller doesn't want to be put to sleep, mark it with
3929 * MCR_TRYHARD so that we may reclaim buffers from other places
3932 if (mcflags
& MCR_NOSLEEP
)
3933 mcflags
|= MCR_TRYHARD
;
3935 if (mcache_alloc_ext(m_cache(MC_MBUF
), &list
, needed
,
3941 n
= (struct mbuf
*)list
;
3942 list
= list
->obj_next
;
3943 ASSERT(n
!= NULL
&& m
!= NULL
);
3945 type
= (top
== NULL
) ? MT_HEADER
: m
->m_type
;
3946 MBUF_INIT(n
, (top
== NULL
), type
);
3948 if (top
== NULL
&& mac_mbuf_label_init(n
, wait
) != 0) {
3949 mtype_stat_inc(MT_HEADER
);
3950 mtype_stat_dec(MT_FREE
);
3954 #endif /* MAC_NET */
3966 M_COPY_PKTHDR(n
, m
);
3967 n
->m_pkthdr
.len
= len
;
3970 n
->m_len
= MIN(len
, (m
->m_len
- off
));
3972 if (m
->m_flags
& M_EXT
) {
3973 n
->m_ext
= m
->m_ext
;
3975 n
->m_data
= m
->m_data
+ off
;
3976 n
->m_flags
|= M_EXT
;
3978 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
3979 (unsigned)n
->m_len
);
3984 if ((off
+ n
->m_len
) == m
->m_len
) {
3985 *m_last
= m
->m_next
;
3989 *m_off
= off
+ n
->m_len
;
3998 mtype_stat_inc(MT_HEADER
);
3999 mtype_stat_add(type
, needed
);
4000 mtype_stat_sub(MT_FREE
, needed
+ 1);
4002 ASSERT(list
== NULL
);
4007 mcache_free_ext(m_cache(MC_MBUF
), list
);
/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
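/*
 * Illustrative sketch only (not from the original sources): copying a
 * region of a chain into a flat caller-supplied buffer.  The example_
 * name is hypothetical and the block is excluded from the build.
 */
#if 0	/* example only; not compiled */
static void
example_peek_bytes(struct mbuf *m, void *dst, int off, int len)
{
	/*
	 * Linearize len bytes starting at off into the caller's buffer;
	 * the chain must contain at least off + len bytes or the routine
	 * panics.
	 */
	m_copydata(m, off, len, (caddr_t)dst);
}
#endif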
4019 m_copydata(struct mbuf
*m
, int off
, int len
, caddr_t cp
)
4023 if (off
< 0 || len
< 0)
4024 panic("m_copydata: invalid offset %d or len %d", off
, len
);
4028 panic("m_copydata: invalid mbuf chain");
4036 panic("m_copydata: invalid mbuf chain");
4037 count
= MIN(m
->m_len
- off
, len
);
4038 bcopy(MTOD(m
, caddr_t
) + off
, cp
, count
);
/*
 * Concatenate mbuf chain n to m.  Both chains must be of the same type
 * (e.g. MT_DATA).  Any m_pkthdr is not updated.
 */
4051 m_cat(struct mbuf
*m
, struct mbuf
*n
)
4056 if ((m
->m_flags
& M_EXT
) ||
4057 m
->m_data
+ m
->m_len
+ n
->m_len
>= &m
->m_dat
[MLEN
]) {
4058 /* just join the two chains */
4062 /* splat the data from one into the other */
4063 bcopy(MTOD(n
, caddr_t
), MTOD(m
, caddr_t
) + m
->m_len
,
4065 m
->m_len
+= n
->m_len
;
4071 m_adj(struct mbuf
*mp
, int req_len
)
4077 if ((m
= mp
) == NULL
)
4083 while (m
!= NULL
&& len
> 0) {
4084 if (m
->m_len
<= len
) {
4095 if (m
->m_flags
& M_PKTHDR
)
4096 m
->m_pkthdr
.len
-= (req_len
- len
);
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
4109 if (m
->m_next
== (struct mbuf
*)0)
4113 if (m
->m_len
>= len
) {
4116 if (m
->m_flags
& M_PKTHDR
)
4117 m
->m_pkthdr
.len
-= len
;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
4129 if (m
->m_flags
& M_PKTHDR
)
4130 m
->m_pkthdr
.len
= count
;
4131 for (; m
; m
= m
->m_next
) {
4132 if (m
->m_len
>= count
) {
4138 while ((m
= m
->m_next
))
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns NULL on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
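/*
 * Illustrative sketch only, not part of the original sources: the classic
 * protocol-input idiom built on m_pullup().  The example_ name and hdrlen
 * parameter are hypothetical; the block is excluded from the build.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_pullup_fixed_header(struct mbuf *m, int hdrlen)
{
	/*
	 * Make sure the first hdrlen bytes are contiguous before casting
	 * m_data to a header structure.  On failure the chain has
	 * already been freed, so just report NULL to the caller.
	 */
	if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL)
		return (NULL);
	return (m);
}
#endif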
4154 m_pullup(struct mbuf
*n
, int len
)
	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
4165 if ((n
->m_flags
& M_EXT
) == 0 &&
4166 n
->m_data
+ len
< &n
->m_dat
[MLEN
] && n
->m_next
) {
4167 if (n
->m_len
>= len
)
4175 _MGET(m
, M_DONTWAIT
, n
->m_type
);
4179 if (n
->m_flags
& M_PKTHDR
) {
4180 M_COPY_PKTHDR(m
, n
);
4181 n
->m_flags
&= ~M_PKTHDR
;
4184 space
= &m
->m_dat
[MLEN
] - (m
->m_data
+ m
->m_len
);
4186 count
= MIN(MIN(MAX(len
, max_protohdr
), space
), n
->m_len
);
4187 bcopy(MTOD(n
, caddr_t
), MTOD(m
, caddr_t
) + m
->m_len
,
4197 } while (len
> 0 && n
);
/*
 * Partition an mbuf chain into two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
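/*
 * Illustrative sketch only (not from the original sources): splitting a
 * chain after a fixed-size leading region.  The example_ name is
 * hypothetical and the block is excluded from the build.
 */
#if 0	/* example only; not compiled */
static struct mbuf *
example_split_at(struct mbuf *m, int firstlen)
{
	struct mbuf *tail;

	/*
	 * After a successful call, "m" holds the first firstlen bytes
	 * and "tail" holds the remainder; on failure the original chain
	 * is left intact and NULL is returned.
	 */
	tail = m_split(m, firstlen, M_DONTWAIT);
	return (tail);
}
#endif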
4216 m_split(struct mbuf
*m0
, int len0
, int wait
)
4219 unsigned len
= len0
, remain
;
4221 for (m
= m0
; m
&& len
> m
->m_len
; m
= m
->m_next
)
4225 remain
= m
->m_len
- len
;
4226 if (m0
->m_flags
& M_PKTHDR
) {
4227 _MGETHDR(n
, wait
, m0
->m_type
);
4230 n
->m_pkthdr
.rcvif
= m0
->m_pkthdr
.rcvif
;
4231 n
->m_pkthdr
.len
= m0
->m_pkthdr
.len
- len0
;
4232 m0
->m_pkthdr
.len
= len0
;
4233 if (m
->m_flags
& M_EXT
)
4235 if (remain
> MHLEN
) {
4236 /* m can't be the lead packet */
4238 n
->m_next
= m_split(m
, len
, wait
);
4239 if (n
->m_next
== NULL
) {
4245 MH_ALIGN(n
, remain
);
4246 } else if (remain
== 0) {
4251 _MGET(n
, wait
, m
->m_type
);
4257 if (m
->m_flags
& M_EXT
) {
4258 n
->m_flags
|= M_EXT
;
4259 n
->m_ext
= m
->m_ext
;
4261 n
->m_data
= m
->m_data
+ len
;
4263 bcopy(MTOD(m
, caddr_t
) + len
, MTOD(n
, caddr_t
), remain
);
4267 n
->m_next
= m
->m_next
;
/*
 * Routine to copy from device local memory into mbufs.
 */
4276 m_devget(char *buf
, int totlen
, int off0
, struct ifnet
*ifp
,
4277 void (*copy
)(const void *, void *, size_t))
4280 struct mbuf
*top
= NULL
, **mp
= &top
;
4281 int off
= off0
, len
;
4289 * If 'off' is non-zero, packet is trailer-encapsulated,
4290 * so we have to skip the type and length fields.
4292 cp
+= off
+ 2 * sizeof (u_int16_t
);
4293 totlen
-= 2 * sizeof (u_int16_t
);
4295 _MGETHDR(m
, M_DONTWAIT
, MT_DATA
);
4298 m
->m_pkthdr
.rcvif
= ifp
;
4299 m
->m_pkthdr
.len
= totlen
;
4302 while (totlen
> 0) {
4304 _MGET(m
, M_DONTWAIT
, MT_DATA
);
4311 len
= MIN(totlen
, epkt
- cp
);
4312 if (len
>= MINCLSIZE
) {
4313 MCLGET(m
, M_DONTWAIT
);
4314 if (m
->m_flags
& M_EXT
) {
4315 m
->m_len
= len
= MIN(len
, m_maxsize(MC_CL
));
4317 /* give up when it's out of cluster mbufs */
4325 * Place initial small packet/header at end of mbuf.
4327 if (len
< m
->m_len
) {
4329 len
+ max_linkhdr
<= m
->m_len
)
4330 m
->m_data
+= max_linkhdr
;
4337 copy(cp
, MTOD(m
, caddr_t
), (unsigned)len
);
4339 bcopy(cp
, MTOD(m
, caddr_t
), (unsigned)len
);
/*
 * Cluster freelist allocation check.
 */
4354 m_howmany(int num
, size_t bufsize
)
4357 u_int32_t m_clusters
, m_bigclusters
, m_16kclusters
;
4358 u_int32_t m_clfree
, m_bigclfree
, m_16kclfree
;
4360 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
4362 m_clusters
= m_total(MC_CL
);
4363 m_bigclusters
= m_total(MC_BIGCL
);
4364 m_16kclusters
= m_total(MC_16KCL
);
4365 m_clfree
= m_infree(MC_CL
);
4366 m_bigclfree
= m_infree(MC_BIGCL
);
4367 m_16kclfree
= m_infree(MC_16KCL
);
4369 /* Bail if we've maxed out the mbuf memory map */
4370 if ((bufsize
!= m_maxsize(MC_16KCL
) &&
4371 (m_clusters
+ (m_bigclusters
<< 1) >= nclusters
)) ||
4372 (njcl
> 0 && bufsize
== m_maxsize(MC_16KCL
) &&
4373 (m_16kclusters
<< 3) >= njcl
)) {
4375 if (bufsize
== MCLBYTES
&& num
> m_clfree
) {
4376 printf("m_howmany - out of small clusters, "
4377 "%d short\n", num
- mbstat
.m_clfree
);
4383 if (bufsize
== m_maxsize(MC_CL
)) {
4385 if (m_clusters
< MINCL
)
4386 return (MINCL
- m_clusters
);
4387 /* Too few (free < 1/16 total) and not over maximum */
4388 if (m_clusters
< m_maxlimit(MC_CL
)) {
4389 if (m_clfree
>= MCL_LOWAT
)
4391 if (num
>= m_clfree
)
4393 if (((m_clusters
+ num
) >> 4) > m_clfree
)
4394 j
= ((m_clusters
+ num
) >> 4) - m_clfree
;
4396 if (i
+ m_clusters
>= m_maxlimit(MC_CL
))
4397 i
= m_maxlimit(MC_CL
) - m_clusters
;
4399 VERIFY((m_total(MC_CL
) + i
) <= m_maxlimit(MC_CL
));
4400 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
4402 if (m_bigclusters
< MINBIGCL
)
4403 return (MINBIGCL
- m_bigclusters
);
4404 /* Too few (free < 1/16 total) and not over maximum */
4405 if (m_bigclusters
< m_maxlimit(MC_BIGCL
)) {
4406 if (m_bigclfree
>= MBIGCL_LOWAT
)
4408 if (num
>= m_bigclfree
)
4409 i
= num
- m_bigclfree
;
4410 if (((m_bigclusters
+ num
) >> 4) > m_bigclfree
)
4411 j
= ((m_bigclusters
+ num
) >> 4) - m_bigclfree
;
4413 if (i
+ m_bigclusters
>= m_maxlimit(MC_BIGCL
))
4414 i
= m_maxlimit(MC_BIGCL
) - m_bigclusters
;
4416 VERIFY((m_total(MC_BIGCL
) + i
) <= m_maxlimit(MC_BIGCL
));
4420 if (m_16kclusters
< MIN16KCL
)
4421 return (MIN16KCL
- m_16kclusters
);
4422 /* Too few (free < 1/16 total) and not over maximum */
4423 if (m_16kclusters
< m_maxlimit(MC_16KCL
)) {
4424 if (m_16kclfree
>= M16KCL_LOWAT
)
4426 if (num
>= m_16kclfree
)
4427 i
= num
- m_16kclfree
;
4428 if (((m_16kclusters
+ num
) >> 4) > m_16kclfree
)
4429 j
= ((m_16kclusters
+ num
) >> 4) - m_16kclfree
;
4431 if (i
+ m_16kclusters
>= m_maxlimit(MC_16KCL
))
4432 i
= m_maxlimit(MC_16KCL
) - m_16kclusters
;
4434 VERIFY((m_total(MC_16KCL
) + i
) <= m_maxlimit(MC_16KCL
));
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_getclr(M_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = MIN(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = MIN(m->m_len - off, len);
		bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = _M_GET(M_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = MIN(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:
	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}
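/*
 * Illustrative sketch only, not part of the original file: overwrite four
 * bytes at offset 16 of a chain with m_copyback(); if the chain is shorter
 * than 20 bytes it is extended with freshly allocated mbufs as above, and
 * the packet header length is updated to cover the new tail.
 */
static void
example_patch_chain(struct mbuf *m)
{
	u_int32_t tag = 0xdeadbeef;	/* hypothetical payload */

	m_copyback(m, 16, sizeof (tag), (caddr_t)&tag);
}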
char *
mcl_to_paddr(char *addr)
{
	int base_phys;

	if (!MBUF_IN_MAP(addr))
		return (NULL);
	base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];

	if (base_phys == 0)
		return (NULL);
	return ((char *)((int)base_phys | ((int)addr & PGOFSET)));
}
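/*
 * Illustrative sketch only, not part of the original file: a DMA setup
 * path could translate a cluster-backed data pointer into a physical
 * address with mcl_to_paddr().  The pointer must lie inside the mbuf
 * cluster map (MBUF_IN_MAP), otherwise NULL is returned above.
 */
static char *
example_cluster_paddr(struct mbuf *m)
{
	if (!(m->m_flags & M_EXT))
		return (NULL);	/* data is not backed by a cluster */
	return (mcl_to_paddr(MTOD(m, char *)));
}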
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf *n, **np;
	struct mbuf *top = NULL;
	int copyhdr = 0;

	np = &top;
	if (m->m_flags & M_PKTHDR)
		copyhdr = 1;

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 * mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL)
					return (NULL);
				n->m_len = m->m_len;
				m_dup_pkthdr(n, m, how);
				bcopy(m->m_data, n->m_data, m->m_len);
				return (n);
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL)
				return (NULL);
			bcopy(m->m_data, n->m_data, m->m_len);
			n->m_len = m->m_len;
			return (n);
		}
	}
	while (m != NULL) {
#if BLUE_DEBUG
		kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr)
			n = _M_GETHDR(how, m->m_type);
		else
			n = _M_GET(how, m->m_type);
		if (n == NULL)
			goto nospace;
		if (m->m_flags & M_EXT) {
			if (m->m_len <= m_maxsize(MC_CL))
				n = m_mclget(n, how);
			else if (m->m_len <= m_maxsize(MC_BIGCL))
				n = m_mbigget(n, how);
			else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
				n = m_m16kget(n, how);
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			copyhdr = 0;
			if (!(n->m_flags & M_EXT))
				n->m_data = n->m_pktdat;
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	return (top);

nospace:
	m_freem(top);
	return (NULL);
}
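/*
 * Illustrative sketch only, not part of the original file: take a deep,
 * independent copy of a packet with m_dup() so the original chain can be
 * consumed or freed without affecting the copy.
 */
static struct mbuf *
example_deep_copy(struct mbuf *m)
{
	struct mbuf *copy;

	copy = m_dup(m, M_DONTWAIT);	/* NULL if allocation failed */
	return (copy);
}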
#define	MBUF_MULTIPAGES(m)						\
	(((m)->m_flags & M_EXT) &&					\
	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))

static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
			len = NBPG;
		else if (!IS_P2ALIGNED(data, NBPG) &&
		    P2ROUNDUP(data, NBPG) < (data + len0))
			len = P2ROUNDUP(data, NBPG) - data;
		else
			len = len0;

		VERIFY(len > 0);
		VERIFY(m->m_flags & M_EXT);
		m->m_data = (void *)data;
		m->m_len = len;

		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0)
			break;

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return (top);
}
struct mbuf *
m_normalize(struct mbuf *m)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		struct mbuf *n;

		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			struct mbuf *last;
			if ((m = m_expand(m, &last)) == NULL) {
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	if (expanded)
		atomic_add_32(&mb_normalized, 1);
	return (top);
}
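/*
 * Illustrative sketch only, not part of the original file: a caller whose
 * hardware cannot handle a single buffer spanning a page boundary could
 * run the chain through m_normalize() first, so every mbuf's data stays
 * within one page.
 */
static struct mbuf *
example_prepare_for_dma(struct mbuf *m)
{
	return (m_normalize(m));	/* NULL if expansion ran out of mbufs */
}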
void
m_mchtype(struct mbuf *m, int t)
{
	mtype_stat_inc(t);
	mtype_stat_dec(m->m_type);
	(m)->m_type = t;
}

void *
m_mtod(struct mbuf *m)
{
	return (MTOD(m, void *));
}

struct mbuf *
m_dtom(void *x)
{
	return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1)));
}

void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}
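/*
 * Illustrative sketch only, not part of the original file: m_dtom() simply
 * masks a pointer down to an MSIZE boundary, so the round trip below only
 * holds when the data lives inside the mbuf itself (no M_EXT cluster
 * attached).
 */
static void
example_mtod_dtom(struct mbuf *m)
{
	void *p = m_mtod(m);

	if (!(m->m_flags & M_EXT))
		VERIFY(m_dtom(p) == m);
}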
/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
static void
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_inc(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_inc(m_cache(MC_MBUF_CL));
			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
		}
	}
}

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
static void
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
{
	mcache_waiter_dec(m_cache(class));
	if (comp) {
		if (class == MC_CL) {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
		} else if (class == MC_BIGCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		} else if (class == MC_16KCL) {
			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
		} else {
			mcache_waiter_dec(m_cache(MC_MBUF_CL));
			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
		}
	}
}
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing?  Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mcache_retry = TRUE;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	VERIFY(!(wait & MCR_NOSLEEP));
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mcache_retry = TRUE;
	} else if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
	}
done:
	return (mcache_retry);
}
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);

		mbuf_expand = 0;
		if (mbuf_expand_mcl) {
			int n;

			/* Adjust to current number of cluster in use */
			n = mbuf_expand_mcl -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			mbuf_expand_mcl = 0;

			if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_big) {
			int n;

			/* Adjust to current number of 4 KB cluster in use */
			n = mbuf_expand_big -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			mbuf_expand_big = 0;

			if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
				mbuf_expand++;
		}
		if (mbuf_expand_16k) {
			int n;

			/* Adjust to current number of 16 KB cluster in use */
			n = mbuf_expand_16k -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			mbuf_expand_16k = 0;

			if (n > 0)
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		if (mbuf_expand) {
			while (m_total(MC_MBUF) <
			    (m_total(MC_BIGCL) + m_total(MC_CL))) {
				if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
					break;
			}
		}

		lck_mtx_unlock(mbuf_mlock);

		assert_wait(&mbuf_worker_run, THREAD_UNINT);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}

static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the memory
		 * pool (hence the cluster map); if we attempt to reallocate
		 * a cluster group when it's already allocated, panic since
		 * this is a sign of a memory corruption (slabstbl[ix] got
		 * nullified).  This also means that there shouldn't be any
		 * hole in the kernel sub-map for the mbuf pool.
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(slg != NULL);

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++)
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	ix = MTOCL(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return (&slg->slg_slab[ix]);
}
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;
	sp->sl_flags = flags;
	sp->sl_base = base;
	sp->sl_head = head;
	sp->sl_len = len;
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks;
}
static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;
	if (class == MC_BIGCL) {
		sp = sp->sl_next;
		/* Next slab must already be present */
		VERIFY(sp != NULL);
		VERIFY(slab_is_detached(sp));
		sp->sl_flags &= ~SLF_DETACHED;
	} else if (class == MC_16KCL) {
		int k;
		for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}
static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_BIGCL) {
		sp = sp->sl_next;
		/* Next slab must already be present */
		VERIFY(sp != NULL);
		VERIFY(!slab_is_detached(sp));
		slab_detach(sp);
	} else if (class == MC_16KCL) {
		int k;
		for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}
static boolean_t
slab_inrange(mcl_slab_t *sp, void *buf)
{
	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
	    (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
}
static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (buf != (uintptr_t)addr)
			continue;
		if (mclaudit == NULL) {
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}
static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}

static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
	return ((intptr_t)sp->sl_link.tqe_next == -1 &&
	    (intptr_t)sp->sl_link.tqe_prev == -1 &&
	    (sp->sl_flags & SLF_DETACHED));
}
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPCL);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOCL(buf);
	/* Make sure we haven't been here before */
	for (i = 0; i < NMBPCL; i++)
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);

	mca = mca_tail = *mca_list;
	if (save_contents)
		con = *con_list;

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		next = mca->mca_next;
		bzero(mca, sizeof (*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			VERIFY(con != NULL);
			mca->mca_contents_size = con_size;
			mca->mca_contents = con;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents)
		*con_list = con;

	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
/*
 * Given an address of a buffer (mbuf/cluster/big cluster), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOCL(o);

	VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the cluster
		 * used by the mbuf and use that index to locate the
		 * base address of the cluster.  Then find out the
		 * mbuf index relative to the cluster base and use
		 * it to locate the audit structure.
		 */
		VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
		mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
		break;

	case MC_CL:
	case MC_BIGCL:
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return (mca);
}
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	mcl_audit_verify_nextptr(next, mca);

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));

	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = (struct mbuf *)mca->mca_contents;

	if (composite) {
		struct mbuf *next = m->m_next;
		VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, mca->mca_contents_size);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be the type of the original mbuf.
		 */
		m->m_type = ms->m_type;
	}
}

static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	bcopy(m, mca->mca_contents, mca->mca_contents_size);
}
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	mcache_audit_t *mca;

	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
	/* NOTREACHED */
}

static void
mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
{
	if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
	    !MBUF_IN_MAP(next)) {
		panic("mcl_audit: buffer %p modified after free at offset 0: "
		    "%p out of range [%p-%p)\n%s\n",
		    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
		/* NOTREACHED */
	}
}
SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mb_normalized, 0, "");